mirror of https://github.com/coqui-ai/TTS.git
Merge remote-tracking branch 'TTS/dev' into dev
commit 8b4eb256f6
@@ -0,0 +1,18 @@
---
name: 'Contribution Guideline '
about: Refer to Contribution Guideline
title: ''
labels: ''
assignees: ''

---

### Contribution Guideline

Please send your PRs to the `dev` branch if they are not directly related to a specific branch.
Before making a Pull Request, check your changes for basic mistakes and style problems by using a linter.
We have cardboardlinter set up in this repository, so for example, if you've made some changes and would like to run the linter on just the changed code, you can use the following command:

```bash
pip install pylint cardboardlint
cardboardlinter --refspec master
```

@@ -128,3 +128,4 @@ tests/outputs/*
TODO.txt
.vscode/*
data/*
notebooks/data/*

.travis.yml
@@ -6,6 +6,8 @@ git:
before_install:
  - sudo apt-get update
  - sudo apt-get -y install espeak
  - python -m pip install --upgrade pip
  - pip install six==1.12.0

matrix:
  include:
@@ -15,7 +17,15 @@ matrix:
      env: TEST_SUITE="lint"
    - name: "Unit tests"
      python: "3.6"
      install: pip install --quiet -r requirements_tests.txt
      install:
        - python setup.py egg_info
        - pip install -e .
      env: TEST_SUITE="unittest"
    - name: "Unit tests"
      python: "3.6"
      install:
        - python setup.py egg_info
        - pip install -e .
      env: TEST_SUITE="testscripts"

script: ./.travis/script

@@ -10,13 +10,12 @@ if [[ ( "$TRAVIS_PULL_REQUEST" != "false" ) && ( "$TEST_SUITE" == "lint" ) ]]; t
fi

if [[ "$TEST_SUITE" == "unittest" ]]; then
    # Run tests on all pushes
    pushd tts_namespace
    nosetests TTS.speaker_encoder.tests --nocapture
    nosetests TTS.vocoder.tests --nocapture
    nosetests TTS.tests --nocapture
    nosetests TTS.tf.tests --nocapture
    popd
    # Test server package
    nosetests tests --nocapture
    ./tests/test_server_package.sh
fi

if [[ "$TEST_SUITE" == "testscripts" ]]; then
    # test model training scripts
    ./tests/test_tts_train.sh
    ./tests/test_vocoder_train.sh
fi

README.md
@@ -1,12 +1,17 @@
<p align="center"><img src="https://user-images.githubusercontent.com/1402048/52643646-c2102980-2edd-11e9-8c37-b72f3c89a640.png" data-canonical-src="
" width="320" height="95" /></p>

<br/>

<p align='center'>
<img src="https://travis-ci.org/mozilla/TTS.svg?branch=dev"/>
<a href='https://discourse.mozilla.org/c/tts'><img src="https://img.shields.io/badge/discourse-online-green.svg"/></a>
<a href='https://opensource.org/licenses/MPL-2.0'> <img src="https://img.shields.io/badge/License-MPL%202.0-brightgreen.svg"/></a>
</p>

This project is a part of [Mozilla Common Voice](https://voice.mozilla.org/en).

Mozilla TTS aims to provide a deep-learning-based Text2Speech engine, low in cost and high in quality.

@@ -79,32 +84,32 @@ Or you can use ```requirements.txt``` to install the requirements only.

### Directory Structure
```
|- TTS/
|  |- train.py (train your TTS model.)
|  |- distribute.py (train your TTS model using Multiple GPUs)
|  |- config.json (TTS model configuration file)
|  |- tf/ (Tensorflow 2 utilities and model implementations)
|  |- layers/ (model layer definitions)
|  |- models/ (model definitions)
|  |- notebooks/ (Jupyter Notebooks for model evaluation and parameter selection)
|  |- data_analysis/ (TTS Dataset analysis tools and notebooks.)
|  |- utils/ (TTS utilities -io, visualization, data processing etc.-)
|  |- speaker_encoder/ (Speaker Encoder implementation with the same folder structure.)
|  |- vocoder/ (Vocoder implementations with the same folder structure.)
|- notebooks/ (Jupyter Notebooks for model evaluation, parameter selection and data analysis.)
|- utils/ (common utilities.)
|- TTS
    |- bin/ (folder for all the executables.)
      |- train*.py (train your target model.)
      |- distribute.py (train your TTS model using Multiple GPUs.)
      |- compute_statistics.py (compute dataset statistics for normalization.)
      |- convert*.py (convert target torch model to TF.)
    |- tts/ (text to speech models)
        |- layers/ (model layer definitions)
        |- models/ (model definitions)
        |- tf/ (Tensorflow 2 utilities and model implementations)
        |- utils/ (model specific utilities.)
    |- speaker_encoder/ (Speaker Encoder models.)
        |- (same)
    |- vocoder/ (Vocoder models.)
        |- (same)
```

### Docker
A barebones `Dockerfile` exists at the root of the project, which should let you quickly set up the environment. By default, it will start the server and let you query it. Make sure to use `nvidia-docker` to use your GPUs. Make sure you follow the instructions in the [`server README`](server/README.md) before you build your image so that the server can find the model within the image.
A docker image is created by [@synesthesiam](https://github.com/synesthesiam) and shared in a separate [repository](https://github.com/synesthesiam/docker-mozillatts) with the latest LJSpeech models.

```
docker build -t mozilla-tts .
nvidia-docker run -it --rm -p 5002:5002 mozilla-tts
```

## Checkpoints and Audio Samples
## Release Models
Please visit [our wiki.](https://github.com/mozilla/TTS/wiki/Released-Models)

## Example Model Outputs
## Sample Model Output
Below you can see the Tacotron model state after 16K iterations with batch size 32, trained on the LJSpeech dataset.

> "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase the grey matter in the parts of the brain responsible for emotional regulation and learning."

@@ -116,8 +121,8 @@ Audio examples: [soundcloud](https://soundcloud.com/user-565970875/pocket-articl
## [Mozilla TTS Tutorials and Notebooks](https://github.com/mozilla/TTS/wiki/TTS-Notebooks-and-Tutorials)

## Datasets and Data-Loading
TTS provides a generic dataloader that is easy to use for your custom dataset.
You just need to write a simple function to format the dataset. Check ```datasets/preprocess.py``` to see some examples.
After that, you need to set the ```dataset``` fields in ```config.json```.
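
A minimal sketch of such a formatter, modeled on the functions in ```datasets/preprocess.py```; the metadata filename, column layout, and speaker name below are illustrative assumptions, not fixed by the repository:

```python
import os

def my_dataset(root_path, meta_file):
    """Parse a pipe-separated metadata file into [text, wav_path, speaker] items."""
    items = []
    speaker_name = "my_speaker"  # hypothetical single-speaker dataset
    with open(os.path.join(root_path, meta_file), "r", encoding="utf-8") as f:
        for line in f:
            cols = line.strip().split("|")
            wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav")
            text = cols[1]
            items.append([text, wav_file, speaker_name])
    return items
```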

Some of the public datasets where we successfully applied TTS:

@@ -142,15 +147,19 @@ tail -n 1100 metadata_shuf.csv > metadata_val.csv

To train a new model, you need to define your own ```config.json``` file (check the example) and call it with the command below. You also set the model architecture in ```config.json```.

```train.py --config_path config.json```
```python TTS/bin/train.py --config_path TTS/tts/configs/config.json```

To fine-tune a model, use ```--restore_path```.

```train.py --config_path config.json --restore_path /path/to/your/model.pth.tar```
```python TTS/bin/train.py --config_path TTS/tts/configs/config.json --restore_path /path/to/your/model.pth.tar```

To continue an old training run, use ```--continue_path```.

```python TTS/bin/train.py --continue_path /path/to/your/run_folder/```

For multi-GPU training, use ```distribute.py```. It enables process-based multi-GPU training where each process uses a single GPU.

```CUDA_VISIBLE_DEVICES="0,1,4" distribute.py --config_path config.json```
```CUDA_VISIBLE_DEVICES="0,1,4" TTS/bin/distribute.py --config_path TTS/tts/configs/config.json```

Each run creates a new output folder, and ```config.json``` is copied under this folder.

@@ -187,7 +196,7 @@ If you like to use TTS to try a new idea and like to share your experiments with
- [x] Enable process based distributed training. Similar to (https://github.com/fastai/imagenet-fast/).
- [x] Adapting Neural Vocoder. TTS works with WaveRNN and ParallelWaveGAN (https://github.com/erogol/WaveRNN and https://github.com/erogol/ParallelWaveGAN)
- [ ] Multi-speaker embedding.
- [ ] Model optimization (model export, model pruning etc.)
- [x] Model optimization (model export, model pruning etc.)

<!--## References
- [Efficient Neural Audio Synthesis](https://arxiv.org/pdf/1802.08435.pdf)

@@ -203,3 +212,4 @@ If you like to use TTS to try a new idea and like to share your experiments with
### References
- https://github.com/keithito/tacotron (Dataset pre-processing)
- https://github.com/r9y9/tacotron_pytorch (Initial Tacotron architecture)
- https://github.com/kan-bayashi/ParallelWaveGAN (vocoder library)

@@ -7,16 +7,16 @@ import argparse
import numpy as np
from tqdm import tqdm

from TTS.datasets.preprocess import load_meta_data
from TTS.utils.io import load_config
from TTS.utils.audio import AudioProcessor
from mozilla_voice_tts.tts.datasets.preprocess import load_meta_data
from mozilla_voice_tts.utils.io import load_config
from mozilla_voice_tts.utils.audio import AudioProcessor

def main():
    """Run preprocessing process."""
    parser = argparse.ArgumentParser(
        description="Compute mean and variance of spectrogram features.")
    parser.add_argument("--config_path", type=str, required=True,
                        help="TTS config file path.")
                        help="TTS config file path to define audio processing parameters.")
    parser.add_argument("--out_path", default=None, type=str,
                        help="directory to save the output file.")
    args = parser.parse_args()

@@ -2,10 +2,10 @@

import argparse

from TTS.utils.io import load_config
from TTS.vocoder.tf.utils.generic_utils import setup_generator
from TTS.vocoder.tf.utils.io import load_checkpoint
from TTS.vocoder.tf.utils.tflite import convert_melgan_to_tflite
from mozilla_voice_tts.utils.io import load_config
from mozilla_voice_tts.vocoder.tf.utils.generic_utils import setup_generator
from mozilla_voice_tts.vocoder.tf.utils.io import load_checkpoint
from mozilla_voice_tts.vocoder.tf.utils.tflite import convert_melgan_to_tflite


parser = argparse.ArgumentParser()

@@ -30,4 +30,3 @@ model = load_checkpoint(model, args.tf_model)

# create tflite model
tflite_model = convert_melgan_to_tflite(model, output_path=args.output_path)

@@ -6,13 +6,13 @@ import tensorflow as tf
import torch
from fuzzywuzzy import fuzz

from TTS.utils.io import load_config
from TTS.vocoder.tf.utils.convert_torch_to_tf_utils import (
from mozilla_voice_tts.utils.io import load_config
from mozilla_voice_tts.vocoder.tf.utils.convert_torch_to_tf_utils import (
    compare_torch_tf, convert_tf_name, transfer_weights_torch_to_tf)
from TTS.vocoder.tf.utils.generic_utils import \
from mozilla_voice_tts.vocoder.tf.utils.generic_utils import \
    setup_generator as setup_tf_generator
from TTS.vocoder.tf.utils.io import save_checkpoint
from TTS.vocoder.utils.generic_utils import setup_generator
from mozilla_voice_tts.vocoder.tf.utils.io import save_checkpoint
from mozilla_voice_tts.vocoder.utils.generic_utils import setup_generator

# prevent GPU use
os.environ['CUDA_VISIBLE_DEVICES'] = ''

@@ -114,4 +114,3 @@ assert compare_torch_tf(output_torch, output_tf) < 1e-5, compare_torch_tf(
save_checkpoint(model_tf, checkpoint['step'], checkpoint['epoch'],
                args.output_path)
print(' > Model conversion is successfully completed :).')

@@ -2,11 +2,11 @@

import argparse

from TTS.utils.io import load_config
from TTS.utils.text.symbols import symbols, phonemes
from TTS.tf.utils.generic_utils import setup_model
from TTS.tf.utils.io import load_checkpoint
from TTS.tf.utils.tflite import convert_tacotron2_to_tflite
from mozilla_voice_tts.utils.io import load_config
from mozilla_voice_tts.tts.utils.text.symbols import symbols, phonemes
from mozilla_voice_tts.tts.tf.utils.generic_utils import setup_model
from mozilla_voice_tts.tts.tf.utils.io import load_checkpoint
from mozilla_voice_tts.tts.tf.utils.tflite import convert_tacotron2_to_tflite


parser = argparse.ArgumentParser()

@@ -34,4 +34,4 @@ model = load_checkpoint(model, args.tf_model)
model.decoder.set_max_decoder_steps(1000)

# create tflite model
tflite_model = convert_tacotron2_to_tflite(model, output_path=args.output_path)

@@ -1,21 +1,27 @@
# %%
import sys
sys.path.append('/home/erogol/Projects')
import os
os.environ['CUDA_VISIBLE_DEVICES'] = ''
# %%
import argparse
import numpy as np
import torch
import tensorflow as tf
from fuzzywuzzy import fuzz
import os
import sys
# %%
# print variable match
from pprint import pprint

import numpy as np
import tensorflow as tf
import torch
from fuzzywuzzy import fuzz
from mozilla_voice_tts.tts.tf.models.tacotron2 import Tacotron2
from mozilla_voice_tts.tts.tf.utils.convert_torch_to_tf_utils import (
    compare_torch_tf, convert_tf_name, transfer_weights_torch_to_tf)
from mozilla_voice_tts.tts.tf.utils.generic_utils import save_checkpoint
from mozilla_voice_tts.tts.utils.generic_utils import setup_model
from mozilla_voice_tts.tts.utils.text.symbols import phonemes, symbols
from mozilla_voice_tts.utils.io import load_config

sys.path.append('/home/erogol/Projects')
os.environ['CUDA_VISIBLE_DEVICES'] = ''

from TTS.utils.text.symbols import phonemes, symbols
from TTS.utils.generic_utils import setup_model
from TTS.utils.io import load_config
from TTS.tf.models.tacotron2 import Tacotron2
from TTS.tf.utils.convert_torch_to_tf_utils import compare_torch_tf, tf_create_dummy_inputs, transfer_weights_torch_to_tf, convert_tf_name
from TTS.tf.utils.generic_utils import save_checkpoint

parser = argparse.ArgumentParser()
parser.add_argument('--torch_model_path',

@@ -86,7 +92,7 @@ var_map = [

# %%
# get tf_model graph
mel_pred = model_tf.build_inference()
model_tf.build_inference()

# get tf variables
tf_vars = model_tf.weights

@@ -108,9 +114,6 @@ for tf_name in tf_var_names:
    del torch_var_names[max_idx]
    var_map.append((tf_name, matching_name))

# %%
# print variable match
from pprint import pprint
pprint(var_map)
pprint(torch_var_names)

@@ -0,0 +1,65 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import sys
import pathlib
import time
import subprocess
import argparse
import torch


def main():
    """
    Call train.py as a new process and pass command arguments
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--continue_path',
        type=str,
        help='Training output folder to continue training. Use it to continue a training run. If it is used, "config_path" is ignored.',
        default='',
        required='--config_path' not in sys.argv)
    parser.add_argument(
        '--restore_path',
        type=str,
        help='Model file to be restored. Use it to finetune a model.',
        default='')
    parser.add_argument(
        '--config_path',
        type=str,
        help='Path to config file for training.',
        required='--continue_path' not in sys.argv
    )
    args = parser.parse_args()

    num_gpus = torch.cuda.device_count()
    group_id = time.strftime("%Y_%m_%d-%H%M%S")

    # set arguments for train.py
    folder_path = pathlib.Path(__file__).parent.absolute()
    command = [os.path.join(folder_path, 'train_tts.py')]
    command.append('--continue_path={}'.format(args.continue_path))
    command.append('--restore_path={}'.format(args.restore_path))
    command.append('--config_path={}'.format(args.config_path))
    command.append('--group_id=group_{}'.format(group_id))
    command.append('')

    # run processes
    processes = []
    for i in range(num_gpus):
        my_env = os.environ.copy()
        my_env["PYTHON_EGG_CACHE"] = "/tmp/tmp{}".format(i)
        command[-1] = '--rank={}'.format(i)
        stdout = None if i == 0 else open(os.devnull, 'w')
        p = subprocess.Popen(['python3'] + command, stdout=stdout, env=my_env)
        processes.append(p)
        print(command)

    for p in processes:
        p.wait()


if __name__ == '__main__':
    main()

@@ -0,0 +1,174 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import argparse
import json
# pylint: disable=redefined-outer-name, unused-argument
import os
import string
import time

import torch

from mozilla_voice_tts.tts.utils.generic_utils import setup_model
from mozilla_voice_tts.tts.utils.synthesis import synthesis
from mozilla_voice_tts.tts.utils.text.symbols import make_symbols, phonemes, symbols
from mozilla_voice_tts.utils.audio import AudioProcessor
from mozilla_voice_tts.utils.io import load_config
from mozilla_voice_tts.vocoder.utils.generic_utils import setup_generator


def tts(model, vocoder_model, text, CONFIG, use_cuda, ap, use_gl, speaker_fileid, speaker_embedding=None, gst_style=None):
    t_1 = time.time()
    waveform, _, _, mel_postnet_spec, _, _ = synthesis(model, text, CONFIG, use_cuda, ap, speaker_fileid, gst_style, False, CONFIG.enable_eos_bos_chars, use_gl, speaker_embedding=speaker_embedding)
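    # Tacotron (v1) predicts linear spectrograms from its postnet; convert to
    # mel before handing the result to a mel-conditioned neural vocoder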
    if CONFIG.model == "Tacotron" and not use_gl:
        mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T
    if not use_gl:
        waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))
    if use_cuda and not use_gl:
        waveform = waveform.cpu()
    if not use_gl:
        waveform = waveform.numpy()
    waveform = waveform.squeeze()
    rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)
    tps = (time.time() - t_1) / len(waveform)
    print(" > Run-time: {}".format(time.time() - t_1))
    print(" > Real-time factor: {}".format(rtf))
    print(" > Time per step: {}".format(tps))
    return waveform


if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument('text', type=str, help='Text to generate speech.')
    parser.add_argument('config_path',
                        type=str,
                        help='Path to model config file.')
    parser.add_argument(
        'model_path',
        type=str,
        help='Path to model file.',
    )
    parser.add_argument(
        'out_path',
        type=str,
        help='Path to save the final wav file. The wav file will be named after the given text.',
    )
    parser.add_argument('--use_cuda',
                        type=bool,
                        help='Run model on CUDA.',
                        default=False)
    parser.add_argument(
        '--vocoder_path',
        type=str,
        help=
        'Path to vocoder model file. If it is not defined, the model uses GL as vocoder. Please make sure that you have installed the vocoder library (WaveRNN) before.',
        default="",
    )
    parser.add_argument('--vocoder_config_path',
                        type=str,
                        help='Path to vocoder model config file.',
                        default="")
    parser.add_argument(
        '--batched_vocoder',
        type=bool,
        help="If True, vocoder model uses faster batch processing.",
        default=True)
    parser.add_argument('--speakers_json',
                        type=str,
                        help="JSON file for multi-speaker model.",
                        default="")
    parser.add_argument(
        '--speaker_fileid',
        type=str,
        help="If CONFIG.use_external_speaker_embedding_file is true, the name of the speaker embedding reference file present in speakers.json; else the target speaker_fileid if the model is multi-speaker.",
        default=None)
    parser.add_argument(
        '--gst_style',
        help="Path to a wav file for GST style reference.",
        default=None)

    args = parser.parse_args()

    # load the config
    C = load_config(args.config_path)
    C.forward_attn_mask = True

    # load the audio processor
    ap = AudioProcessor(**C.audio)

    # if the vocabulary was passed, replace the default
    if 'characters' in C.keys():
        symbols, phonemes = make_symbols(**C.characters)

    speaker_embedding = None
    speaker_embedding_dim = None
    num_speakers = 0

    # load speakers
    if args.speakers_json != '':
        speaker_mapping = json.load(open(args.speakers_json, 'r'))
        num_speakers = len(speaker_mapping)
        if C.use_external_speaker_embedding_file:
            if args.speaker_fileid is not None:
                speaker_embedding = speaker_mapping[args.speaker_fileid]['embedding']
            else:  # if speaker_fileid is not specified, use the first sample in speakers.json
                speaker_embedding = speaker_mapping[list(speaker_mapping.keys())[0]]['embedding']
            speaker_embedding_dim = len(speaker_embedding)

    # load the model
    num_chars = len(phonemes) if C.use_phonemes else len(symbols)
    model = setup_model(num_chars, num_speakers, C, speaker_embedding_dim)
    cp = torch.load(args.model_path, map_location=torch.device('cpu'))
    model.load_state_dict(cp['model'])
    model.eval()
    if args.use_cuda:
        model.cuda()
    model.decoder.set_r(cp['r'])

    # load vocoder model
    if args.vocoder_path != "":
        VC = load_config(args.vocoder_config_path)
        vocoder_model = setup_generator(VC)
        vocoder_model.load_state_dict(torch.load(args.vocoder_path, map_location="cpu")["model"])
        vocoder_model.remove_weight_norm()
        if args.use_cuda:
            vocoder_model.cuda()
        vocoder_model.eval()
    else:
        vocoder_model = None
        VC = None

    # synthesize voice
    use_griffin_lim = args.vocoder_path == ""
    print(" > Text: {}".format(args.text))

    if not C.use_external_speaker_embedding_file:
        if args.speaker_fileid is not None and args.speaker_fileid.isdigit():
            args.speaker_fileid = int(args.speaker_fileid)
        else:
            args.speaker_fileid = None
    else:
        args.speaker_fileid = None

    if args.gst_style is None:
        gst_style = C.gst['gst_style_input']
    else:
        # check if the gst_style string is a dict; if it is a dict, convert it, else use the string
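        # e.g. --gst_style '{"0": 0.2, "1": -0.1}' weights individual style
        # tokens, while a plain string is treated as a reference wav path
        # (illustrative values, not from the original script)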
        try:
            gst_style = json.loads(args.gst_style)
            if max(map(int, gst_style.keys())) >= C.gst['gst_style_tokens']:
                raise RuntimeError("The highest value of the gst_style dictionary key must be less than the number of GST Tokens, \n Highest dictionary key value: {} \n Number of GST tokens: {}".format(max(map(int, gst_style.keys())), C.gst['gst_style_tokens']))
        except ValueError:
            gst_style = args.gst_style

    wav = tts(model, vocoder_model, args.text, C, args.use_cuda, ap, use_griffin_lim, args.speaker_fileid, speaker_embedding=speaker_embedding, gst_style=gst_style)

    # save the results
    file_name = args.text.replace(" ", "_")
    file_name = file_name.translate(
        str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'
    out_path = os.path.join(args.out_path, file_name)
    print(" > Saving output to {}".format(out_path))
    ap.save_wav(wav, out_path)

@@ -1,3 +1,6 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import argparse
import os
import sys

@@ -6,19 +9,22 @@ import traceback

import torch
from torch.utils.data import DataLoader
from TTS.datasets.preprocess import load_meta_data
from TTS.speaker_encoder.dataset import MyDataset
from TTS.speaker_encoder.loss import GE2ELoss
from TTS.speaker_encoder.model import SpeakerEncoder
from TTS.speaker_encoder.visual import plot_embeddings
from TTS.speaker_encoder.generic_utils import save_best_model
from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import (create_experiment_folder, get_git_branch,
                                     remove_experiment_folder, set_init_dict)
from TTS.utils.io import load_config, copy_config_file
from TTS.utils.training import check_update, NoamLR
from TTS.utils.tensorboard_logger import TensorboardLogger
from TTS.utils.radam import RAdam

from mozilla_voice_tts.speaker_encoder.dataset import MyDataset
from mozilla_voice_tts.speaker_encoder.generic_utils import save_best_model
from mozilla_voice_tts.speaker_encoder.losses import GE2ELoss, AngleProtoLoss
from mozilla_voice_tts.speaker_encoder.model import SpeakerEncoder
from mozilla_voice_tts.speaker_encoder.visual import plot_embeddings
from mozilla_voice_tts.tts.datasets.preprocess import load_meta_data
from mozilla_voice_tts.tts.utils.generic_utils import (
    create_experiment_folder, get_git_branch, remove_experiment_folder,
    set_init_dict)
from mozilla_voice_tts.tts.utils.io import copy_config_file, load_config
from mozilla_voice_tts.utils.audio import AudioProcessor
from mozilla_voice_tts.utils.generic_utils import count_parameters
from mozilla_voice_tts.utils.radam import RAdam
from mozilla_voice_tts.utils.tensorboard_logger import TensorboardLogger
from mozilla_voice_tts.utils.training import NoamLR, check_update

torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = True

@@ -94,7 +100,7 @@ def train(model, criterion, optimizer, scheduler, ap, global_step):
        if global_step % c.steps_plot_stats == 0:
            # Plot Training Epoch Stats
            train_stats = {
                "GE2Eloss": avg_loss,
                "loss": avg_loss,
                "lr": current_lr,
                "grad_norm": grad_norm,
                "step_time": step_time

@@ -129,12 +135,18 @@ def main(args):  # pylint: disable=redefined-outer-name
    global meta_data_eval

    ap = AudioProcessor(**c.audio)
    model = SpeakerEncoder(input_dim=40,
                           proj_dim=128,
                           lstm_dim=384,
                           num_lstm_layers=3)
    model = SpeakerEncoder(input_dim=c.model['input_dim'],
                           proj_dim=c.model['proj_dim'],
                           lstm_dim=c.model['lstm_dim'],
                           num_lstm_layers=c.model['num_lstm_layers'])
    optimizer = RAdam(model.parameters(), lr=c.lr)
    criterion = GE2ELoss(loss_method='softmax')

    if c.loss == "ge2e":
        criterion = GE2ELoss(loss_method='softmax')
    elif c.loss == "angleproto":
        criterion = AngleProtoLoss()
    else:
        raise Exception("%s is not a supported loss" % c.loss)

    if args.restore_path:
        checkpoint = torch.load(args.restore_path)

@@ -177,8 +189,8 @@ def main(args):  # pylint: disable=redefined-outer-name
    meta_data_train, meta_data_eval = load_meta_data(c.datasets)

    global_step = args.restore_step
    train_loss, global_step = train(model, criterion, optimizer, scheduler, ap,
                                    global_step)
    _, global_step = train(model, criterion, optimizer, scheduler, ap,
                           global_step)


if __name__ == '__main__':

@@ -236,7 +248,7 @@ if __name__ == '__main__':
                         new_fields)

    LOG_DIR = OUT_PATH
    tb_logger = TensorboardLogger(LOG_DIR)
    tb_logger = TensorboardLogger(LOG_DIR, model_name='Speaker_Encoder')

    try:
        main(args)

@@ -1,7 +1,10 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import argparse
import glob
import os
import sys
import glob
import time
import traceback

@@ -9,42 +12,51 @@ import numpy as np
import torch
from torch.utils.data import DataLoader

from TTS.datasets.TTSDataset import MyDataset
from distribute import (DistributedSampler, apply_gradient_allreduce,
                        init_distributed, reduce_tensor)
from TTS.layers.losses import TacotronLoss
from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import (count_parameters, create_experiment_folder, remove_experiment_folder,
                                     get_git_branch, set_init_dict,
                                     setup_model, KeepAverage, check_config)
from TTS.utils.io import (save_best_model, save_checkpoint,
                          load_config, copy_config_file)
from TTS.utils.training import (NoamLR, check_update, adam_weight_decay,
                                gradual_training_scheduler, set_weight_decay,
                                setup_torch_training_env)
from TTS.utils.tensorboard_logger import TensorboardLogger
from TTS.utils.console_logger import ConsoleLogger
from TTS.utils.speakers import load_speaker_mapping, save_speaker_mapping, \
    get_speakers
from TTS.utils.synthesis import synthesis
from TTS.utils.text.symbols import make_symbols, phonemes, symbols
from TTS.utils.visual import plot_alignment, plot_spectrogram
from TTS.datasets.preprocess import load_meta_data
from TTS.utils.radam import RAdam
from TTS.utils.measures import alignment_diagonal_score

from mozilla_voice_tts.tts.datasets.preprocess import load_meta_data
from mozilla_voice_tts.tts.datasets.TTSDataset import MyDataset
from mozilla_voice_tts.tts.layers.losses import TacotronLoss
from mozilla_voice_tts.tts.utils.distribute import (DistributedSampler,
                                                    apply_gradient_allreduce,
                                                    init_distributed,
                                                    reduce_tensor)
from mozilla_voice_tts.tts.utils.generic_utils import check_config, setup_model
from mozilla_voice_tts.tts.utils.io import save_best_model, save_checkpoint
from mozilla_voice_tts.tts.utils.measures import alignment_diagonal_score
from mozilla_voice_tts.tts.utils.speakers import (get_speakers,
                                                  load_speaker_mapping,
                                                  save_speaker_mapping)
from mozilla_voice_tts.tts.utils.synthesis import synthesis
from mozilla_voice_tts.tts.utils.text.symbols import (make_symbols, phonemes,
                                                      symbols)
from mozilla_voice_tts.tts.utils.visual import plot_alignment, plot_spectrogram
from mozilla_voice_tts.utils.audio import AudioProcessor
from mozilla_voice_tts.utils.console_logger import ConsoleLogger
from mozilla_voice_tts.utils.generic_utils import (KeepAverage,
                                                   count_parameters,
                                                   create_experiment_folder,
                                                   get_git_branch,
                                                   remove_experiment_folder,
                                                   set_init_dict)
from mozilla_voice_tts.utils.io import copy_config_file, load_config
from mozilla_voice_tts.utils.radam import RAdam
from mozilla_voice_tts.utils.tensorboard_logger import TensorboardLogger
from mozilla_voice_tts.utils.training import (NoamLR, adam_weight_decay,
                                              check_update,
                                              gradual_training_scheduler,
                                              set_weight_decay,
                                              setup_torch_training_env)

use_cuda, num_gpus = setup_torch_training_env(True, False)


def setup_loader(ap, r, is_val=False, verbose=False):
def setup_loader(ap, r, is_val=False, verbose=False, speaker_mapping=None):
    if is_val and not c.run_eval:
        loader = None
    else:
        dataset = MyDataset(
            r,
            c.text_cleaner,
            compute_linear_spec=True if c.model.lower() == 'tacotron' else False,
            compute_linear_spec=c.model.lower() == 'tacotron',
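            # linear spectrograms are only needed by Tacotron (v1), for its
            # linear postnet and Griffin-Lim; Tacotron2 trains on mels alone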
            meta_data=meta_data_eval if is_val else meta_data_train,
            ap=ap,
            tp=c.characters if 'characters' in c.keys() else None,

@@ -56,7 +68,8 @@ def setup_loader(ap, r, is_val=False, verbose=False):
            use_phonemes=c.use_phonemes,
            phoneme_language=c.phoneme_language,
            enable_eos_bos=c.enable_eos_bos_chars,
            verbose=verbose)
            verbose=verbose,
            speaker_mapping=speaker_mapping if c.use_speaker_embedding and c.use_external_speaker_embedding_file else None)
        sampler = DistributedSampler(dataset) if num_gpus > 1 else None
        loader = DataLoader(
            dataset,

@@ -70,9 +83,8 @@ def setup_loader(ap, r, is_val=False, verbose=False):
            pin_memory=False)
    return loader


def format_data(data):
    if c.use_speaker_embedding:
def format_data(data, speaker_mapping=None):
    if speaker_mapping is None and c.use_speaker_embedding and not c.use_external_speaker_embedding_file:
        speaker_mapping = load_speaker_mapping(OUT_PATH)

    # setup input data

@@ -87,13 +99,20 @@ def format_data(data):
    avg_spec_length = torch.mean(mel_lengths.float())

    if c.use_speaker_embedding:
        speaker_ids = [
            speaker_mapping[speaker_name] for speaker_name in speaker_names
        ]
        speaker_ids = torch.LongTensor(speaker_ids)
        if c.use_external_speaker_embedding_file:
            speaker_embeddings = data[8]
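            # external (precomputed) speaker embeddings collated into the
            # batch by the data loader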
            speaker_ids = None
        else:
            speaker_ids = [
                speaker_mapping[speaker_name] for speaker_name in speaker_names
            ]
            speaker_ids = torch.LongTensor(speaker_ids)
            speaker_embeddings = None
    else:
        speaker_embeddings = None
        speaker_ids = None

    # set stop targets view, we predict a single stop token per iteration.
    stop_targets = stop_targets.view(text_input.shape[0],
                                     stop_targets.size(1) // c.r, -1)

@@ -110,13 +129,16 @@ def format_data(data):
        stop_targets = stop_targets.cuda(non_blocking=True)
        if speaker_ids is not None:
            speaker_ids = speaker_ids.cuda(non_blocking=True)
    return text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, avg_text_length, avg_spec_length
        if speaker_embeddings is not None:
            speaker_embeddings = speaker_embeddings.cuda(non_blocking=True)

    return text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, speaker_embeddings, avg_text_length, avg_spec_length


def train(model, criterion, optimizer, optimizer_st, scheduler,
          ap, global_step, epoch):
          ap, global_step, epoch, amp, speaker_mapping=None):
    data_loader = setup_loader(ap, model.decoder.r, is_val=False,
                               verbose=(epoch == 0))
                               verbose=(epoch == 0), speaker_mapping=speaker_mapping)
    model.train()
    epoch_time = 0
    keep_avg = KeepAverage()

@@ -131,7 +153,7 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,
        start_time = time.time()

        # format data
        text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, avg_text_length, avg_spec_length = format_data(data)
        text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, speaker_embeddings, avg_text_length, avg_spec_length = format_data(data, speaker_mapping)
        loader_time = time.time() - end_time

        global_step += 1

@@ -146,14 +168,14 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,
        # forward pass model
        if c.bidirectional_decoder or c.double_decoder_consistency:
            decoder_output, postnet_output, alignments, stop_tokens, decoder_backward_output, alignments_backward = model(
                text_input, text_lengths, mel_input, mel_lengths, speaker_ids=speaker_ids)
                text_input, text_lengths, mel_input, mel_lengths, speaker_ids=speaker_ids, speaker_embeddings=speaker_embeddings)
        else:
            decoder_output, postnet_output, alignments, stop_tokens = model(
                text_input, text_lengths, mel_input, mel_lengths, speaker_ids=speaker_ids)
                text_input, text_lengths, mel_input, mel_lengths, speaker_ids=speaker_ids, speaker_embeddings=speaker_embeddings)
            decoder_backward_output = None
            alignments_backward = None

        # set the alignment lengths wrt reduction factor for guided attention
        if mel_lengths.max() % model.decoder.r != 0:
            alignment_lengths = (mel_lengths + (model.decoder.r - (mel_lengths.max() % model.decoder.r))) // model.decoder.r
        else:

@@ -167,9 +189,18 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,
                              text_lengths)

        # backward pass
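        # with apex AMP, the loss is scaled before backward to avoid fp16
        # underflow; gradient clipping then reads amp's master (fp32) params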
        loss_dict['loss'].backward()
        if amp is not None:
            with amp.scale_loss(loss_dict['loss'], optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss_dict['loss'].backward()

        optimizer, current_lr = adam_weight_decay(optimizer)
        grad_norm, _ = check_update(model, c.grad_clip, ignore_stopnet=True)
        if amp:
            amp_opt_params = amp.master_params(optimizer)
        else:
            amp_opt_params = None
        grad_norm, _ = check_update(model, c.grad_clip, ignore_stopnet=True, amp_opt_params=amp_opt_params)
        optimizer.step()

        # compute alignment error (the lower, the better)

@@ -180,7 +211,11 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,
        if c.separate_stopnet:
            loss_dict['stopnet_loss'].backward()
            optimizer_st, _ = adam_weight_decay(optimizer_st)
            grad_norm_st, _ = check_update(model.decoder.stopnet, 1.0)
            if amp:
                amp_opt_params = amp.master_params(optimizer)
            else:
                amp_opt_params = None
            grad_norm_st, _ = check_update(model.decoder.stopnet, 1.0, amp_opt_params=amp_opt_params)
            optimizer_st.step()
        else:
            grad_norm_st = 0

@@ -214,10 +249,15 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,

        # print training progress
        if global_step % c.print_step == 0:
            log_dict = {
                "avg_spec_length": [avg_spec_length, 1],  # value, precision
                "avg_text_length": [avg_text_length, 1],
                "step_time": [step_time, 4],
                "loader_time": [loader_time, 2],
                "current_lr": current_lr,
            }
            c_logger.print_train_step(batch_n_iter, num_iter, global_step,
                                      avg_spec_length, avg_text_length,
                                      step_time, loader_time, current_lr,
                                      loss_dict, keep_avg.avg_values)
                                      log_dict, loss_dict, keep_avg.avg_values)

        if args.rank == 0:
            # Plot Training Iter Stats

@@ -237,7 +277,8 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,
                # save model
                save_checkpoint(model, optimizer, global_step, epoch, model.decoder.r, OUT_PATH,
                                optimizer_st=optimizer_st,
                                model_loss=loss_dict['postnet_loss'])
                                model_loss=loss_dict['postnet_loss'],
                                amp_state_dict=amp.state_dict() if amp else None)

                # Diagnostic visualizations
                const_spec = postnet_output[0].data.cpu().numpy()

@@ -247,13 +288,13 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,
                align_img = alignments[0].data.cpu().numpy()

                figures = {
                    "prediction": plot_spectrogram(const_spec, ap),
                    "ground_truth": plot_spectrogram(gt_spec, ap),
                    "alignment": plot_alignment(align_img),
                    "prediction": plot_spectrogram(const_spec, ap, output_fig=False),
                    "ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False),
                    "alignment": plot_alignment(align_img, output_fig=False),
                }

                if c.bidirectional_decoder or c.double_decoder_consistency:
                    figures["alignment_backward"] = plot_alignment(alignments_backward[0].data.cpu().numpy())
                    figures["alignment_backward"] = plot_alignment(alignments_backward[0].data.cpu().numpy(), output_fig=False)

                tb_logger.tb_train_figures(global_step, figures)

@@ -281,8 +322,8 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,


@torch.no_grad()
def evaluate(model, criterion, ap, global_step, epoch):
    data_loader = setup_loader(ap, model.decoder.r, is_val=True)
def evaluate(model, criterion, ap, global_step, epoch, speaker_mapping=None):
    data_loader = setup_loader(ap, model.decoder.r, is_val=True, speaker_mapping=speaker_mapping)
    model.eval()
    epoch_time = 0
    keep_avg = KeepAverage()

@@ -292,16 +333,16 @@ def evaluate(model, criterion, ap, global_step, epoch):
        start_time = time.time()

        # format data
        text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, _, _ = format_data(data)
        text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, speaker_embeddings, _, _ = format_data(data, speaker_mapping)
        assert mel_input.shape[1] % model.decoder.r == 0

        # forward pass model
        if c.bidirectional_decoder or c.double_decoder_consistency:
            decoder_output, postnet_output, alignments, stop_tokens, decoder_backward_output, alignments_backward = model(
                text_input, text_lengths, mel_input, speaker_ids=speaker_ids)
                text_input, text_lengths, mel_input, speaker_ids=speaker_ids, speaker_embeddings=speaker_embeddings)
        else:
            decoder_output, postnet_output, alignments, stop_tokens = model(
                text_input, text_lengths, mel_input, speaker_ids=speaker_ids)
                text_input, text_lengths, mel_input, speaker_ids=speaker_ids, speaker_embeddings=speaker_embeddings)
            decoder_backward_output = None
            alignments_backward = None

@@ -361,9 +402,9 @@ def evaluate(model, criterion, ap, global_step, epoch):
            align_img = alignments[idx].data.cpu().numpy()

            eval_figures = {
                "prediction": plot_spectrogram(const_spec, ap),
                "ground_truth": plot_spectrogram(gt_spec, ap),
                "alignment": plot_alignment(align_img)
                "prediction": plot_spectrogram(const_spec, ap, output_fig=False),
                "ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False),
                "alignment": plot_alignment(align_img, output_fig=False)
            }

            # Sample audio

@@ -378,7 +419,7 @@ def evaluate(model, criterion, ap, global_step, epoch):

        if c.bidirectional_decoder or c.double_decoder_consistency:
            align_b_img = alignments_backward[idx].data.cpu().numpy()
            eval_figures['alignment2'] = plot_alignment(align_b_img)
            eval_figures['alignment2'] = plot_alignment(align_b_img, output_fig=False)
        tb_logger.tb_eval_stats(global_step, keep_avg.avg_values)
        tb_logger.tb_eval_figures(global_step, eval_figures)

@@ -403,7 +444,7 @@ def evaluate(model, criterion, ap, global_step, epoch):
        style_wav = c.get("style_wav_for_test")
        for idx, test_sentence in enumerate(test_sentences):
            try:
                wav, alignment, decoder_output, postnet_output, stop_tokens, inputs = synthesis(
                wav, alignment, decoder_output, postnet_output, stop_tokens, _ = synthesis(
                    model,
                    test_sentence,
                    c,

@@ -423,10 +464,10 @@ def evaluate(model, criterion, ap, global_step, epoch):
                ap.save_wav(wav, file_path)
                test_audios['{}-audio'.format(idx)] = wav
                test_figures['{}-prediction'.format(idx)] = plot_spectrogram(
                    postnet_output, ap)
                    postnet_output, ap, output_fig=False)
                test_figures['{}-alignment'.format(idx)] = plot_alignment(
                    alignment)
            except:
                    alignment, output_fig=False)
            except:  #pylint: disable=bare-except
                print(" !! Error creating Test Sentence -", idx)
                traceback.print_exc()
        tb_logger.tb_test_audios(global_step, test_audios,

@@ -453,26 +494,51 @@ def main(args):  # pylint: disable=redefined-outer-name
    # load data instances
    meta_data_train, meta_data_eval = load_meta_data(c.datasets)

    # set the portion of the data used for training
    if 'train_portion' in c.keys():
        meta_data_train = meta_data_train[:int(len(meta_data_train) * c.train_portion)]
    if 'eval_portion' in c.keys():
        meta_data_eval = meta_data_eval[:int(len(meta_data_eval) * c.eval_portion)]

    # parse speakers
    if c.use_speaker_embedding:
        speakers = get_speakers(meta_data_train)
        if args.restore_path:
            prev_out_path = os.path.dirname(args.restore_path)
            speaker_mapping = load_speaker_mapping(prev_out_path)
            assert all([speaker in speaker_mapping
                        for speaker in speakers]), "As of now, you cannot " \
                                                   "introduce new speakers to " \
                                                   "a previously trained model."
        else:
            if c.use_external_speaker_embedding_file:  # if restore checkpoint and use External Embedding file
                prev_out_path = os.path.dirname(args.restore_path)
                speaker_mapping = load_speaker_mapping(prev_out_path)
                if not speaker_mapping:
                    print("WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file")
                    speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file)
                    if not speaker_mapping:
                        raise RuntimeError("You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.external_speaker_embedding_file")
                speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]['embedding'])
            elif not c.use_external_speaker_embedding_file:  # if restore checkpoint and don't use External Embedding file
                prev_out_path = os.path.dirname(args.restore_path)
                speaker_mapping = load_speaker_mapping(prev_out_path)
                speaker_embedding_dim = None
                assert all([speaker in speaker_mapping
                            for speaker in speakers]), "As of now, you cannot " \
                                                       "introduce new speakers to " \
                                                       "a previously trained model."
        elif c.use_external_speaker_embedding_file and c.external_speaker_embedding_file:  # if start new train using External Embedding file
            speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file)
            speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]['embedding'])
        elif c.use_external_speaker_embedding_file and not c.external_speaker_embedding_file:  # if start new train using External Embedding file and don't pass external embedding file
            raise RuntimeError("use_external_speaker_embedding_file is True, so you need to pass an external speaker embedding file; run GE2E-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb or AngularPrototypical-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb in the notebooks/ folder")
        else:  # if start new train and don't use External Embedding file
            speaker_mapping = {name: i for i, name in enumerate(speakers)}
            speaker_embedding_dim = None
        save_speaker_mapping(OUT_PATH, speaker_mapping)
        num_speakers = len(speaker_mapping)
        print("Training with {} speakers: {}".format(num_speakers,
                                                     ", ".join(speakers)))
    else:
        num_speakers = 0
        speaker_embedding_dim = None
        speaker_mapping = None

    model = setup_model(num_chars, num_speakers, c)
    model = setup_model(num_chars, num_speakers, c, speaker_embedding_dim)

    params = set_weight_decay(model, c.wd)
    optimizer = RAdam(params, lr=c.lr, weight_decay=0)

@@ -483,6 +549,14 @@ def main(args):  # pylint: disable=redefined-outer-name
    else:
        optimizer_st = None

    if c.apex_amp_level == "O1":
        # pylint: disable=import-outside-toplevel
        from apex import amp
        model.cuda()
        model, optimizer = amp.initialize(model, optimizer, opt_level=c.apex_amp_level)
    else:
        amp = None

    # setup criterion
    criterion = TacotronLoss(c, stopnet_pos_weight=10.0, ga_sigma=0.4)

@@ -495,12 +569,18 @@ def main(args):  # pylint: disable=redefined-outer-name
            if c.reinit_layers:
                raise RuntimeError
            model.load_state_dict(checkpoint['model'])
        except:
        except KeyError:
            print(" > Partial model initialization.")
            model_dict = model.state_dict()
            model_dict = set_init_dict(model_dict, checkpoint['model'], c)
            # torch.save(model_dict, os.path.join(OUT_PATH, 'state_dict.pt'))
            # print("State Dict saved for debug in: ", os.path.join(OUT_PATH, 'state_dict.pt'))
            model.load_state_dict(model_dict)
            del model_dict

        if amp and 'amp' in checkpoint:
            amp.load_state_dict(checkpoint['amp'])

        for group in optimizer.param_groups:
            group['lr'] = c.lr
        print(" > Model restored from step %d" % checkpoint['step'],

@@ -543,14 +623,14 @@ def main(args):  # pylint: disable=redefined-outer-name
        print("\n > Number of output frames:", model.decoder.r)
        train_avg_loss_dict, global_step = train(model, criterion, optimizer,
                                                 optimizer_st, scheduler, ap,
                                                 global_step, epoch)
                                                 global_step, epoch, amp, speaker_mapping)
        eval_avg_loss_dict = evaluate(model, criterion, ap, global_step, epoch)
        c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
        target_loss = train_avg_loss_dict['avg_postnet_loss']
        if c.run_eval:
            target_loss = eval_avg_loss_dict['avg_postnet_loss']
        best_loss = save_best_model(target_loss, best_loss, model, optimizer, global_step, epoch, c.r,
                                    OUT_PATH)
                                    OUT_PATH, amp_state_dict=amp.state_dict() if amp else None)


if __name__ == '__main__':

@@ -602,6 +682,9 @@ if __name__ == '__main__':
    check_config(c)
    _ = os.path.dirname(os.path.realpath(__file__))

    if c.apex_amp_level == 'O1':
        print(" > apex AMP level: ", c.apex_amp_level)

    OUT_PATH = args.continue_path
    if args.continue_path == '':
        OUT_PATH = create_experiment_folder(c.output_path, c.run_name, args.debug)

@@ -4,31 +4,34 @@ import os
import sys
import time
import traceback
from inspect import signature

import torch
from torch.utils.data import DataLoader

from inspect import signature

from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import (KeepAverage, count_parameters,
                                     create_experiment_folder, get_git_branch,
                                     remove_experiment_folder, set_init_dict)
from TTS.utils.io import copy_config_file, load_config
from TTS.utils.radam import RAdam
from TTS.utils.tensorboard_logger import TensorboardLogger
from TTS.utils.training import setup_torch_training_env
from TTS.vocoder.datasets.gan_dataset import GANDataset
from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
from mozilla_voice_tts.utils.audio import AudioProcessor
from mozilla_voice_tts.utils.console_logger import ConsoleLogger
from mozilla_voice_tts.utils.generic_utils import (KeepAverage,
                                                   count_parameters,
                                                   create_experiment_folder,
                                                   get_git_branch,
                                                   remove_experiment_folder,
                                                   set_init_dict)
from mozilla_voice_tts.utils.io import copy_config_file, load_config
from mozilla_voice_tts.utils.radam import RAdam
from mozilla_voice_tts.utils.tensorboard_logger import TensorboardLogger
from mozilla_voice_tts.utils.training import setup_torch_training_env
from mozilla_voice_tts.vocoder.datasets.gan_dataset import GANDataset
from mozilla_voice_tts.vocoder.datasets.preprocess import (load_wav_data,
                                                           load_wav_feat_data)
# from distribute import (DistributedSampler, apply_gradient_allreduce,
#                         init_distributed, reduce_tensor)
from TTS.vocoder.layers.losses import DiscriminatorLoss, GeneratorLoss
from TTS.vocoder.utils.io import save_checkpoint, save_best_model
from TTS.vocoder.utils.console_logger import ConsoleLogger
from TTS.vocoder.utils.generic_utils import (check_config, plot_results,
                                             setup_discriminator,
                                             setup_generator)

from mozilla_voice_tts.vocoder.layers.losses import (DiscriminatorLoss,
                                                     GeneratorLoss)
from mozilla_voice_tts.vocoder.utils.generic_utils import (plot_results,
                                                           setup_discriminator,
                                                           setup_generator)
from mozilla_voice_tts.vocoder.utils.io import save_best_model, save_checkpoint

use_cuda, num_gpus = setup_torch_training_env(True, True)

@@ -124,6 +127,7 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D,
            y_hat_vis = y_hat
            y_G_sub = model_G.pqmf_analysis(y_G)

        scores_fake, feats_fake, feats_real = None, None, None
        if global_step > c.steps_to_start_discriminator:

            # run D with or without cond. features

@@ -146,8 +150,6 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D,
                    _, feats_real = D_out_real
                else:
                    scores_fake = D_out_fake
        else:
            scores_fake, feats_fake, feats_real = None, None, None

        # compute losses
        loss_G_dict = criterion_G(y_hat, y_G, scores_fake, feats_fake,

@@ -239,10 +241,14 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D,

        # print training stats
        if global_step % c.print_step == 0:
            log_dict = {
                'step_time': [step_time, 2],
                'loader_time': [loader_time, 4],
                "current_lr_G": current_lr_G,
                "current_lr_D": current_lr_D
            }
            c_logger.print_train_step(batch_n_iter, num_iter, global_step,
                                      step_time, loader_time, current_lr_G,
                                      current_lr_D, loss_dict,
                                      keep_avg.avg_values)
                                      log_dict, loss_dict, keep_avg.avg_values)

        # plot step stats
        if global_step % 10 == 0:

@@ -328,6 +334,7 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch)
            y_G_sub = model_G.pqmf_analysis(y_G)


        scores_fake, feats_fake, feats_real = None, None, None
        if global_step > c.steps_to_start_discriminator:

            if len(signature(model_D.forward).parameters) == 2:

@@ -349,8 +356,7 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch)
                    _, feats_real = D_out_real
                else:
                    scores_fake = D_out_fake
        else:
            scores_fake, feats_fake, feats_real = None, None, None
                    feats_fake, feats_real = None, None

        # compute losses
        loss_G_dict = criterion_G(y_hat, y_G, scores_fake, feats_fake,

@@ -615,7 +621,7 @@ if __name__ == '__main__':

    # setup output paths and read configs
    c = load_config(args.config_path)
    check_config(c)
    # check_config(c)
    _ = os.path.dirname(os.path.realpath(__file__))

    OUT_PATH = args.continue_path

@@ -15,7 +15,7 @@ If you have the environment set already for TTS, then you can directly call ```s
3. source /tmp/venv/bin/activate
4. pip install -U pip setuptools wheel
5. pip install -U https://example.com/url/to/python/package.whl
6. python -m TTS.server.server
6. python -m mozilla_voice_tts.server.server

You can now open http://localhost:5002 in a browser
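
Once the server is running, you can also query it from code. A minimal sketch using `requests`, assuming the demo server's default `/api/tts` endpoint and the port shown above:

```python
import requests

# fetch synthesized speech for a sentence; the endpoint responds with a wav payload
resp = requests.get(
    "http://localhost:5002/api/tts",
    params={"text": "Hello from Mozilla TTS."},
)
resp.raise_for_status()
with open("out.wav", "wb") as f:
    f.write(resp.content)
```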

@@ -3,7 +3,7 @@ import argparse
import os

from flask import Flask, request, render_template, send_file
from TTS.server.synthesizer import Synthesizer
from mozilla_voice_tts.server.synthesizer import Synthesizer


def create_argparser():

@@ -18,8 +18,8 @@ def create_argparser():
    parser.add_argument('--wavernn_checkpoint', type=str, default=None, help='path to WaveRNN checkpoint file.')
    parser.add_argument('--wavernn_config', type=str, default=None, help='path to WaveRNN config file.')
    parser.add_argument('--is_wavernn_batched', type=convert_boolean, default=False, help='true to use batched WaveRNN.')
    parser.add_argument('--vocoder_config', type=str, default=None, help='path to TTS.vocoder config file.')
    parser.add_argument('--vocoder_checkpoint', type=str, default=None, help='path to TTS.vocoder checkpoint file.')
    parser.add_argument('--vocoder_config', type=str, default=None, help='path to mozilla_voice_tts.vocoder config file.')
    parser.add_argument('--vocoder_checkpoint', type=str, default=None, help='path to mozilla_voice_tts.vocoder checkpoint file.')
    parser.add_argument('--port', type=int, default=5002, help='port to listen on.')
    parser.add_argument('--use_cuda', type=convert_boolean, default=False, help='true to use CUDA.')
    parser.add_argument('--debug', type=convert_boolean, default=False, help='true to enable Flask debug mode.')
@@ -4,19 +4,18 @@ import time

 import numpy as np
 import torch
 import yaml
 import pysbd

-from TTS.utils.audio import AudioProcessor
-from TTS.utils.io import load_config
-from TTS.utils.generic_utils import setup_model
-from TTS.utils.speakers import load_speaker_mapping
-from TTS.vocoder.utils.generic_utils import setup_generator
+from mozilla_voice_tts.utils.audio import AudioProcessor
+from mozilla_voice_tts.utils.io import load_config
+from mozilla_voice_tts.tts.utils.generic_utils import setup_model
+from mozilla_voice_tts.tts.utils.speakers import load_speaker_mapping
+from mozilla_voice_tts.vocoder.utils.generic_utils import setup_generator
 # pylint: disable=unused-wildcard-import
 # pylint: disable=wildcard-import
-from TTS.utils.synthesis import *
+from mozilla_voice_tts.tts.utils.synthesis import *

-from TTS.utils.text import make_symbols, phonemes, symbols
+from mozilla_voice_tts.tts.utils.text import make_symbols, phonemes, symbols


 class Synthesizer(object):
@@ -10,7 +10,7 @@ Below is an example showing embedding results of various speakers. You can gener

Download a pretrained model from [Released Models](https://github.com/mozilla/TTS/wiki/Released-Models) page.

-To run the code, you need to follow the same flow as in TTS.
+To run the code, you need to follow the same flow as in mozilla_voice_tts.

- Define 'config.json' for your needs. Note that, audio parameters should match your TTS model.
- Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360```
@@ -6,9 +6,9 @@ import numpy as np
 from tqdm import tqdm

 import torch
-from TTS.speaker_encoder.model import SpeakerEncoder
-from TTS.utils.audio import AudioProcessor
-from TTS.utils.generic_utils import load_config
+from mozilla_voice_tts.speaker_encoder.model import SpeakerEncoder
+from mozilla_voice_tts.tts.utils.audio import AudioProcessor
+from mozilla_voice_tts.tts.utils.generic_utils import load_config

 parser = argparse.ArgumentParser(
     description='Compute embedding vectors for each wav file in a dataset. ')
@@ -0,0 +1,61 @@
{
    "run_name": "Model compatible to CorentinJ/Real-Time-Voice-Cloning",
    "run_description": "train speaker encoder with voxceleb1, voxceleb2 and libriSpeech",
    "audio": {
        // Audio processing parameters
        "num_mels": 40,           // size of the mel spec frame.
        "fft_size": 400,          // number of stft frequency levels. Size of the linear spectrogram frame.
        "sample_rate": 16000,     // DATASET-RELATED: wav sample rate. If different from the original data, it is resampled.
        "win_length": 400,        // stft window length in samples.
        "hop_length": 160,        // stft window hop length in samples.
        "frame_length_ms": null,  // stft window length in ms. If null, 'win_length' is used.
        "frame_shift_ms": null,   // stft window hop length in ms. If null, 'hop_length' is used.
        "preemphasis": 0.98,      // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.
        "min_level_db": -100,     // normalization range.
        "ref_level_db": 20,       // reference level db; theoretically 20 db is the sound of air.
        "power": 1.5,             // value to sharpen wav signals after the GL algorithm.
        "griffin_lim_iters": 60,  // number of Griffin-Lim iterations. 30-60 is a good range. The larger the value, the slower the generation.
        // Normalization parameters
        "signal_norm": true,      // normalize the spec values to the range [0, 1].
        "symmetric_norm": true,   // move normalization to the range [-1, 1].
        "max_norm": 4.0,          // scale normalization to the range [-max_norm, max_norm] or [0, max_norm].
        "clip_norm": true,        // clip normalized values into the range.
        "mel_fmin": 0.0,          // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for your dataset!
        "mel_fmax": 8000.0,       // maximum freq level for mel-spec. Tune for your dataset!
        "do_trim_silence": false, // enable trimming of silence as audio is loaded. LJSpeech (false), TWEB (false), Nancy (true).
        "trim_db": 60             // threshold for trimming silence. Set this according to your dataset.
    },
    "reinit_layers": [],
    "loss": "ge2e",               // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA).
    "grad_clip": 3.0,             // upper limit for gradient clipping.
    "epochs": 1000,               // total number of epochs to train.
    "lr": 0.0001,                 // initial learning rate. If Noam decay is active, maximum learning rate.
    "lr_decay": false,            // if true, Noam learning rate decay is applied through training.
    "warmup_steps": 4000,         // Noam decay steps to increase the learning rate from 0 to "lr".
    "tb_model_param_stats": false, // if true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
    "steps_plot_stats": 10,       // number of steps between embedding plots.
    "num_speakers_in_batch": 32,  // batch size for training. Values lower than 32 might make attention hard to learn. It is overwritten by 'gradual_training'.
    "num_loader_workers": 4,      // number of training data loader processes. Don't set it too big; 4-8 are good values.
    "wd": 0.000001,               // weight decay weight.
    "checkpoint": true,           // if true, saves checkpoints per "save_step".
    "save_step": 1000,            // number of training steps between saving training stats and checkpoints.
    "print_step": 1,              // number of steps between logging training on console.
    "output_path": "../../checkpoints/voxceleb_librispeech/speaker_encoder/", // DATASET-RELATED: output path for all training outputs.
    "model": {
        "input_dim": 40,
        "proj_dim": 256,
        "lstm_dim": 256,
        "num_lstm_layers": 3,
        "use_lstm_with_projection": false
    },
    "datasets":
        [
        {
            "name": "vctk",
            "path": "../../../datasets/VCTK-Corpus-removed-silence/",
            "meta_file_train": null,
            "meta_file_val": null
        }
        ]
}
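The config above uses `//` line comments inside JSON, which the standard `json` module cannot parse; the repository ships its own config loader (`load_config`) for this. Purely as a generic illustration of reading such commented JSON, assuming no string values contain `//`:

```python
import json
import re

def load_commented_json(path):
    """Strip '//' line comments before parsing. Simplification: this would also
    strip '//' occurring inside string values, so it is only a sketch."""
    with open(path, encoding="utf-8") as f:
        text = f.read()
    text = re.sub(r"//.*", "", text)
    return json.loads(text)

config = load_commented_json("config.json")
print(config["model"]["proj_dim"])  # -> 256
```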
@@ -9,7 +9,7 @@ class MyDataset(Dataset):
                  num_utter_per_speaker=10, skip_speakers=False, verbose=False):
         """
         Args:
-            ap (TTS.utils.AudioProcessor): audio processor object.
+            ap (mozilla_voice_tts.tts.utils.AudioProcessor): audio processor object.
             meta_data (list): list of dataset instances.
             seq_len (int): voice segment length in seconds.
             verbose (bool): print diagnostic information.

@@ -31,7 +31,7 @@ class MyDataset(Dataset):
             print(f" | > Num speakers: {len(self.speakers)}")

     def load_wav(self, filename):
-        audio = self.ap.load_wav(filename)
+        audio = self.ap.load_wav(filename, sr=self.ap.sample_rate)
         return audio

     def load_data(self, idx):
@@ -15,7 +15,7 @@ def save_checkpoint(model, optimizer, model_loss, out_path,
         'optimizer': optimizer.state_dict() if optimizer is not None else None,
         'step': current_step,
         'epoch': epoch,
-        'GE2Eloss': model_loss,
+        'loss': model_loss,
         'date': datetime.date.today().strftime("%B %d, %Y"),
     }
     torch.save(state, checkpoint_path)

@@ -29,7 +29,7 @@ def save_best_model(model, optimizer, model_loss, best_loss, out_path,
            'model': new_state_dict,
            'optimizer': optimizer.state_dict(),
            'step': current_step,
-           'GE2Eloss': model_loss,
+           'loss': model_loss,
            'date': datetime.date.today().strftime("%B %d, %Y"),
        }
        best_loss = model_loss

@@ -38,4 +38,4 @@ def save_best_model(model, optimizer, model_loss, best_loss, out_path,
         print("\n > BEST MODEL ({0:.5f}) : {1:}".format(
             model_loss, bestmodel_path))
         torch.save(state, bestmodel_path)
-    return best_loss
\ No newline at end of file
+    return best_loss
@@ -1,7 +1,7 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F

+import numpy as np

 # adapted from https://github.com/cvqluu/GE2E-Loss
 class GE2ELoss(nn.Module):

@@ -23,6 +23,8 @@ class GE2ELoss(nn.Module):
         self.b = nn.Parameter(torch.tensor(init_b))
         self.loss_method = loss_method

+        print(' > Initialised Generalized End-to-End loss')
+
         assert self.loss_method in ["softmax", "contrast"]

         if self.loss_method == "softmax":
@@ -119,3 +121,40 @@ class GE2ELoss(nn.Module):
         cos_sim_matrix = self.w * cos_sim_matrix + self.b
         L = self.embed_loss(dvecs, cos_sim_matrix)
         return L.mean()
+
+# adapted from https://github.com/clovaai/voxceleb_trainer/blob/master/loss/angleproto.py
+class AngleProtoLoss(nn.Module):
+    """
+    Implementation of the Angular Prototypical loss defined in https://arxiv.org/abs/2003.11982
+    Accepts an input of size (N, M, D)
+        where N is the number of speakers in the batch,
+        M is the number of utterances per speaker,
+        and D is the dimensionality of the embedding vector
+    Args:
+        - init_w (float): defines the initial value of w
+        - init_b (float): defines the initial value of b
+    """
+    def __init__(self, init_w=10.0, init_b=-5.0):
+        super(AngleProtoLoss, self).__init__()
+        # pylint: disable=E1102
+        self.w = nn.Parameter(torch.tensor(init_w))
+        # pylint: disable=E1102
+        self.b = nn.Parameter(torch.tensor(init_b))
+        self.criterion = torch.nn.CrossEntropyLoss()
+
+        print(' > Initialised Angular Prototypical loss')
+
+    def forward(self, x):
+        """
+        Calculates the AngleProto loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
+        """
+        out_anchor = torch.mean(x[:, 1:, :], 1)
+        out_positive = x[:, 0, :]
+        num_speakers = out_anchor.size()[0]
+
+        cos_sim_matrix = F.cosine_similarity(out_positive.unsqueeze(-1).expand(-1, -1, num_speakers), out_anchor.unsqueeze(-1).expand(-1, -1, num_speakers).transpose(0, 2))
+        torch.clamp(self.w, 1e-6)
+        cos_sim_matrix = cos_sim_matrix * self.w + self.b
+        label = torch.from_numpy(np.asarray(range(0, num_speakers))).to(cos_sim_matrix.device)
+        L = self.criterion(cos_sim_matrix, label)
+        return L
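A quick shape check of the loss added above (a usage sketch; the import path assumes the post-rename package layout):

```python
import torch
from mozilla_voice_tts.speaker_encoder.losses import AngleProtoLoss

criterion = AngleProtoLoss()

# 32 speakers per batch, 10 utterances each, 256-dim embeddings: (N, M, D)
dvecs = torch.randn(32, 10, 256)

# utterance 0 is the positive; utterances 1..M-1 form each speaker's anchor centroid
loss = criterion(dvecs)  # scalar tensor
print(loss.item())
```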
@@ -16,15 +16,33 @@ class LSTMWithProjection(nn.Module):
         o, (_, _) = self.lstm(x)
         return self.linear(o)

+class LSTMWithoutProjection(nn.Module):
+    def __init__(self, input_dim, lstm_dim, proj_dim, num_lstm_layers):
+        super().__init__()
+        self.lstm = nn.LSTM(input_size=input_dim,
+                            hidden_size=lstm_dim,
+                            num_layers=num_lstm_layers,
+                            batch_first=True)
+        self.linear = nn.Linear(lstm_dim, proj_dim, bias=True)
+        self.relu = nn.ReLU()
+
+    def forward(self, x):
+        _, (hidden, _) = self.lstm(x)
+        return self.relu(self.linear(hidden[-1]))
+
 class SpeakerEncoder(nn.Module):
-    def __init__(self, input_dim, proj_dim=256, lstm_dim=768, num_lstm_layers=3):
+    def __init__(self, input_dim, proj_dim=256, lstm_dim=768, num_lstm_layers=3, use_lstm_with_projection=True):
         super().__init__()
+        self.use_lstm_with_projection = use_lstm_with_projection
         layers = []
-        layers.append(LSTMWithProjection(input_dim, lstm_dim, proj_dim))
-        for _ in range(num_lstm_layers - 1):
-            layers.append(LSTMWithProjection(proj_dim, lstm_dim, proj_dim))
-        self.layers = nn.Sequential(*layers)
+        # choose the LSTM layer type
+        if use_lstm_with_projection:
+            layers.append(LSTMWithProjection(input_dim, lstm_dim, proj_dim))
+            for _ in range(num_lstm_layers - 1):
+                layers.append(LSTMWithProjection(proj_dim, lstm_dim, proj_dim))
+            self.layers = nn.Sequential(*layers)
+        else:
+            self.layers = LSTMWithoutProjection(input_dim, lstm_dim, proj_dim, num_lstm_layers)

         self._init_layers()

     def _init_layers(self):
@@ -37,12 +55,18 @@ class SpeakerEncoder(nn.Module):
     def forward(self, x):
         # TODO: implement state passing for lstms
         d = self.layers(x)
-        d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1)
+        if self.use_lstm_with_projection:
+            d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1)
+        else:
+            d = torch.nn.functional.normalize(d, p=2, dim=1)
         return d

     def inference(self, x):
         d = self.layers.forward(x)
-        d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1)
+        if self.use_lstm_with_projection:
+            d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1)
+        else:
+            d = torch.nn.functional.normalize(d, p=2, dim=1)
         return d

     def compute_embedding(self, x, num_frames=160, overlap=0.5):

@@ -85,4 +109,3 @@ class SpeakerEncoder(nn.Module):
             frames[cur_iter <= num_iters, :, :]
         )
         return embed / num_iters
-
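A usage sketch for the encoder with the new `use_lstm_with_projection=False` path, matching the `"model"` block of the speaker encoder config above (the import path assumes the renamed package layout):

```python
import torch
from mozilla_voice_tts.speaker_encoder.model import SpeakerEncoder

model = SpeakerEncoder(input_dim=40, proj_dim=256, lstm_dim=256,
                       num_lstm_layers=3, use_lstm_with_projection=False)

mels = torch.randn(4, 160, 40)      # (batch, frames, num_mels)
embeddings = model.inference(mels)  # (4, 256), L2-normalized over dim 1
print(embeddings.shape, embeddings.norm(dim=1))  # norms ~1.0
```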
(binary image file changed: 24 KiB before, 24 KiB after)
@@ -67,6 +67,7 @@
     "gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], // set gradual training steps [first_step, r, batch_size]. If null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceed.
     "loss_masking": true, // enable / disable loss masking against the sequence padding.
     "ga_alpha": 10.0, // weight for guided attention loss. If > 0, guided attention is enabled.
+    "apex_amp_level": null, // level of optimization with NVIDIA's apex feature for automatic mixed FP16/FP32 precision (AMP). NOTE: currently only O1 is supported; use "O1" to activate.

     // VALIDATION
     "run_eval": true,

@@ -84,8 +85,8 @@

     // TACOTRON PRENET
     "memory_size": -1, // ONLY TACOTRON - size of the memory queue used for storing the last decoder predictions for auto-regression. If < 0, the memory queue is disabled and the decoder only uses the last prediction frame.
-    "prenet_type": "bn", // "original" or "bn".
-    "prenet_dropout": false, // enable/disable dropout at prenet.
+    "prenet_type": "bn", // "original" or "bn".
+    "prenet_dropout": false, // enable/disable dropout at prenet.

     // TACOTRON ATTENTION
     "attention_type": "original", // 'original' or 'graves'
@@ -122,33 +123,35 @@
     "max_seq_len": 153, // DATASET-RELATED: maximum text length

     // PATHS
-    "output_path": "/home/erogol/Models/LJSpeech/",
+    "output_path": "../../Mozilla-TTS/vctk-test/",

     // PHONEMES
-    "phoneme_cache_path": "/media/erogol/data_ssd2/mozilla_us_phonemes_3", // phoneme computation is slow, therefore, it caches results in the given folder.
+    "phoneme_cache_path": "../../Mozilla-TTS/vctk-test/", // phoneme computation is slow, therefore, it caches results in the given folder.
     "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronunciation.
     "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages

     // MULTI-SPEAKER and GST
-    "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning.
+    "use_speaker_embedding": true, // use speaker embedding to enable multi-speaker learning.
+    "use_external_speaker_embedding_file": false, // if true, forces the model to use external embeddings per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs/1806.04558
+    "external_speaker_embedding_file": "../../speakers-vctk-en.json", // if not null and use_external_speaker_embedding_file is true, this file is loaded and its embeddings are used instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs/1806.04558
     "use_gst": true, // use global style tokens
     "gst": { // gst parameters, used if gst is enabled
         "gst_style_input": null, // Condition the style input either on a
                                  // -> wave file [path to wave] or
                                  // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15}
                                  // with the dictionary being len(dict) <= len(gst_style_tokens).
        "gst_embedding_dim": 512,
        "gst_num_heads": 4,
        "gst_style_tokens": 10
     },

     // DATASETS
     "datasets": // List of datasets. They are all merged and they get different speaker_ids.
         [
             {
-                "name": "ljspeech",
-                "path": "/home/erogol/Data/LJSpeech-1.1/",
-                "meta_file_train": "metadata.csv",
+                "name": "vctk",
+                "path": "../../../datasets/VCTK-Corpus-removed-silence/",
+                "meta_file_train": ["p225", "p234", "p238", "p245", "p248", "p261", "p294", "p302", "p326", "p335", "p347"], // for vctk: if this is a list, the listed speaker ids are excluded from training, which is useful for testing cloning with new speakers
                 "meta_file_val": null
             }
         ]
@@ -5,8 +5,8 @@ import torch
 import random
 from torch.utils.data import Dataset

-from TTS.utils.text import text_to_sequence, phoneme_to_sequence, pad_with_eos_bos
-from TTS.utils.data import prepare_data, prepare_tensor, prepare_stop_target
+from mozilla_voice_tts.tts.utils.text import text_to_sequence, phoneme_to_sequence, pad_with_eos_bos
+from mozilla_voice_tts.tts.utils.data import prepare_data, prepare_tensor, prepare_stop_target


 class MyDataset(Dataset):
@@ -24,13 +24,14 @@ class MyDataset(Dataset):
                  phoneme_cache_path=None,
                  phoneme_language="en-us",
                  enable_eos_bos=False,
+                 speaker_mapping=None,
                  verbose=False):
         """
         Args:
             outputs_per_step (int): number of time frames predicted per step.
             text_cleaner (str): text cleaner used for the dataset.
             compute_linear_spec (bool): compute linear spectrogram if True.
-            ap (TTS.utils.AudioProcessor): audio processor object.
+            ap (mozilla_voice_tts.tts.utils.AudioProcessor): audio processor object.
             meta_data (list): list of dataset instances.
             batch_group_size (int): (0) range of batch randomization after sorting
                 sequences by length.

@@ -58,6 +59,7 @@ class MyDataset(Dataset):
         self.phoneme_cache_path = phoneme_cache_path
         self.phoneme_language = phoneme_language
         self.enable_eos_bos = enable_eos_bos
+        self.speaker_mapping = speaker_mapping
         self.verbose = verbose
         if use_phonemes and not os.path.isdir(phoneme_cache_path):
             os.makedirs(phoneme_cache_path, exist_ok=True)
@@ -127,7 +129,8 @@ class MyDataset(Dataset):
             'text': text,
             'wav': wav,
             'item_idx': self.items[idx][1],
-            'speaker_name': speaker_name
+            'speaker_name': speaker_name,
+            'wav_file_name': os.path.basename(wav_file)
         }
         return sample

@@ -191,9 +194,15 @@ class MyDataset(Dataset):
                 batch[idx]['item_idx'] for idx in ids_sorted_decreasing
             ]
             text = [batch[idx]['text'] for idx in ids_sorted_decreasing]
+
             speaker_name = [batch[idx]['speaker_name']
                             for idx in ids_sorted_decreasing]
+            # get speaker embeddings
+            if self.speaker_mapping is not None:
+                wav_files_names = [batch[idx]['wav_file_name'] for idx in ids_sorted_decreasing]
+                speaker_embedding = [self.speaker_mapping[w]['embedding'] for w in wav_files_names]
+            else:
+                speaker_embedding = None
             # compute features
             mel = [self.ap.melspectrogram(w).astype('float32') for w in wav]

@@ -224,6 +233,9 @@ class MyDataset(Dataset):
             mel_lengths = torch.LongTensor(mel_lengths)
             stop_targets = torch.FloatTensor(stop_targets)

+            if speaker_embedding is not None:
+                speaker_embedding = torch.FloatTensor(speaker_embedding)
+
             # compute linear spectrogram
             if self.compute_linear_spec:
                 linear = [self.ap.spectrogram(w).astype('float32') for w in wav]
@@ -234,7 +246,7 @@ class MyDataset(Dataset):
             else:
                 linear = None
             return text, text_lenghts, speaker_name, linear, mel, mel_lengths, \
-                stop_targets, item_idxs
+                stop_targets, item_idxs, speaker_embedding

         raise TypeError(("batch must contain tensors, numbers, dicts or lists;\
                          found {}".format(type(batch[0]))))
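The collate changes above index `self.speaker_mapping` by wav file basename and read its `'embedding'` key. A sketch of the mapping file this implies (only `'embedding'` is referenced by the code above; the `'name'` field and the 3-dim vectors are illustrative placeholders — real embeddings would be e.g. 256-dim):

```python
import json

# e.g. the file passed as "external_speaker_embedding_file" in the config above
speaker_mapping = {
    "p225_001.wav": {"name": "p225", "embedding": [0.012, -0.034, 0.051]},
    "p225_002.wav": {"name": "p225", "embedding": [0.008, -0.041, 0.047]},
}
with open("speakers-vctk-en.json", "w", encoding="utf-8") as f:
    json.dump(speaker_mapping, f, indent=2)

# the collate function then effectively does:
embedding = speaker_mapping["p225_001.wav"]["embedding"]
```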
@@ -2,7 +2,7 @@ import os
 from glob import glob
 import re
 import sys
-from TTS.utils.generic_utils import split_dataset
+from mozilla_voice_tts.tts.utils.generic_utils import split_dataset


 def load_meta_data(datasets):

@@ -93,9 +93,10 @@ def mozilla_de(root_path, meta_file):

 def mailabs(root_path, meta_files=None):
     """Normalizes M-AI-Labs meta data files to TTS format"""
-    speaker_regex = re.compile("by_book/(male|female)/(?P<speaker_name>[^/]+)/")
+    speaker_regex = re.compile(
+        "by_book/(male|female)/(?P<speaker_name>[^/]+)/")
     if meta_files is None:
-        csv_files = glob(root_path+"/**/metadata.csv", recursive=True)
+        csv_files = glob(root_path + "/**/metadata.csv", recursive=True)
     else:
         csv_files = meta_files
         # meta_files = [f.strip() for f in meta_files.split(",")]
@@ -115,12 +116,15 @@ def mailabs(root_path, meta_files=None):
                 if meta_files is None:
                     wav_file = os.path.join(folder, 'wavs', cols[0] + '.wav')
                 else:
-                    wav_file = os.path.join(root_path, folder.replace("metadata.csv", ""), 'wavs', cols[0] + '.wav')
+                    wav_file = os.path.join(root_path,
+                                            folder.replace("metadata.csv", ""),
+                                            'wavs', cols[0] + '.wav')
                 if os.path.isfile(wav_file):
                     text = cols[1].strip()
                     items.append([text, wav_file, speaker_name])
                 else:
-                    raise RuntimeError("> File %s does not exist!"%(wav_file))
+                    raise RuntimeError("> File %s does not exist!" %
+                                       (wav_file))
     return items

@@ -185,7 +189,8 @@ def libri_tts(root_path, meta_files=None):
             text = cols[1]
             items.append([text, wav_file, speaker_name])
     for item in items:
-        assert os.path.exists(item[1]), f" [!] wav files don't exist - {item[1]}"
+        assert os.path.exists(
+            item[1]), f" [!] wav files don't exist - {item[1]}"
     return items


@@ -197,7 +202,8 @@ def custom_turkish(root_path, meta_file):
     with open(txt_file, 'r', encoding='utf-8') as ttf:
         for line in ttf:
             cols = line.split('|')
-            wav_file = os.path.join(root_path, 'wavs', cols[0].strip() + '.wav')
+            wav_file = os.path.join(root_path, 'wavs',
+                                    cols[0].strip() + '.wav')
             if not os.path.exists(wav_file):
                 skipped_files.append(wav_file)
                 continue
@@ -205,3 +211,44 @@ def custom_turkish(root_path, meta_file):
             items.append([text, wav_file, speaker_name])
     print(f" [!] {len(skipped_files)} files skipped. They don't exist...")
     return items
+
+
+# TODO: add the dataset link when the dataset is released publicly
+def brspeech(root_path, meta_file):
+    '''BRSpeech 3.0 beta'''
+    txt_file = os.path.join(root_path, meta_file)
+    items = []
+    with open(txt_file, 'r') as ttf:
+        for line in ttf:
+            if line.startswith("wav_filename"):
+                continue
+            cols = line.split('|')
+            #print(cols)
+            wav_file = os.path.join(root_path, cols[0])
+            text = cols[2]
+            speaker_name = cols[3]
+            items.append([text, wav_file, speaker_name])
+    return items
+
+
+def vctk(root_path, meta_files=None, wavs_path='wav48'):
+    """homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz"""
+    test_speakers = meta_files
+    items = []
+    meta_files = glob(f"{os.path.join(root_path, 'txt')}/**/*.txt",
+                      recursive=True)
+    for meta_file in meta_files:
+        _, speaker_id, txt_file = os.path.relpath(meta_file,
+                                                  root_path).split(os.sep)
+        file_id = txt_file.split('.')[0]
+        if isinstance(test_speakers,
+                      list):  # if it is a list, ignore these speaker ids
+            if speaker_id in test_speakers:
+                continue
+        with open(meta_file) as file_text:
+            text = file_text.readlines()[0]
+        wav_file = os.path.join(root_path, wavs_path, speaker_id,
+                                file_id + '.wav')
+        items.append([text, wav_file, speaker_id])
+
+    return items
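A usage sketch for the new `vctk` loader (it assumes the corpus is extracted at the given path; passing a speaker list as `meta_files` excludes those speakers, mirroring the list-valued `meta_file_train` config entry above; the import path assumes the post-rename module layout):

```python
from mozilla_voice_tts.tts.datasets.preprocess import vctk

# keep p225 and p234 out of the training items, e.g. reserved for cloning tests
items = vctk("../../../datasets/VCTK-Corpus-removed-silence/",
             meta_files=["p225", "p234"])

text, wav_file, speaker_id = items[0]
print(speaker_id, wav_file)
```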
@@ -1,6 +1,5 @@
 import torch
 from torch import nn
-from torch.autograd import Variable
 from torch.nn import functional as F


@@ -52,6 +51,7 @@ class LinearBN(nn.Module):


 class Prenet(nn.Module):
+    # pylint: disable=dangerous-default-value
     def __init__(self,
                  in_features,
                  prenet_type="original",
@@ -244,14 +244,14 @@ class OriginalAttention(nn.Module):
         self.u = (0.5 * torch.ones([B, 1])).to(inputs.device)

     def init_location_attention(self, inputs):
-        B = inputs.shape[0]
-        T = inputs.shape[1]
-        self.attention_weights_cum = Variable(inputs.data.new(B, T).zero_())
+        B = inputs.size(0)
+        T = inputs.size(1)
+        self.attention_weights_cum = torch.zeros([B, T], device=inputs.device)

     def init_states(self, inputs):
-        B = inputs.shape[0]
-        T = inputs.shape[1]
-        self.attention_weights = Variable(inputs.data.new(B, T).zero_())
+        B = inputs.size(0)
+        T = inputs.size(1)
+        self.attention_weights = torch.zeros([B, T], device=inputs.device)
         if self.location_attention:
             self.init_location_attention(inputs)
         if self.forward_attn:
@@ -300,8 +300,8 @@ class OriginalAttention(nn.Module):

     def apply_forward_attention(self, alignment):
         # forward attention
-        fwd_shifted_alpha = F.pad(self.alpha[:, :-1].clone().to(alignment.device),
-                                  (1, 0, 0, 0))
+        fwd_shifted_alpha = F.pad(
+            self.alpha[:, :-1].clone().to(alignment.device), (1, 0, 0, 0))
         # compute transition potentials
         alpha = ((1 - self.u) * self.alpha
                  + self.u * fwd_shifted_alpha

@@ -309,7 +309,7 @@ class OriginalAttention(nn.Module):
         # force incremental alignment
         if not self.training and self.forward_attn_mask:
             _, n = fwd_shifted_alpha.max(1)
-            val, n2 = alpha.max(1)
+            val, _ = alpha.max(1)
             for b in range(alignment.shape[0]):
                 alpha[b, n[b] + 3:] = 0
                 alpha[b, :(
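In equation form, the shifted-alpha update above implements the forward-attention recursion of https://arxiv.org/abs/1807.06736 (a reconstruction: the hunk cuts off before the trailing factor, which in the full code multiplies by the current raw alignment):

```latex
\alpha_t(n) \propto \big( (1 - u_t)\,\alpha_{t-1}(n) + u_t\,\alpha_{t-1}(n-1) + \epsilon \big)\, e_t(n)
```

Here $u_t$ is the transition-agent weight, $e_t$ the current alignment, and $\alpha$ is renormalized over $n$ each step; `fwd_shifted_alpha` realizes the $\alpha_{t-1}(n-1)$ term via the `F.pad` shift.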
@@ -72,7 +72,7 @@ class ReferenceEncoder(nn.Module):
         # x: 3D tensor [batch_size, post_conv_width,
         #               num_channels*post_conv_height]
         self.recurrence.flatten_parameters()
-        memory, out = self.recurrence(x)
+        _, out = self.recurrence(x)
         # out: 3D tensor [seq_len==1, batch_size, encoding_size=128]

         return out.squeeze(0)

@@ -96,7 +96,7 @@ class StyleTokenLayer(nn.Module):
         self.key_dim = embedding_dim // num_heads
         self.style_tokens = nn.Parameter(
             torch.FloatTensor(num_style_tokens, self.key_dim))
-        nn.init.orthogonal_(self.style_tokens)
+        nn.init.normal_(self.style_tokens, mean=0, std=0.5)
         self.attention = MultiHeadAttention(
             query_dim=self.query_dim,
             key_dim=self.key_dim,
@@ -2,7 +2,7 @@ import numpy as np
 import torch
 from torch import nn
 from torch.nn import functional
-from TTS.utils.generic_utils import sequence_mask
+from mozilla_voice_tts.tts.utils.generic_utils import sequence_mask


 class L1LossMasked(nn.Module):

@@ -150,7 +150,7 @@ class GuidedAttentionLoss(torch.nn.Module):

     @staticmethod
     def _make_ga_mask(ilen, olen, sigma):
-        grid_x, grid_y = torch.meshgrid(torch.arange(olen), torch.arange(ilen))
+        grid_x, grid_y = torch.meshgrid(torch.arange(olen, device=olen.device), torch.arange(ilen, device=ilen.device))
         grid_x, grid_y = grid_x.float(), grid_y.float()
         return 1.0 - torch.exp(-(grid_y / ilen - grid_x / olen) ** 2 / (2 * (sigma ** 2)))


@@ -243,4 +243,3 @@ class TacotronLoss(torch.nn.Module):

         return_dict['loss'] = loss
         return return_dict
-
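For reference, the guided-attention mask in `_make_ga_mask` above penalizes attention that strays from the text/audio diagonal; the diff only moves the `torch.arange` calls onto the input tensors' device. A standalone restatement with plain integers:

```python
import torch

def make_ga_mask(ilen, olen, sigma=0.4):
    """Soft diagonal mask: near 0 on the diagonal band, approaching 1 far from it."""
    grid_x, grid_y = torch.meshgrid(torch.arange(olen), torch.arange(ilen))
    grid_x, grid_y = grid_x.float(), grid_y.float()
    return 1.0 - torch.exp(-(grid_y / ilen - grid_x / olen) ** 2 / (2 * sigma ** 2))

mask = make_ga_mask(ilen=50, olen=200)       # shape (200, 50)
print(mask.min().item(), mask.max().item())  # ~0 on the diagonal, ~1 off it
```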
@@ -1,7 +1,7 @@
 # coding: utf-8
 import torch
 from torch import nn
-from .common_layers import Prenet, init_attn, Linear
+from .common_layers import Prenet, init_attn


 class BatchNormConv1d(nn.Module):

@@ -18,8 +18,8 @@ class BatchNormConv1d(nn.Module):
         activation: activation function set b/w Conv1d and BatchNorm

     Shapes:
-        - input: batch x dims
-        - output: batch x dims
+        - input: (B, D)
+        - output: (B, D)
     """

     def __init__(self,
@@ -46,9 +46,9 @@ class BatchNormConv1d(nn.Module):
         # self.init_layers()

     def init_layers(self):
-        if type(self.activation) == torch.nn.ReLU:
+        if isinstance(self.activation, torch.nn.ReLU):
             w_gain = 'relu'
-        elif type(self.activation) == torch.nn.Tanh:
+        elif isinstance(self.activation, torch.nn.Tanh):
             w_gain = 'tanh'
         elif self.activation is None:
             w_gain = 'linear'
@@ -67,12 +67,23 @@ class BatchNormConv1d(nn.Module):


 class Highway(nn.Module):
+    r"""Highway layers as explained in https://arxiv.org/abs/1505.00387
+
+    Args:
+        in_features (int): size of each input sample
+        out_feature (int): size of each output sample
+
+    Shapes:
+        - input: (B, *, H_in)
+        - output: (B, *, H_out)
+    """
+
     # TODO: Try GLU layer
-    def __init__(self, in_size, out_size):
+    def __init__(self, in_features, out_feature):
         super(Highway, self).__init__()
-        self.H = nn.Linear(in_size, out_size)
+        self.H = nn.Linear(in_features, out_feature)
         self.H.bias.data.zero_()
-        self.T = nn.Linear(in_size, out_size)
+        self.T = nn.Linear(in_features, out_feature)
         self.T.bias.data.fill_(-1)
         self.relu = nn.ReLU()
         self.sigmoid = nn.Sigmoid()
@@ -103,10 +114,10 @@ class CBHG(nn.Module):
         num_highways (int): number of highways layers

     Shapes:
-        - input: B x D x T_in
-        - output: B x T_in x D*2
+        - input: (B, C, T_in)
+        - output: (B, T_in, C*2)
     """

     #pylint: disable=dangerous-default-value
     def __init__(self,
                  in_features,
                  K=16,

@@ -195,6 +206,8 @@ class CBHG(nn.Module):


 class EncoderCBHG(nn.Module):
+    r"""CBHG module with Encoder specific arguments"""
+
     def __init__(self):
         super(EncoderCBHG, self).__init__()
         self.cbhg = CBHG(
@@ -211,7 +224,14 @@ class EncoderCBHG(nn.Module):


 class Encoder(nn.Module):
-    r"""Encapsulate Prenet and CBHG modules for encoder"""
+    r"""Stack Prenet and CBHG module for encoder
+    Args:
+        inputs (FloatTensor): embedding features
+
+    Shapes:
+        - inputs: (B, T, D_in)
+        - outputs: (B, T, 128 * 2)
+    """

     def __init__(self, in_features):
         super(Encoder, self).__init__()

@@ -219,14 +239,6 @@ class Encoder(nn.Module):
         self.cbhg = EncoderCBHG()

     def forward(self, inputs):
-        r"""
-        Args:
-            inputs (FloatTensor): embedding features
-
-        Shapes:
-            - inputs: batch x time x in_features
-            - outputs: batch x time x 128*2
-        """
         # B x T x prenet_dim
         outputs = self.prenet(inputs)
         outputs = self.cbhg(outputs.transpose(1, 2))
@@ -250,35 +262,48 @@ class PostCBHG(nn.Module):


 class Decoder(nn.Module):
-    """Decoder module.
+    """Tacotron decoder.

     Args:
-        in_features (int): input vector (encoder output) sample size.
-        memory_dim (int): memory vector (prev. time-step output) sample size.
-        r (int): number of outputs per time step.
+        in_channels (int): number of input channels.
+        frame_channels (int): number of feature frame channels.
+        r (int): number of outputs per time step (reduction rate).
         memory_size (int): size of the past window. if <= 0 memory_size = r
-        TODO: arguments
+        attn_type (string): type of attention used in decoder.
+        attn_windowing (bool): if true, define an attention window centered at the maximum
+            attention response. It provides more robust attention alignment, especially
+            at inference time.
+        attn_norm (string): attention normalization function. 'sigmoid' or 'softmax'.
+        prenet_type (string): 'original' or 'bn'.
+        prenet_dropout (float): prenet dropout rate.
+        forward_attn (bool): if true, use the forward attention method. https://arxiv.org/abs/1807.06736
+        trans_agent (bool): if true, use the transition agent. https://arxiv.org/abs/1807.06736
+        forward_attn_mask (bool): if true, mask attention values smaller than a threshold.
+        location_attn (bool): if true, use location sensitive attention.
+        attn_K (int): number of attention heads for GravesAttention.
+        separate_stopnet (bool): if true, detach stopnet input to prevent gradient flow.
+        speaker_embedding_dim (int): size of the speaker embedding vector, for multi-speaker training.
     """

     # Pylint gets confused by PyTorch conventions here
-    #pylint: disable=attribute-defined-outside-init
+    # pylint: disable=attribute-defined-outside-init

-    def __init__(self, in_features, memory_dim, r, memory_size, attn_type, attn_windowing,
+    def __init__(self, in_channels, frame_channels, r, memory_size, attn_type, attn_windowing,
                  attn_norm, prenet_type, prenet_dropout, forward_attn,
                  trans_agent, forward_attn_mask, location_attn, attn_K,
-                 separate_stopnet, speaker_embedding_dim):
+                 separate_stopnet):
         super(Decoder, self).__init__()
         self.r_init = r
         self.r = r
-        self.in_features = in_features
+        self.in_channels = in_channels
         self.max_decoder_steps = 500
         self.use_memory_queue = memory_size > 0
         self.memory_size = memory_size if memory_size > 0 else r
-        self.memory_dim = memory_dim
+        self.frame_channels = frame_channels
         self.separate_stopnet = separate_stopnet
         self.query_dim = 256
         # memory -> |Prenet| -> processed_memory
-        prenet_dim = memory_dim * self.memory_size + speaker_embedding_dim if self.use_memory_queue else memory_dim + speaker_embedding_dim
+        prenet_dim = frame_channels * self.memory_size if self.use_memory_queue else frame_channels
         self.prenet = Prenet(
             prenet_dim,
             prenet_type,
@@ -286,11 +311,11 @@ class Decoder(nn.Module):
             out_features=[256, 128])
         # processed_inputs, processed_memory -> |Attention| -> Attention, attention, RNN_State
         # attention_rnn generates queries for the attention mechanism
-        self.attention_rnn = nn.GRUCell(in_features + 128, self.query_dim)
+        self.attention_rnn = nn.GRUCell(in_channels + 128, self.query_dim)

         self.attention = init_attn(attn_type=attn_type,
                                    query_dim=self.query_dim,
-                                   embedding_dim=in_features,
+                                   embedding_dim=in_channels,
                                    attention_dim=128,
                                    location_attention=location_attn,
                                    attention_location_n_filters=32,

@@ -302,14 +327,14 @@ class Decoder(nn.Module):
                                    forward_attn_mask=forward_attn_mask,
                                    attn_K=attn_K)
         # (processed_memory | attention context) -> |Linear| -> decoder_RNN_input
-        self.project_to_decoder_in = nn.Linear(256 + in_features, 256)
+        self.project_to_decoder_in = nn.Linear(256 + in_channels, 256)
         # decoder_RNN_input -> |RNN| -> RNN_state
         self.decoder_rnns = nn.ModuleList(
             [nn.GRUCell(256, 256) for _ in range(2)])
         # RNN_state -> |Linear| -> mel_spec
-        self.proj_to_mel = nn.Linear(256, memory_dim * self.r_init)
+        self.proj_to_mel = nn.Linear(256, frame_channels * self.r_init)
         # learn init values instead of zero init.
-        self.stopnet = StopNet(256 + memory_dim * self.r_init)
+        self.stopnet = StopNet(256 + frame_channels * self.r_init)

     def set_r(self, new_r):
         self.r = new_r
@@ -319,9 +344,9 @@ class Decoder(nn.Module):
         Reshape the spectrograms for given 'r'
         """
         # Grouping multiple frames if necessary
-        if memory.size(-1) == self.memory_dim:
+        if memory.size(-1) == self.frame_channels:
             memory = memory.view(memory.shape[0], memory.size(1) // self.r, -1)
-        # Time first (T_decoder, B, memory_dim)
+        # Time first (T_decoder, B, frame_channels)
         memory = memory.transpose(0, 1)
         return memory

@@ -330,19 +355,18 @@ class Decoder(nn.Module):
         Initialization of decoder states
         """
         B = inputs.size(0)
-        T = inputs.size(1)
         # go frame as zeros matrix
         if self.use_memory_queue:
-            self.memory_input = torch.zeros(1, device=inputs.device).repeat(B, self.memory_dim * self.memory_size)
+            self.memory_input = torch.zeros(1, device=inputs.device).repeat(B, self.frame_channels * self.memory_size)
         else:
-            self.memory_input = torch.zeros(1, device=inputs.device).repeat(B, self.memory_dim)
+            self.memory_input = torch.zeros(1, device=inputs.device).repeat(B, self.frame_channels)
         # decoder states
         self.attention_rnn_hidden = torch.zeros(1, device=inputs.device).repeat(B, 256)
         self.decoder_rnn_hiddens = [
             torch.zeros(1, device=inputs.device).repeat(B, 256)
             for idx in range(len(self.decoder_rnns))
         ]
-        self.context_vec = inputs.data.new(B, self.in_features).zero_()
+        self.context_vec = inputs.data.new(B, self.in_channels).zero_()
         # cache attention inputs
         self.processed_inputs = self.attention.preprocess_inputs(inputs)
@@ -352,7 +376,7 @@ class Decoder(nn.Module):
         stop_tokens = torch.stack(stop_tokens).transpose(0, 1)
         outputs = torch.stack(outputs).transpose(0, 1).contiguous()
         outputs = outputs.view(
-            outputs.size(0), -1, self.memory_dim)
+            outputs.size(0), -1, self.frame_channels)
         outputs = outputs.transpose(1, 2)
         return outputs, attentions, stop_tokens

@@ -386,7 +410,7 @@ class Decoder(nn.Module):
             stop_token = self.stopnet(stopnet_input.detach())
         else:
             stop_token = self.stopnet(stopnet_input)
-        output = output[:, : self.r * self.memory_dim]
+        output = output[:, : self.r * self.frame_channels]
         return output, stop_token, self.attention.attention_weights

     def _update_memory_input(self, new_memory):
@@ -395,17 +419,17 @@ class Decoder(nn.Module):
                 # memory queue size is larger than number of frames per decoder iter
                 self.memory_input = torch.cat([
                     new_memory, self.memory_input[:, :(
-                        self.memory_size - self.r) * self.memory_dim].clone()
+                        self.memory_size - self.r) * self.frame_channels].clone()
                 ], dim=-1)
             else:
                 # memory queue size smaller than number of frames per decoder iter
-                self.memory_input = new_memory[:, :self.memory_size * self.memory_dim]
+                self.memory_input = new_memory[:, :self.memory_size * self.frame_channels]
         else:
             # use only the last frame prediction
-            # assert new_memory.shape[-1] == self.r * self.memory_dim
-            self.memory_input = new_memory[:, self.memory_dim * (self.r - 1):]
+            # assert new_memory.shape[-1] == self.r * self.frame_channels
+            self.memory_input = new_memory[:, self.frame_channels * (self.r - 1):]

-    def forward(self, inputs, memory, mask, speaker_embeddings=None):
+    def forward(self, inputs, memory, mask):
         """
         Args:
             inputs: Encoder outputs.
@@ -415,8 +439,8 @@ class Decoder(nn.Module):
             mask: Attention mask for sequence padding.

         Shapes:
-            - inputs: batch x time x encoder_out_dim
-            - memory: batch x #mel_specs x mel_spec_dim
+            - inputs: (B, T, D_out_enc)
+            - memory: (B, T_mel, D_mel)
         """
         # Run greedy decoding if memory is None
         memory = self._reshape_memory(memory)

@@ -430,8 +454,7 @@ class Decoder(nn.Module):
             if t > 0:
                 new_memory = memory[t - 1]
                 self._update_memory_input(new_memory)
-            if speaker_embeddings is not None:
-                self.memory_input = torch.cat([self.memory_input, speaker_embeddings], dim=-1)
+
             output, stop_token, attention = self.decode(inputs, mask)
             outputs += [output]
             attentions += [attention]

@@ -439,15 +462,12 @@ class Decoder(nn.Module):
             t += 1
         return self._parse_outputs(outputs, attentions, stop_tokens)

-    def inference(self, inputs, speaker_embeddings=None):
+    def inference(self, inputs):
         """
         Args:
             inputs: encoder outputs.
-            speaker_embeddings: speaker vectors.

         Shapes:
-            - inputs: batch x time x encoder_out_dim
-            - speaker_embeddings: batch x embed_dim
+            - inputs: (B, T, D_out_enc)
         """
         outputs = []
         attentions = []

@@ -460,8 +480,6 @@ class Decoder(nn.Module):
             if t > 0:
                 new_memory = outputs[-1]
                 self._update_memory_input(new_memory)
-            if speaker_embeddings is not None:
-                self.memory_input = torch.cat([self.memory_input, speaker_embeddings], dim=-1)
             output, stop_token, attention = self.decode(inputs, None)
             stop_token = torch.sigmoid(stop_token.data)
             outputs += [output]

@@ -471,14 +489,14 @@ class Decoder(nn.Module):
             if t > inputs.shape[1] / 4 and (stop_token > 0.6
                                             or attention[:, -1].item() > 0.6):
                 break
-            elif t > self.max_decoder_steps:
+            if t > self.max_decoder_steps:
                 print("   | > Decoder stopped with 'max_decoder_steps")
                 break
         return self._parse_outputs(outputs, attentions, stop_tokens)


 class StopNet(nn.Module):
-    r"""
+    r"""Stopnet signalling the decoder to stop inference.
     Args:
         in_features (int): feature dimension of input.
     """
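To make the `memory_size` queue in `_update_memory_input` above concrete, a toy restatement with hypothetical numbers (`frame_channels=1`, so each column is one frame):

```python
import torch

frame_channels, r, memory_size = 1, 2, 5
memory_input = torch.zeros(1, frame_channels * memory_size)  # go-frame queue: 5 past frames

new_memory = torch.tensor([[1.0, 2.0]])  # r=2 freshly predicted frames
# prepend the newest frames, drop the oldest; memory_size - r old frames survive
memory_input = torch.cat(
    [new_memory, memory_input[:, :(memory_size - r) * frame_channels]], dim=-1)
print(memory_input)  # tensor([[1., 2., 0., 0., 0.]])
```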
@@ -1,11 +1,24 @@
 import torch
-from torch.autograd import Variable
 from torch import nn
 from torch.nn import functional as F
 from .common_layers import init_attn, Prenet, Linear


 # NOTE: linter has a problem with the current TF release
 #pylint: disable=no-value-for-parameter
 #pylint: disable=unexpected-keyword-arg
 class ConvBNBlock(nn.Module):
+    r"""Convolutions with Batch Normalization and non-linear activation.
+
+    Args:
+        in_channels (int): number of input channels.
+        out_channels (int): number of output channels.
+        kernel_size (int): convolution kernel size.
+        activation (str): 'relu', 'tanh', None (linear).
+
+    Shapes:
+        - input: (B, C_in, T)
+        - output: (B, C_out, T)
+    """
     def __init__(self, in_channels, out_channels, kernel_size, activation=None):
         super(ConvBNBlock, self).__init__()
         assert (kernel_size - 1) % 2 == 0
@@ -32,16 +45,25 @@ class ConvBNBlock(nn.Module):


 class Postnet(nn.Module):
+    r"""Tacotron2 Postnet
+
+    Args:
+        in_out_channels (int): number of output channels.
+
+    Shapes:
+        - input: (B, C_in, T)
+        - output: (B, C_in, T)
+    """
-    def __init__(self, output_dim, num_convs=5):
+    def __init__(self, in_out_channels, num_convs=5):
         super(Postnet, self).__init__()
         self.convolutions = nn.ModuleList()
         self.convolutions.append(
-            ConvBNBlock(output_dim, 512, kernel_size=5, activation='tanh'))
+            ConvBNBlock(in_out_channels, 512, kernel_size=5, activation='tanh'))
         for _ in range(1, num_convs - 1):
             self.convolutions.append(
                 ConvBNBlock(512, 512, kernel_size=5, activation='tanh'))
         self.convolutions.append(
-            ConvBNBlock(512, output_dim, kernel_size=5, activation=None))
+            ConvBNBlock(512, in_out_channels, kernel_size=5, activation=None))

     def forward(self, x):
         o = x
@@ -51,14 +73,23 @@ class Postnet(nn.Module):


 class Encoder(nn.Module):
+    r"""Tacotron2 Encoder
+
+    Args:
+        in_out_channels (int): number of input and output channels.
+
+    Shapes:
+        - input: (B, C_in, T)
+        - output: (B, C_in, T)
+    """
-    def __init__(self, output_input_dim=512):
+    def __init__(self, in_out_channels=512):
         super(Encoder, self).__init__()
         self.convolutions = nn.ModuleList()
         for _ in range(3):
             self.convolutions.append(
-                ConvBNBlock(output_input_dim, output_input_dim, 5, 'relu'))
-        self.lstm = nn.LSTM(output_input_dim,
-                            int(output_input_dim / 2),
+                ConvBNBlock(in_out_channels, in_out_channels, 5, 'relu'))
+        self.lstm = nn.LSTM(in_out_channels,
+                            int(in_out_channels / 2),
                             num_layers=1,
                             batch_first=True,
                             bias=True,
@@ -90,20 +121,40 @@ class Encoder(nn.Module):

 # adapted from https://github.com/NVIDIA/tacotron2/
 class Decoder(nn.Module):
+    """Tacotron2 decoder. We don't use Zoneout but Dropout between RNN layers.
+
+    Args:
+        in_channels (int): number of input channels.
+        frame_channels (int): number of feature frame channels.
+        r (int): number of outputs per time step (reduction rate).
+        memory_size (int): size of the past window. if <= 0 memory_size = r
+        attn_type (string): type of attention used in decoder.
+        attn_win (bool): if true, define an attention window centered at the maximum
+            attention response. It provides more robust attention alignment, especially
+            at inference time.
+        attn_norm (string): attention normalization function. 'sigmoid' or 'softmax'.
+        prenet_type (string): 'original' or 'bn'.
+        prenet_dropout (float): prenet dropout rate.
+        forward_attn (bool): if true, use the forward attention method. https://arxiv.org/abs/1807.06736
+        trans_agent (bool): if true, use the transition agent. https://arxiv.org/abs/1807.06736
+        forward_attn_mask (bool): if true, mask attention values smaller than a threshold.
+        location_attn (bool): if true, use location sensitive attention.
+        attn_K (int): number of attention heads for GravesAttention.
+        separate_stopnet (bool): if true, detach stopnet input to prevent gradient flow.
+    """
     # Pylint gets confused by PyTorch conventions here
     #pylint: disable=attribute-defined-outside-init
-    def __init__(self, input_dim, frame_dim, r, attn_type, attn_win, attn_norm,
+    def __init__(self, in_channels, frame_channels, r, attn_type, attn_win, attn_norm,
                  prenet_type, prenet_dropout, forward_attn, trans_agent,
-                 forward_attn_mask, location_attn, attn_K, separate_stopnet,
-                 speaker_embedding_dim):
+                 forward_attn_mask, location_attn, attn_K, separate_stopnet):
         super(Decoder, self).__init__()
-        self.frame_dim = frame_dim
+        self.frame_channels = frame_channels
         self.r_init = r
         self.r = r
-        self.encoder_embedding_dim = input_dim
+        self.encoder_embedding_dim = in_channels
         self.separate_stopnet = separate_stopnet
         self.max_decoder_steps = 1000
-        self.gate_threshold = 0.5
+        self.stop_threshold = 0.5

         # model dimensions
         self.query_dim = 1024
@@ -114,20 +165,20 @@ class Decoder(nn.Module):
         self.p_decoder_dropout = 0.1

         # memory -> |Prenet| -> processed_memory
-        prenet_dim = self.frame_dim
+        prenet_dim = self.frame_channels
         self.prenet = Prenet(prenet_dim,
                              prenet_type,
                              prenet_dropout,
                              out_features=[self.prenet_dim, self.prenet_dim],
                              bias=False)

-        self.attention_rnn = nn.LSTMCell(self.prenet_dim + input_dim,
+        self.attention_rnn = nn.LSTMCell(self.prenet_dim + in_channels,
                                          self.query_dim,
                                          bias=True)

         self.attention = init_attn(attn_type=attn_type,
                                    query_dim=self.query_dim,
-                                   embedding_dim=input_dim,
+                                   embedding_dim=in_channels,
                                    attention_dim=128,
                                    location_attention=location_attn,
                                    attention_location_n_filters=32,
@@ -139,16 +190,16 @@ class Decoder(nn.Module):
                                    forward_attn_mask=forward_attn_mask,
                                    attn_K=attn_K)

-        self.decoder_rnn = nn.LSTMCell(self.query_dim + input_dim,
+        self.decoder_rnn = nn.LSTMCell(self.query_dim + in_channels,
                                        self.decoder_rnn_dim,
                                        bias=True)

-        self.linear_projection = Linear(self.decoder_rnn_dim + input_dim,
-                                        self.frame_dim * self.r_init)
+        self.linear_projection = Linear(self.decoder_rnn_dim + in_channels,
+                                        self.frame_channels * self.r_init)

         self.stopnet = nn.Sequential(
             nn.Dropout(0.1),
-            Linear(self.decoder_rnn_dim + self.frame_dim * self.r_init,
+            Linear(self.decoder_rnn_dim + self.frame_channels * self.r_init,
                    1,
                    bias=True,
                    init_gain='sigmoid'))
@@ -159,8 +210,8 @@ class Decoder(nn.Module):

     def get_go_frame(self, inputs):
         B = inputs.size(0)
-        memory = torch.zeros(1, device=inputs.device).repeat(B,
-                                                             self.frame_dim * self.r)
+        memory = torch.zeros(1, device=inputs.device).repeat(
+            B, self.frame_channels * self.r)
         return memory

     def _init_states(self, inputs, mask, keep_states=False):

@@ -186,9 +237,9 @@ class Decoder(nn.Module):
         Reshape the spectrograms for given 'r'
         """
         # Grouping multiple frames if necessary
-        if memory.size(-1) == self.frame_dim:
+        if memory.size(-1) == self.frame_channels:
             memory = memory.view(memory.shape[0], memory.size(1) // self.r, -1)
-        # Time first (T_decoder, B, frame_dim)
+        # Time first (T_decoder, B, frame_channels)
         memory = memory.transpose(0, 1)
         return memory

@@ -196,22 +247,22 @@ class Decoder(nn.Module):
         alignments = torch.stack(alignments).transpose(0, 1)
         stop_tokens = torch.stack(stop_tokens).transpose(0, 1)
         outputs = torch.stack(outputs).transpose(0, 1).contiguous()
-        outputs = outputs.view(outputs.size(0), -1, self.frame_dim)
+        outputs = outputs.view(outputs.size(0), -1, self.frame_channels)
         outputs = outputs.transpose(1, 2)
         return outputs, stop_tokens, alignments

     def _update_memory(self, memory):
         if len(memory.shape) == 2:
-            return memory[:, self.frame_dim * (self.r - 1):]
-        return memory[:, :, self.frame_dim * (self.r - 1):]
+            return memory[:, self.frame_channels * (self.r - 1):]
+        return memory[:, :, self.frame_channels * (self.r - 1):]

     def decode(self, memory):
         '''
         shapes:
-           - memory: B x r * self.frame_dim
+           - memory: B x r * self.frame_channels
         '''
         # self.context: B x D_en
-        # query_input: B x D_en + (r * self.frame_dim)
+        # query_input: B x D_en + (r * self.frame_channels)
         query_input = torch.cat((memory, self.context), -1)
         # self.query and self.attention_rnn_cell_state : B x D_attn_rnn
         self.query, self.attention_rnn_cell_state = self.attention_rnn(
@@ -234,25 +285,36 @@ class Decoder(nn.Module):
         # B x (D_decoder_rnn + D_en)
         decoder_hidden_context = torch.cat((self.decoder_hidden, self.context),
                                            dim=1)
-        # B x (self.r * self.frame_dim)
+        # B x (self.r * self.frame_channels)
         decoder_output = self.linear_projection(decoder_hidden_context)
-        # B x (D_decoder_rnn + (self.r * self.frame_dim))
+        # B x (D_decoder_rnn + (self.r * self.frame_channels))
         stopnet_input = torch.cat((self.decoder_hidden, decoder_output), dim=1)
         if self.separate_stopnet:
             stop_token = self.stopnet(stopnet_input.detach())
         else:
             stop_token = self.stopnet(stopnet_input)
         # select outputs for the reduction rate self.r
-        decoder_output = decoder_output[:, :self.r * self.frame_dim]
+        decoder_output = decoder_output[:, :self.r * self.frame_channels]
         return decoder_output, self.attention.attention_weights, stop_token

-    def forward(self, inputs, memories, mask, speaker_embeddings=None):
+    def forward(self, inputs, memories, mask):
         r"""Train Decoder with teacher forcing.
         Args:
             inputs: Encoder outputs.
             memories: Feature frames for teacher-forcing.
             mask: Attention mask for sequence padding.

+        Shapes:
+            - inputs: (B, T, D_out_enc)
+            - memory: (B, T_mel, D_mel)
+            - outputs: (B, T_mel, D_mel)
+            - alignments: (B, T_in, T_out)
+            - stop_tokens: (B, T_out)
         """
         memory = self.get_go_frame(inputs).unsqueeze(0)
         memories = self._reshape_memory(memories)
         memories = torch.cat((memory, memories), dim=0)
         memories = self._update_memory(memories)
-        if speaker_embeddings is not None:
-            memories = torch.cat([memories, speaker_embeddings], dim=-1)
         memories = self.prenet(memories)

         self._init_states(inputs, mask=mask)
@@ -270,7 +332,18 @@ class Decoder(nn.Module):
                                                    outputs, stop_tokens, alignments)
         return outputs, alignments, stop_tokens

-    def inference(self, inputs, speaker_embeddings=None):
+    def inference(self, inputs):
+        r"""Decoder inference without teacher forcing, using
+        the Stopnet to stop the decoder.
+        Args:
+            inputs: Encoder outputs.
+
+        Shapes:
+            - inputs: (B, T, D_out_enc)
+            - outputs: (B, T_mel, D_mel)
+            - alignments: (B, T_in, T_out)
+            - stop_tokens: (B, T_out)
+        """
         memory = self.get_go_frame(inputs)
         memory = self._update_memory(memory)

|
|||
outputs, stop_tokens, alignments, t = [], [], [], 0
|
||||
while True:
|
||||
memory = self.prenet(memory)
|
||||
if speaker_embeddings is not None:
|
||||
memory = torch.cat([memory, speaker_embeddings], dim=-1)
|
||||
decoder_output, alignment, stop_token = self.decode(memory)
|
||||
stop_token = torch.sigmoid(stop_token.data)
|
||||
outputs += [decoder_output.squeeze(1)]
|
||||
stop_tokens += [stop_token]
|
||||
alignments += [alignment]
|
||||
|
||||
if stop_token > 0.7 and t > inputs.shape[0] / 2:
|
||||
if stop_token > self.stop_threshold and t > inputs.shape[0] // 2:
|
||||
break
|
||||
if len(outputs) == self.max_decoder_steps:
|
||||
print(" | > Decoder stopped with 'max_decoder_steps")
|
||||
|
@ -315,7 +386,6 @@ class Decoder(nn.Module):
|
|||
self.attention.init_win_idx()
|
||||
self.attention.init_states(inputs)
|
||||
outputs, stop_tokens, alignments, t = [], [], [], 0
|
||||
stop_flags = [True, False, False]
|
||||
while True:
|
||||
memory = self.prenet(self.memory_truncated)
|
||||
decoder_output, alignment, stop_token = self.decode(memory)
|
|
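For readers skimming the diff, here is a minimal, self-contained sketch of the stopping rule the new inference loop applies; the threshold, input length, and the random stand-in for `self.decode(...)` are illustrative, not the model's configured values.

```python
import torch

# Illustrative stand-ins for self.stop_threshold and self.max_decoder_steps.
stop_threshold, max_decoder_steps, num_encoder_steps = 0.5, 10000, 120

outputs, t = [], 0
while True:
    t += 1
    stop_token = torch.sigmoid(torch.randn(1))  # stand-in for the stopnet output
    outputs.append(torch.zeros(80))             # stand-in decoder frame
    # stop once the stopnet fires, but only after half the input length
    if stop_token > stop_threshold and t > num_encoder_steps // 2:
        break
    if len(outputs) == max_decoder_steps:
        print(" | > Decoder stopped with 'max_decoder_steps'")
        break
```

The `t > num_encoder_steps // 2` guard keeps a single spurious stopnet activation from truncating the output early in the utterance.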
@@ -2,9 +2,9 @@
import torch
from torch import nn

from TTS.layers.gst_layers import GST
from TTS.layers.tacotron import Decoder, Encoder, PostCBHG
from TTS.models.tacotron_abstract import TacotronAbstract
from mozilla_voice_tts.tts.layers.gst_layers import GST
from mozilla_voice_tts.tts.layers.tacotron import Decoder, Encoder, PostCBHG
from mozilla_voice_tts.tts.models.tacotron_abstract import TacotronAbstract


class Tacotron(TacotronAbstract):

@@ -28,6 +28,9 @@ class Tacotron(TacotronAbstract):
bidirectional_decoder=False,
double_decoder_consistency=False,
ddc_r=None,
encoder_in_features=256,
decoder_in_features=256,
speaker_embedding_dim=None,
gst=False,
gst_embedding_dim=256,
gst_num_heads=4,

@@ -40,31 +43,36 @@ class Tacotron(TacotronAbstract):
forward_attn, trans_agent, forward_attn_mask,
location_attn, attn_K, separate_stopnet,
bidirectional_decoder, double_decoder_consistency,
ddc_r, gst)
decoder_in_features = 512 if num_speakers > 1 else 256
encoder_in_features = 512 if num_speakers > 1 else 256
speaker_embedding_dim = 256
proj_speaker_dim = 80 if num_speakers > 1 else 0
# base model layers
ddc_r, encoder_in_features, decoder_in_features,
speaker_embedding_dim, gst, gst_embedding_dim,
gst_num_heads, gst_style_tokens)

# speaker embedding layers
if self.num_speakers > 1:
    if not self.embeddings_per_sample:
        speaker_embedding_dim = 256
        self.speaker_embedding = nn.Embedding(self.num_speakers, speaker_embedding_dim)
        self.speaker_embedding.weight.data.normal_(0, 0.3)

# speaker and gst embeddings are concatenated into the decoder input
if self.num_speakers > 1:
    self.decoder_in_features += speaker_embedding_dim  # add speaker embedding dim

# embedding layer
self.embedding = nn.Embedding(num_chars, 256, padding_idx=0)
self.embedding.weight.data.normal_(0, 0.3)
self.encoder = Encoder(encoder_in_features)
self.decoder = Decoder(decoder_in_features, decoder_output_dim, r,

# base model layers
self.encoder = Encoder(self.encoder_in_features)
self.decoder = Decoder(self.decoder_in_features, decoder_output_dim, r,
                       memory_size, attn_type, attn_win, attn_norm,
                       prenet_type, prenet_dropout, forward_attn,
                       trans_agent, forward_attn_mask, location_attn,
                       attn_K, separate_stopnet, proj_speaker_dim)
                       attn_K, separate_stopnet)
self.postnet = PostCBHG(decoder_output_dim)
self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2,
                             postnet_output_dim)
# speaker embedding layers
if num_speakers > 1:
    self.speaker_embedding = nn.Embedding(num_speakers, speaker_embedding_dim)
    self.speaker_embedding.weight.data.normal_(0, 0.3)
    self.speaker_project_mel = nn.Sequential(
        nn.Linear(speaker_embedding_dim, proj_speaker_dim), nn.Tanh())
    self.speaker_embeddings = None
    self.speaker_embeddings_projected = None

# global style token layers
if self.gst:
    self.gst_layer = GST(num_mel=80,

@@ -77,13 +85,12 @@ class Tacotron(TacotronAbstract):
# setup DDC
if self.double_decoder_consistency:
    self.coarse_decoder = Decoder(
        decoder_in_features, decoder_output_dim, ddc_r, memory_size,
        self.decoder_in_features, decoder_output_dim, ddc_r, memory_size,
        attn_type, attn_win, attn_norm, prenet_type, prenet_dropout,
        forward_attn, trans_agent, forward_attn_mask, location_attn,
        attn_K, separate_stopnet, proj_speaker_dim)
        attn_K, separate_stopnet)


def forward(self, characters, text_lengths, mel_specs, mel_lengths=None, speaker_ids=None):
def forward(self, characters, text_lengths, mel_specs, mel_lengths=None, speaker_ids=None, speaker_embeddings=None):
    """
    Shapes:
        - characters: B x T_in

@@ -91,17 +98,9 @@ class Tacotron(TacotronAbstract):
    - mel_specs: B x T_out x D
    - speaker_ids: B x 1
    """
    self._init_states()
    input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths)
    # B x T_in x embed_dim
    inputs = self.embedding(characters)
    # B x speaker_embed_dim
    if speaker_ids is not None:
        self.compute_speaker_embedding(speaker_ids)
    if self.num_speakers > 1:
        # B x T_in x embed_dim + speaker_embed_dim
        inputs = self._concat_speaker_embedding(inputs,
                                                self.speaker_embeddings)
    # B x T_in x encoder_in_features
    encoder_outputs = self.encoder(inputs)
    # sequence masking

@@ -110,15 +109,20 @@ class Tacotron(TacotronAbstract):
if self.gst:
    # B x gst_dim
    encoder_outputs = self.compute_gst(encoder_outputs, mel_specs)
# speaker embedding
if self.num_speakers > 1:
    encoder_outputs = self._concat_speaker_embedding(
        encoder_outputs, self.speaker_embeddings)
    if not self.embeddings_per_sample:
        # B x 1 x speaker_embed_dim
        speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None]
    else:
        # B x 1 x speaker_embed_dim
        speaker_embeddings = torch.unsqueeze(speaker_embeddings, 1)
    encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings)
# decoder_outputs: B x decoder_in_features x T_out
# alignments: B x T_in x encoder_in_features
# stop_tokens: B x T_in
decoder_outputs, alignments, stop_tokens = self.decoder(
    encoder_outputs, mel_specs, input_mask,
    self.speaker_embeddings_projected)
    encoder_outputs, mel_specs, input_mask)
# sequence masking
if output_mask is not None:
    decoder_outputs = decoder_outputs * output_mask.unsqueeze(1).expand_as(decoder_outputs)

@@ -140,22 +144,22 @@ class Tacotron(TacotronAbstract):
return decoder_outputs, postnet_outputs, alignments, stop_tokens

@torch.no_grad()
def inference(self, characters, speaker_ids=None, style_mel=None):
def inference(self, characters, speaker_ids=None, style_mel=None, speaker_embeddings=None):
    inputs = self.embedding(characters)
    self._init_states()
    if speaker_ids is not None:
        self.compute_speaker_embedding(speaker_ids)
    if self.num_speakers > 1:
        inputs = self._concat_speaker_embedding(inputs,
                                                self.speaker_embeddings)
    encoder_outputs = self.encoder(inputs)
    if self.gst and style_mel is not None:
    if self.gst:
        # B x gst_dim
        encoder_outputs = self.compute_gst(encoder_outputs, style_mel)
    if self.num_speakers > 1:
        encoder_outputs = self._concat_speaker_embedding(
            encoder_outputs, self.speaker_embeddings)
        if not self.embeddings_per_sample:
            # B x 1 x speaker_embed_dim
            speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None]
        else:
            # B x 1 x speaker_embed_dim
            speaker_embeddings = torch.unsqueeze(speaker_embeddings, 1)
        encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings)
    decoder_outputs, alignments, stop_tokens = self.decoder.inference(
        encoder_outputs, self.speaker_embeddings_projected)
        encoder_outputs)
    postnet_outputs = self.postnet(decoder_outputs)
    postnet_outputs = self.last_linear(postnet_outputs)
    decoder_outputs = decoder_outputs.transpose(1, 2)
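The diff does not show `_concat_speaker_embedding` itself, but the shape arithmetic it implies (`decoder_in_features += speaker_embedding_dim`) is simple to verify. A hedged sketch of that concatenation, with illustrative dimensions:

```python
import torch

B, T_in, enc_dim, spk_dim = 2, 50, 256, 256
encoder_outputs = torch.rand(B, T_in, enc_dim)
speaker_embeddings = torch.rand(B, 1, spk_dim)   # one vector per utterance

# expand the per-utterance vector over time and concatenate on the feature axis,
# which is what the decoder-input dimension bump accounts for
speaker_embeddings = speaker_embeddings.expand(B, T_in, spk_dim)
decoder_inputs = torch.cat([encoder_outputs, speaker_embeddings], dim=-1)
print(decoder_inputs.shape)  # torch.Size([2, 50, 512])
```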
@@ -1,10 +1,9 @@
import torch
from torch import nn

from TTS.layers.gst_layers import GST
from TTS.layers.tacotron2 import Decoder, Encoder, Postnet
from TTS.models.tacotron_abstract import TacotronAbstract

from mozilla_voice_tts.tts.layers.gst_layers import GST
from mozilla_voice_tts.tts.layers.tacotron2 import Decoder, Encoder, Postnet
from mozilla_voice_tts.tts.models.tacotron_abstract import TacotronAbstract

# TODO: match function arguments with tacotron
class Tacotron2(TacotronAbstract):

@@ -28,6 +27,9 @@ class Tacotron2(TacotronAbstract):
bidirectional_decoder=False,
double_decoder_consistency=False,
ddc_r=None,
encoder_in_features=512,
decoder_in_features=512,
speaker_embedding_dim=None,
gst=False,
gst_embedding_dim=512,
gst_num_heads=4,

@@ -39,46 +41,48 @@ class Tacotron2(TacotronAbstract):
forward_attn, trans_agent, forward_attn_mask,
location_attn, attn_K, separate_stopnet,
bidirectional_decoder, double_decoder_consistency,
ddc_r, gst)
ddc_r, encoder_in_features, decoder_in_features,
speaker_embedding_dim, gst, gst_embedding_dim,
gst_num_heads, gst_style_tokens)

# init layer dims
speaker_embedding_dim = 512 if num_speakers > 1 else 0
gst_embedding_dim = gst_embedding_dim if self.gst else 0
decoder_in_features = 512+speaker_embedding_dim+gst_embedding_dim
encoder_in_features = 512 if num_speakers > 1 else 512
proj_speaker_dim = 80 if num_speakers > 1 else 0
# speaker embedding layer
if self.num_speakers > 1:
    if not self.embeddings_per_sample:
        speaker_embedding_dim = 512
        self.speaker_embedding = nn.Embedding(self.num_speakers, speaker_embedding_dim)
        self.speaker_embedding.weight.data.normal_(0, 0.3)

# speaker and gst embeddings are concatenated into the decoder input
if self.num_speakers > 1:
    self.decoder_in_features += speaker_embedding_dim  # add speaker embedding dim

# embedding layer
self.embedding = nn.Embedding(num_chars, 512, padding_idx=0)

# speaker embedding layer
if num_speakers > 1:
    self.speaker_embedding = nn.Embedding(num_speakers, speaker_embedding_dim)
    self.speaker_embedding.weight.data.normal_(0, 0.3)

self.encoder = Encoder(encoder_in_features)
self.decoder = Decoder(decoder_in_features, self.decoder_output_dim, r, attn_type, attn_win,
# base model layers
self.encoder = Encoder(self.encoder_in_features)
self.decoder = Decoder(self.decoder_in_features, self.decoder_output_dim, r, attn_type, attn_win,
                       attn_norm, prenet_type, prenet_dropout,
                       forward_attn, trans_agent, forward_attn_mask,
                       location_attn, attn_K, separate_stopnet, proj_speaker_dim)
                       location_attn, attn_K, separate_stopnet)
self.postnet = Postnet(self.postnet_output_dim)

# global style token layers
if self.gst:
    self.gst_layer = GST(num_mel=80,
                         num_heads=gst_num_heads,
                         num_style_tokens=gst_style_tokens,
                         embedding_dim=gst_embedding_dim)
                         num_heads=self.gst_num_heads,
                         num_style_tokens=self.gst_style_tokens,
                         embedding_dim=self.gst_embedding_dim)
# backward pass decoder
if self.bidirectional_decoder:
    self._init_backward_decoder()
# setup DDC
if self.double_decoder_consistency:
    self.coarse_decoder = Decoder(
        decoder_in_features, self.decoder_output_dim, ddc_r, attn_type,
        self.decoder_in_features, self.decoder_output_dim, ddc_r, attn_type,
        attn_win, attn_norm, prenet_type, prenet_dropout, forward_attn,
        trans_agent, forward_attn_mask, location_attn, attn_K,
        separate_stopnet, proj_speaker_dim)
        separate_stopnet)

@staticmethod
def shape_outputs(mel_outputs, mel_outputs_postnet, alignments):

@@ -86,25 +90,7 @@ class Tacotron2(TacotronAbstract):
mel_outputs_postnet = mel_outputs_postnet.transpose(1, 2)
return mel_outputs, mel_outputs_postnet, alignments

def compute_gst(self, inputs, style_input):
    """ Compute global style token """
    device = inputs.device
    if isinstance(style_input, dict):
        query = torch.zeros(1, 1, self.gst_embedding_dim//2).to(device)
        _GST = torch.tanh(self.gst_layer.style_token_layer.style_tokens)
        gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device)
        for k_token, v_amplifier in style_input.items():
            key = _GST[int(k_token)].unsqueeze(0).expand(1, -1, -1)
            gst_outputs_att = self.gst_layer.style_token_layer.attention(query, key)
            gst_outputs = gst_outputs + gst_outputs_att * v_amplifier
    elif style_input is None:
        gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device)
    else:
        gst_outputs = self.gst_layer(style_input)  # pylint: disable=not-callable
    embedded_gst = gst_outputs.repeat(1, inputs.size(1), 1)
    return inputs, embedded_gst

def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, speaker_ids=None):
def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, speaker_ids=None, speaker_embeddings=None):
    # compute mask for padding
    # B x T_in_max (boolean)
    input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths)

@@ -113,20 +99,18 @@ class Tacotron2(TacotronAbstract):
# B x T_in_max x D_en
encoder_outputs = self.encoder(embedded_inputs, text_lengths)

if self.gst:
    # B x gst_dim
    encoder_outputs = self.compute_gst(encoder_outputs, mel_specs)

if self.num_speakers > 1:
    embedded_speakers = self.speaker_embedding(speaker_ids)[:, None]
    embedded_speakers = embedded_speakers.repeat(1, encoder_outputs.size(1), 1)
    if self.gst:
        # B x gst_dim
        encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, mel_specs)
        encoder_outputs = torch.cat([encoder_outputs, embedded_gst, embedded_speakers], dim=-1)
    if not self.embeddings_per_sample:
        # B x 1 x speaker_embed_dim
        speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None]
    else:
        encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1)
else:
    if self.gst:
        # B x gst_dim
        encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, mel_specs)
        encoder_outputs = torch.cat([encoder_outputs, embedded_gst], dim=-1)
        # B x 1 x speaker_embed_dim
        speaker_embeddings = torch.unsqueeze(speaker_embeddings, 1)
    encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings)

encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs)


@@ -154,24 +138,18 @@ class Tacotron2(TacotronAbstract):
return decoder_outputs, postnet_outputs, alignments, stop_tokens

@torch.no_grad()
def inference(self, text, speaker_ids=None, style_mel=None):
def inference(self, text, speaker_ids=None, style_mel=None, speaker_embeddings=None):
    embedded_inputs = self.embedding(text).transpose(1, 2)
    encoder_outputs = self.encoder.inference(embedded_inputs)

    if self.gst:
        # B x gst_dim
        encoder_outputs = self.compute_gst(encoder_outputs, style_mel)

    if self.num_speakers > 1:
        embedded_speakers = self.speaker_embedding(speaker_ids)[:, None]
        embedded_speakers = embedded_speakers.repeat(1, encoder_outputs.size(1), 1)
        if self.gst:
            # B x gst_dim
            encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, style_mel)
            encoder_outputs = torch.cat([encoder_outputs, embedded_gst, embedded_speakers], dim=-1)
        else:
            encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1)
    else:
        if self.gst:
            # B x gst_dim
            encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, style_mel)
            encoder_outputs = torch.cat([encoder_outputs, embedded_gst], dim=-1)
        if not self.embeddings_per_sample:
            speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None]
        encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings)

    decoder_outputs, alignments, stop_tokens = self.decoder.inference(
        encoder_outputs)

@@ -181,27 +159,21 @@ class Tacotron2(TacotronAbstract):
    decoder_outputs, postnet_outputs, alignments)
return decoder_outputs, postnet_outputs, alignments, stop_tokens

def inference_truncated(self, text, speaker_ids=None, style_mel=None):
def inference_truncated(self, text, speaker_ids=None, style_mel=None, speaker_embeddings=None):
    """
    Preserve model states for continuous inference
    """
    embedded_inputs = self.embedding(text).transpose(1, 2)
    encoder_outputs = self.encoder.inference_truncated(embedded_inputs)

    if self.gst:
        # B x gst_dim
        encoder_outputs = self.compute_gst(encoder_outputs, style_mel)

    if self.num_speakers > 1:
        embedded_speakers = self.speaker_embedding(speaker_ids)[:, None]
        embedded_speakers = embedded_speakers.repeat(1, encoder_outputs.size(1), 1)
        if self.gst:
            # B x gst_dim
            encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, style_mel)
            encoder_outputs = torch.cat([encoder_outputs, embedded_gst, embedded_speakers], dim=-1)
        else:
            encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1)
    else:
        if self.gst:
            # B x gst_dim
            encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, style_mel)
            encoder_outputs = torch.cat([encoder_outputs, embedded_gst], dim=-1)
        if not self.embeddings_per_sample:
            speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None]
        encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings)

    mel_outputs, alignments, stop_tokens = self.decoder.inference_truncated(
        encoder_outputs)
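The two speaker paths this change introduces are worth seeing side by side: a learned lookup table when `embeddings_per_sample` is off, and an external per-sample vector (e.g. a d-vector from a speaker encoder) when it is on. A hedged sketch with illustrative dimensions; both branches end with the same `B x 1 x speaker_embed_dim` shape:

```python
import torch

embeddings_per_sample = True        # illustrative switch
speaker_embedding_dim = 512

if not embeddings_per_sample:
    # learned lookup table indexed by speaker id
    speaker_embedding = torch.nn.Embedding(10, speaker_embedding_dim)
    speaker_embeddings = speaker_embedding(torch.tensor([3]))[:, None]
else:
    # external per-utterance vector supplied by the caller
    external = torch.rand(1, speaker_embedding_dim)
    speaker_embeddings = torch.unsqueeze(external, 1)

print(speaker_embeddings.shape)  # torch.Size([1, 1, 512]) in both branches
```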
@@ -4,7 +4,7 @@ from abc import ABC, abstractmethod
import torch
from torch import nn

from TTS.utils.generic_utils import sequence_mask
from mozilla_voice_tts.tts.utils.generic_utils import sequence_mask


class TacotronAbstract(ABC, nn.Module):

@@ -28,6 +28,9 @@ class TacotronAbstract(ABC, nn.Module):
bidirectional_decoder=False,
double_decoder_consistency=False,
ddc_r=None,
encoder_in_features=512,
decoder_in_features=512,
speaker_embedding_dim=None,
gst=False,
gst_embedding_dim=512,
gst_num_heads=4,

@@ -57,6 +60,9 @@ class TacotronAbstract(ABC, nn.Module):
self.location_attn = location_attn
self.attn_K = attn_K
self.separate_stopnet = separate_stopnet
self.encoder_in_features = encoder_in_features
self.decoder_in_features = decoder_in_features
self.speaker_embedding_dim = speaker_embedding_dim

# layers
self.embedding = None

@@ -64,8 +70,17 @@ class TacotronAbstract(ABC, nn.Module):
self.decoder = None
self.postnet = None

# multispeaker
if self.speaker_embedding_dim is None:
    # if speaker_embedding_dim is None, use nn.Embedding with the default speaker_embedding_dim
    self.embeddings_per_sample = False
else:
    # if speaker_embedding_dim is given, use an external speaker embedding per sample
    self.embeddings_per_sample = True

# global style token
if self.gst:
    self.decoder_in_features += gst_embedding_dim  # add gst embedding dim
    self.gst_layer = None

# model states

@@ -164,11 +179,22 @@ class TacotronAbstract(ABC, nn.Module):
self.speaker_embeddings_projected = self.speaker_project_mel(
    self.speaker_embeddings).squeeze(1)

def compute_gst(self, inputs, mel_specs):
def compute_gst(self, inputs, style_input):
    """ Compute global style token """
    # pylint: disable=not-callable
    gst_outputs = self.gst_layer(mel_specs)
    inputs = self._add_speaker_embedding(inputs, gst_outputs)
    device = inputs.device
    if isinstance(style_input, dict):
        query = torch.zeros(1, 1, self.gst_embedding_dim//2).to(device)
        _GST = torch.tanh(self.gst_layer.style_token_layer.style_tokens)
        gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device)
        for k_token, v_amplifier in style_input.items():
            key = _GST[int(k_token)].unsqueeze(0).expand(1, -1, -1)
            gst_outputs_att = self.gst_layer.style_token_layer.attention(query, key)
            gst_outputs = gst_outputs + gst_outputs_att * v_amplifier
    elif style_input is None:
        gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device)
    else:
        gst_outputs = self.gst_layer(style_input)  # pylint: disable=not-callable
    inputs = self._concat_speaker_embedding(inputs, gst_outputs)
    return inputs

@staticmethod
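The dict branch of `compute_gst` lets a caller mix style tokens directly, mapping a token index to an "amplifier" weight. A hedged sketch of just that weighting arithmetic with plain tensors; the attention call is replaced by the raw token here, and all dimensions are illustrative:

```python
import torch

gst_embedding_dim, num_style_tokens = 512, 10
style_tokens = torch.tanh(torch.rand(num_style_tokens, gst_embedding_dim))

# style_input maps token index -> amplifier weight; negative weights subtract a style
style_input = {'0': 0.3, '3': -0.1}
gst_outputs = torch.zeros(1, 1, gst_embedding_dim)
for k_token, v_amplifier in style_input.items():
    token = style_tokens[int(k_token)].view(1, 1, -1)  # stand-in for the attention output
    gst_outputs = gst_outputs + token * v_amplifier
print(gst_outputs.shape)  # torch.Size([1, 1, 512])
```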
@@ -3,6 +3,9 @@ from tensorflow import keras
from tensorflow.python.ops import math_ops
# from tensorflow_addons.seq2seq import BahdanauAttention

# NOTE: linter has a problem with the current TF release
#pylint: disable=no-value-for-parameter
#pylint: disable=unexpected-keyword-arg

class Linear(keras.layers.Layer):
    def __init__(self, units, use_bias, **kwargs):

@@ -1,10 +1,12 @@
import tensorflow as tf
from tensorflow import keras
from TTS.tf.utils.tf_utils import shape_list
from TTS.tf.layers.common_layers import Prenet, Attention
from mozilla_voice_tts.tts.tf.utils.tf_utils import shape_list
from mozilla_voice_tts.tts.tf.layers.common_layers import Prenet, Attention
# from tensorflow_addons.seq2seq import AttentionWrapper


# NOTE: linter has a problem with the current TF release
#pylint: disable=no-value-for-parameter
#pylint: disable=unexpected-keyword-arg
class ConvBNBlock(keras.layers.Layer):
    def __init__(self, filters, kernel_size, activation, **kwargs):
        super(ConvBNBlock, self).__init__(**kwargs)

@@ -1,11 +1,11 @@
import tensorflow as tf
from tensorflow import keras

from TTS.tf.layers.tacotron2 import Encoder, Decoder, Postnet
from TTS.tf.utils.tf_utils import shape_list
from mozilla_voice_tts.tts.tf.layers.tacotron2 import Encoder, Decoder, Postnet
from mozilla_voice_tts.tts.tf.utils.tf_utils import shape_list


#pylint: disable=too-many-ancestors
#pylint: disable=too-many-ancestors, abstract-method
class Tacotron2(keras.models.Model):
    def __init__(self,
                 num_chars,

@@ -105,4 +105,3 @@ class Tacotron2(keras.models.Model):
# TODO: issue https://github.com/PyCQA/pylint/issues/3613
input_ids = tf.random.uniform(shape=[1, 4], maxval=10, dtype=tf.int32)  #pylint: disable=unexpected-keyword-arg
self(input_ids)


@@ -1,6 +1,9 @@
import numpy as np
import tensorflow as tf

# NOTE: linter has a problem with the current TF release
#pylint: disable=no-value-for-parameter
#pylint: disable=unexpected-keyword-arg

def tf_create_dummy_inputs():
    """ Create dummy inputs for TF Tacotron2 model """

@@ -1,4 +1,3 @@
import os
import datetime
import importlib
import pickle

@@ -78,7 +77,7 @@ def count_parameters(model, c):

def setup_model(num_chars, num_speakers, c, enable_tflite=False):
    print(" > Using model: {}".format(c.model))
    MyModel = importlib.import_module('TTS.tf.models.' + c.model.lower())
    MyModel = importlib.import_module('mozilla_voice_tts.tts.tf.models.' + c.model.lower())
    MyModel = getattr(MyModel, c.model)
    if c.model.lower() in "tacotron":
        raise NotImplementedError(' [!] Tacotron model is not ready.')

@@ -39,4 +39,3 @@ def load_tflite_model(tflite_path):
tflite_model = tf.lite.Interpreter(model_path=tflite_path)
tflite_model.allocate_tensors()
return tflite_model


@@ -28,4 +28,4 @@ def convert_tacotron2_to_tflite(model,
def load_tflite_model(tflite_path):
    tflite_model = tf.lite.Interpreter(model_path=tflite_path)
    tflite_model.allocate_tensors()
    return tflite_model
    return tflite_model
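To make the TFLite path concrete, here is a hedged sketch of driving the converted interpreter with the standard `tf.lite.Interpreter` API; the model path, input shape, and character ids are illustrative.

```python
import numpy as np
import tensorflow as tf

# Load the converted model and allocate its tensors (mirrors load_tflite_model above).
interpreter = tf.lite.Interpreter(model_path="tacotron2.tflite")
interpreter.allocate_tensors()

input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Feed a dummy character-id sequence and fetch the first output tensor.
input_ids = np.random.randint(0, 10, size=(1, 4)).astype(np.int32)
interpreter.resize_tensor_input(input_details[0]['index'], input_ids.shape)
interpreter.allocate_tensors()
interpreter.set_tensor(input_details[0]['index'], input_ids)
interpreter.invoke()
outputs = interpreter.get_tensor(output_details[0]['index'])
```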
@@ -74,4 +74,3 @@ class StandardScaler():
X *= self.scale_
X += self.mean_
return X


@@ -1,15 +1,11 @@
# edited from https://github.com/fastai/imagenet-fast/blob/master/imagenet_nv/distributed.py
import os, sys
import math
import time
import subprocess
import argparse

import torch
import torch.distributed as dist
from torch.utils.data.sampler import Sampler
from torch.autograd import Variable
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
from TTS.utils.generic_utils import create_experiment_folder
from torch.autograd import Variable
from torch.utils.data.sampler import Sampler


class DistributedSampler(Sampler):

@@ -108,7 +104,7 @@ def apply_gradient_allreduce(module):
for param in list(module.parameters()):

    def allreduce_hook(*_):
        Variable._execution_engine.queue_callback(allreduce_params)
        Variable._execution_engine.queue_callback(allreduce_params)  #pylint: disable=protected-access

    if param.requires_grad:
        param.register_hook(allreduce_hook)

@@ -118,61 +114,3 @@ def apply_gradient_allreduce(module):

module.register_forward_hook(set_needs_reduction)
return module


def main():
    """
    Call train.py as a new process and pass command arguments
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--continue_path',
        type=str,
        help='Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.',
        default='',
        required='--config_path' not in sys.argv)
    parser.add_argument(
        '--restore_path',
        type=str,
        help='Model file to be restored. Use to finetune a model.',
        default='')
    parser.add_argument(
        '--config_path',
        type=str,
        help='Path to config file for training.',
        required='--continue_path' not in sys.argv
    )
    args = parser.parse_args()

    # OUT_PATH = create_experiment_folder(CONFIG.output_path, CONFIG.run_name,
    #                                     True)
    # stdout_path = os.path.join(OUT_PATH, "process_stdout/")

    num_gpus = torch.cuda.device_count()
    group_id = time.strftime("%Y_%m_%d-%H%M%S")

    # set arguments for train.py
    command = ['train.py']
    command.append('--continue_path={}'.format(args.continue_path))
    command.append('--restore_path={}'.format(args.restore_path))
    command.append('--config_path={}'.format(args.config_path))
    command.append('--group_id=group_{}'.format(group_id))
    command.append('')

    # run processes
    processes = []
    for i in range(num_gpus):
        my_env = os.environ.copy()
        my_env["PYTHON_EGG_CACHE"] = "/tmp/tmp{}".format(i)
        command[-1] = '--rank={}'.format(i)
        stdout = None if i == 0 else open(os.devnull, 'w')
        p = subprocess.Popen(['python3'] + command, stdout=stdout, env=my_env)
        processes.append(p)
        print(command)

    for p in processes:
        p.wait()


if __name__ == '__main__':
    main()
@@ -0,0 +1,268 @@
import torch
import importlib
import numpy as np
from collections import Counter

from mozilla_voice_tts.utils.generic_utils import check_argument


def split_dataset(items):
    is_multi_speaker = False
    speakers = [item[-1] for item in items]
    is_multi_speaker = len(set(speakers)) > 1
    eval_split_size = 500 if len(items) * 0.01 > 500 else int(
        len(items) * 0.01)
    assert eval_split_size > 0, " [!] You do not have enough samples to train. You need at least 100 samples."
    np.random.seed(0)
    np.random.shuffle(items)
    if is_multi_speaker:
        items_eval = []
        # TODO: fix this inefficient speaker-balancing loop
        while len(items_eval) < eval_split_size:
            speakers = [item[-1] for item in items]
            speaker_counter = Counter(speakers)
            item_idx = np.random.randint(0, len(items))
            if speaker_counter[items[item_idx][-1]] > 1:
                items_eval.append(items[item_idx])
                del items[item_idx]
        return items_eval, items
    return items[:eval_split_size], items[eval_split_size:]


# from https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1
def sequence_mask(sequence_length, max_len=None):
    if max_len is None:
        max_len = sequence_length.data.max()
    batch_size = sequence_length.size(0)
    seq_range = torch.arange(0, max_len).long()
    seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len)
    if sequence_length.is_cuda:
        seq_range_expand = seq_range_expand.to(sequence_length.device)
    seq_length_expand = (
        sequence_length.unsqueeze(1).expand_as(seq_range_expand))
    # B x T_max
    return seq_range_expand < seq_length_expand


def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None):
    print(" > Using model: {}".format(c.model))
    MyModel = importlib.import_module('mozilla_voice_tts.tts.models.' + c.model.lower())
    MyModel = getattr(MyModel, c.model)
    if c.model.lower() in "tacotron":
        model = MyModel(num_chars=num_chars,
                        num_speakers=num_speakers,
                        r=c.r,
                        postnet_output_dim=int(c.audio['fft_size'] / 2 + 1),
                        decoder_output_dim=c.audio['num_mels'],
                        gst=c.use_gst,
                        gst_embedding_dim=c.gst['gst_embedding_dim'],
                        gst_num_heads=c.gst['gst_num_heads'],
                        gst_style_tokens=c.gst['gst_style_tokens'],
                        memory_size=c.memory_size,
                        attn_type=c.attention_type,
                        attn_win=c.windowing,
                        attn_norm=c.attention_norm,
                        prenet_type=c.prenet_type,
                        prenet_dropout=c.prenet_dropout,
                        forward_attn=c.use_forward_attn,
                        trans_agent=c.transition_agent,
                        forward_attn_mask=c.forward_attn_mask,
                        location_attn=c.location_attn,
                        attn_K=c.attention_heads,
                        separate_stopnet=c.separate_stopnet,
                        bidirectional_decoder=c.bidirectional_decoder,
                        double_decoder_consistency=c.double_decoder_consistency,
                        ddc_r=c.ddc_r,
                        speaker_embedding_dim=speaker_embedding_dim)
    elif c.model.lower() == "tacotron2":
        model = MyModel(num_chars=num_chars,
                        num_speakers=num_speakers,
                        r=c.r,
                        postnet_output_dim=c.audio['num_mels'],
                        decoder_output_dim=c.audio['num_mels'],
                        gst=c.use_gst,
                        gst_embedding_dim=c.gst['gst_embedding_dim'],
                        gst_num_heads=c.gst['gst_num_heads'],
                        gst_style_tokens=c.gst['gst_style_tokens'],
                        attn_type=c.attention_type,
                        attn_win=c.windowing,
                        attn_norm=c.attention_norm,
                        prenet_type=c.prenet_type,
                        prenet_dropout=c.prenet_dropout,
                        forward_attn=c.use_forward_attn,
                        trans_agent=c.transition_agent,
                        forward_attn_mask=c.forward_attn_mask,
                        location_attn=c.location_attn,
                        attn_K=c.attention_heads,
                        separate_stopnet=c.separate_stopnet,
                        bidirectional_decoder=c.bidirectional_decoder,
                        double_decoder_consistency=c.double_decoder_consistency,
                        ddc_r=c.ddc_r,
                        speaker_embedding_dim=speaker_embedding_dim)
    return model


class KeepAverage():
    def __init__(self):
        self.avg_values = {}
        self.iters = {}

    def __getitem__(self, key):
        return self.avg_values[key]

    def items(self):
        return self.avg_values.items()

    def add_value(self, name, init_val=0, init_iter=0):
        self.avg_values[name] = init_val
        self.iters[name] = init_iter

    def update_value(self, name, value, weighted_avg=False):
        if name not in self.avg_values:
            # add the value if it does not exist yet
            self.add_value(name, init_val=value)
        else:
            # otherwise update the existing value
            if weighted_avg:
                self.avg_values[name] = 0.99 * self.avg_values[name] + 0.01 * value
                self.iters[name] += 1
            else:
                self.avg_values[name] = self.avg_values[name] * \
                    self.iters[name] + value
                self.iters[name] += 1
                self.avg_values[name] /= self.iters[name]

    def add_values(self, name_dict):
        for key, value in name_dict.items():
            self.add_value(key, init_val=value)

    def update_values(self, value_dict):
        for key, value in value_dict.items():
            self.update_value(key, value)


def check_config(c):
    check_argument('model', c, enum_list=['tacotron', 'tacotron2'], restricted=True, val_type=str)
    check_argument('run_name', c, restricted=True, val_type=str)
    check_argument('run_description', c, val_type=str)

    # AUDIO
    check_argument('audio', c, restricted=True, val_type=dict)

    # audio processing parameters
    check_argument('num_mels', c['audio'], restricted=True, val_type=int, min_val=10, max_val=2056)
    check_argument('fft_size', c['audio'], restricted=True, val_type=int, min_val=128, max_val=4058)
    check_argument('sample_rate', c['audio'], restricted=True, val_type=int, min_val=512, max_val=100000)
    check_argument('frame_length_ms', c['audio'], restricted=True, val_type=float, min_val=10, max_val=1000, alternative='win_length')
    check_argument('frame_shift_ms', c['audio'], restricted=True, val_type=float, min_val=1, max_val=1000, alternative='hop_length')
    check_argument('preemphasis', c['audio'], restricted=True, val_type=float, min_val=0, max_val=1)
    check_argument('min_level_db', c['audio'], restricted=True, val_type=int, min_val=-1000, max_val=10)
    check_argument('ref_level_db', c['audio'], restricted=True, val_type=int, min_val=0, max_val=1000)
    check_argument('power', c['audio'], restricted=True, val_type=float, min_val=1, max_val=5)
    check_argument('griffin_lim_iters', c['audio'], restricted=True, val_type=int, min_val=10, max_val=1000)

    # vocabulary parameters
    check_argument('characters', c, restricted=False, val_type=dict)
    check_argument('pad', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)
    check_argument('eos', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)
    check_argument('bos', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)
    check_argument('characters', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)
    check_argument('phonemes', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)
    check_argument('punctuations', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str)

    # normalization parameters
    check_argument('signal_norm', c['audio'], restricted=True, val_type=bool)
    check_argument('symmetric_norm', c['audio'], restricted=True, val_type=bool)
    check_argument('max_norm', c['audio'], restricted=True, val_type=float, min_val=0.1, max_val=1000)
    check_argument('clip_norm', c['audio'], restricted=True, val_type=bool)
    check_argument('mel_fmin', c['audio'], restricted=True, val_type=float, min_val=0.0, max_val=1000)
    check_argument('mel_fmax', c['audio'], restricted=True, val_type=float, min_val=500.0)
    check_argument('spec_gain', c['audio'], restricted=True, val_type=[int, float], min_val=1, max_val=100)
    check_argument('do_trim_silence', c['audio'], restricted=True, val_type=bool)
    check_argument('trim_db', c['audio'], restricted=True, val_type=int)

    # training parameters
    check_argument('batch_size', c, restricted=True, val_type=int, min_val=1)
    check_argument('eval_batch_size', c, restricted=True, val_type=int, min_val=1)
    check_argument('r', c, restricted=True, val_type=int, min_val=1)
    check_argument('gradual_training', c, restricted=False, val_type=list)
    check_argument('loss_masking', c, restricted=True, val_type=bool)
    check_argument('apex_amp_level', c, restricted=False, val_type=str)
    # check_argument('grad_accum', c, restricted=True, val_type=int, min_val=1, max_val=100)

    # validation parameters
    check_argument('run_eval', c, restricted=True, val_type=bool)
    check_argument('test_delay_epochs', c, restricted=True, val_type=int, min_val=0)
    check_argument('test_sentences_file', c, restricted=False, val_type=str)

    # optimizer
    check_argument('noam_schedule', c, restricted=False, val_type=bool)
    check_argument('grad_clip', c, restricted=True, val_type=float, min_val=0.0)
    check_argument('epochs', c, restricted=True, val_type=int, min_val=1)
    check_argument('lr', c, restricted=True, val_type=float, min_val=0)
    check_argument('wd', c, restricted=True, val_type=float, min_val=0)
    check_argument('warmup_steps', c, restricted=True, val_type=int, min_val=0)
    check_argument('seq_len_norm', c, restricted=True, val_type=bool)

    # tacotron prenet
    check_argument('memory_size', c, restricted=True, val_type=int, min_val=-1)
    check_argument('prenet_type', c, restricted=True, val_type=str, enum_list=['original', 'bn'])
    check_argument('prenet_dropout', c, restricted=True, val_type=bool)

    # attention
    check_argument('attention_type', c, restricted=True, val_type=str, enum_list=['graves', 'original'])
    check_argument('attention_heads', c, restricted=True, val_type=int)
    check_argument('attention_norm', c, restricted=True, val_type=str, enum_list=['sigmoid', 'softmax'])
    check_argument('windowing', c, restricted=True, val_type=bool)
    check_argument('use_forward_attn', c, restricted=True, val_type=bool)
    check_argument('forward_attn_mask', c, restricted=True, val_type=bool)
    check_argument('transition_agent', c, restricted=True, val_type=bool)
    check_argument('location_attn', c, restricted=True, val_type=bool)
    check_argument('bidirectional_decoder', c, restricted=True, val_type=bool)
    check_argument('double_decoder_consistency', c, restricted=True, val_type=bool)
    check_argument('ddc_r', c, restricted='double_decoder_consistency' in c.keys(), min_val=1, max_val=7, val_type=int)

    # stopnet
    check_argument('stopnet', c, restricted=True, val_type=bool)
    check_argument('separate_stopnet', c, restricted=True, val_type=bool)

    # tensorboard
    check_argument('print_step', c, restricted=True, val_type=int, min_val=1)
    check_argument('tb_plot_step', c, restricted=True, val_type=int, min_val=1)
    check_argument('save_step', c, restricted=True, val_type=int, min_val=1)
    check_argument('checkpoint', c, restricted=True, val_type=bool)
    check_argument('tb_model_param_stats', c, restricted=True, val_type=bool)

    # dataloading
    # pylint: disable=import-outside-toplevel
    from mozilla_voice_tts.tts.utils.text import cleaners
    check_argument('text_cleaner', c, restricted=True, val_type=str, enum_list=dir(cleaners))
    check_argument('enable_eos_bos_chars', c, restricted=True, val_type=bool)
    check_argument('num_loader_workers', c, restricted=True, val_type=int, min_val=0)
    check_argument('num_val_loader_workers', c, restricted=True, val_type=int, min_val=0)
    check_argument('batch_group_size', c, restricted=True, val_type=int, min_val=0)
    check_argument('min_seq_len', c, restricted=True, val_type=int, min_val=0)
    check_argument('max_seq_len', c, restricted=True, val_type=int, min_val=10)

    # paths
    check_argument('output_path', c, restricted=True, val_type=str)

    # multi-speaker and gst
    check_argument('use_speaker_embedding', c, restricted=True, val_type=bool)
    check_argument('use_external_speaker_embedding_file', c, restricted=True, val_type=bool)
    check_argument('external_speaker_embedding_file', c, restricted=True, val_type=str)
    check_argument('use_gst', c, restricted=True, val_type=bool)
    check_argument('gst', c, restricted=True, val_type=dict)
    check_argument('gst_style_input', c['gst'], restricted=True, val_type=[str, dict])
    check_argument('gst_embedding_dim', c['gst'], restricted=True, val_type=int, min_val=0, max_val=1000)
    check_argument('gst_num_heads', c['gst'], restricted=True, val_type=int, min_val=2, max_val=10)
    check_argument('gst_style_tokens', c['gst'], restricted=True, val_type=int, min_val=1, max_val=1000)

    # datasets - checking only the first entry
    check_argument('datasets', c, restricted=True, val_type=list)
    for dataset_entry in c['datasets']:
        check_argument('name', dataset_entry, restricted=True, val_type=str)
        check_argument('path', dataset_entry, restricted=True, val_type=str)
        check_argument('meta_file_train', dataset_entry, restricted=True, val_type=[str, list])
        check_argument('meta_file_val', dataset_entry, restricted=True, val_type=str)
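To make the running-average bookkeeping in `KeepAverage` concrete, here is a small usage sketch; the loss name and values are illustrative. Each unweighted update computes the cumulative mean `avg = (avg * n + value) / (n + 1)`.

```python
keep_avg = KeepAverage()
keep_avg.add_value('avg_loss', init_val=0)

for value in [1.0, 0.5, 0.25]:
    # cumulative mean over all updates so far
    keep_avg.update_value('avg_loss', value)

print(keep_avg['avg_loss'])  # ~0.583 after the three updates
```

With `weighted_avg=True` the same call instead keeps an exponential moving average (`0.99 * old + 0.01 * new`), which reacts slowly to spikes and is the usual choice for noisy per-step losses.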
@@ -1,44 +1,13 @@
import os
import json
import re
import torch
import datetime


class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self


def load_config(config_path):
    config = AttrDict()
    with open(config_path, "r") as f:
        input_str = f.read()
    input_str = re.sub(r'\\\n', '', input_str)
    input_str = re.sub(r'//.*\n', '\n', input_str)
    data = json.loads(input_str)
    config.update(data)
    return config


def copy_config_file(config_file, out_path, new_fields):
    config_lines = open(config_file, "r").readlines()
    # add extra information fields
    for key, value in new_fields.items():
        if isinstance(value, str):
            new_line = '"{}":"{}",\n'.format(key, value)
        else:
            new_line = '"{}":{},\n'.format(key, value)
        config_lines.insert(1, new_line)
    config_out_file = open(out_path, "w")
    config_out_file.writelines(config_lines)
    config_out_file.close()


def load_checkpoint(model, checkpoint_path, use_cuda=False):
def load_checkpoint(model, checkpoint_path, amp=None, use_cuda=False):
    state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    if amp and 'amp' in state:
        amp.load_state_dict(state['amp'])
    if use_cuda:
        model.cuda()
    # set model stepsize

@@ -47,7 +16,7 @@ def load_checkpoint(model, checkpoint_path, use_cuda=False):
return model, state


def save_model(model, optimizer, current_step, epoch, r, output_path, **kwargs):
def save_model(model, optimizer, current_step, epoch, r, output_path, amp_state_dict=None, **kwargs):
    new_state_dict = model.state_dict()
    state = {
        'model': new_state_dict,

@@ -57,6 +26,8 @@ def save_model(model, optimizer, current_step, epoch, r, output_path, **kwargs):
    'date': datetime.date.today().strftime("%B %d, %Y"),
    'r': r
}
if amp_state_dict:
    state['amp'] = amp_state_dict
state.update(kwargs)
torch.save(state, output_path)
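A hedged round trip through these helpers, to show what the checkpoint file carries; the `torch.nn.Linear` stand-in, file name, and step numbers are illustrative, and only the keys shown in the diff (`model`, `date`, `r`, plus `amp` when given) are assumed.

```python
import torch

model = torch.nn.Linear(10, 10)  # stand-in for a TTS model
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# save_model stores the weights plus training metadata in one file
save_model(model, optimizer, current_step=1000, epoch=5, r=2,
           output_path='checkpoint_1000.pth.tar')

# the stored dict can be inspected or restored directly
state = torch.load('checkpoint_1000.pth.tar', map_location='cpu')
model.load_state_dict(state['model'])
print(state['r'], state['date'])  # 2 and the save date
```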
@@ -1,6 +1,3 @@
import torch


def alignment_diagonal_score(alignments, binary=False):
    """
    Compute how diagonal alignment predictions are. It is useful

@@ -1,8 +1,6 @@
import os
import json

from TTS.datasets.preprocess import get_preprocessor_by_name


def make_speakers_json_path(out_path):
    """Returns conventional speakers.json location."""

@@ -12,12 +10,15 @@ def make_speakers_json_path(out_path):
def load_speaker_mapping(out_path):
    """Loads speaker mapping if already present."""
    try:
        with open(make_speakers_json_path(out_path)) as f:
        if os.path.splitext(out_path)[1] == '.json':
            json_file = out_path
        else:
            json_file = make_speakers_json_path(out_path)
        with open(json_file) as f:
            return json.load(f)
    except FileNotFoundError:
        return {}


def save_speaker_mapping(out_path, speaker_mapping):
    """Saves speaker mapping if not yet present."""
    speakers_json_path = make_speakers_json_path(out_path)
@@ -39,23 +39,23 @@ def numpy_to_tf(np_array, dtype):

def compute_style_mel(style_wav, ap, cuda=False):
    style_mel = torch.FloatTensor(ap.melspectrogram(
        ap.load_wav(style_wav))).unsqueeze(0)
        ap.load_wav(style_wav, sr=ap.sample_rate))).unsqueeze(0)
    if cuda:
        return style_mel.cuda()
    return style_mel


def run_model_torch(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None):
def run_model_torch(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None, speaker_embeddings=None):
    if CONFIG.use_gst:
        decoder_output, postnet_output, alignments, stop_tokens = model.inference(
            inputs, style_mel=style_mel, speaker_ids=speaker_id)
            inputs, style_mel=style_mel, speaker_ids=speaker_id, speaker_embeddings=speaker_embeddings)
    else:
        if truncated:
            decoder_output, postnet_output, alignments, stop_tokens = model.inference_truncated(
                inputs, speaker_ids=speaker_id)
                inputs, speaker_ids=speaker_id, speaker_embeddings=speaker_embeddings)
        else:
            decoder_output, postnet_output, alignments, stop_tokens = model.inference(
                inputs, speaker_ids=speaker_id)
                inputs, speaker_ids=speaker_id, speaker_embeddings=speaker_embeddings)
    return decoder_output, postnet_output, alignments, stop_tokens


@@ -140,6 +140,15 @@ def id_to_torch(speaker_id, cuda=False):
return speaker_id


def embedding_to_torch(speaker_embedding, cuda=False):
    if speaker_embedding is not None:
        speaker_embedding = np.asarray(speaker_embedding)
        speaker_embedding = torch.from_numpy(speaker_embedding).unsqueeze(0).type(torch.FloatTensor)
    if cuda:
        return speaker_embedding.cuda()
    return speaker_embedding


# TODO: perform GL with pytorch for batching
def apply_griffin_lim(inputs, input_lens, CONFIG, ap):
    '''Apply griffin-lim to each sample, iterating through the first dimension.

@@ -169,15 +178,16 @@ def synthesis(model,
enable_eos_bos_chars=False,  #pylint: disable=unused-argument
use_griffin_lim=False,
do_trim_silence=False,
speaker_embedding=None,
backend='torch'):
    """Synthesize voice for the given text.

    Args:
        model (TTS.models): model to synthesize.
        model (mozilla_voice_tts.tts.models): model to synthesize.
        text (str): target text
        CONFIG (dict): config dictionary to be loaded from config.json.
        use_cuda (bool): enable cuda.
        ap (TTS.utils.audio.AudioProcessor): audio processor to process
        ap (mozilla_voice_tts.tts.utils.audio.AudioProcessor): audio processor to process
            model outputs.
        speaker_id (int): id of speaker
        style_wav (str): Used for style embedding of GST.

@@ -200,6 +210,10 @@ def synthesis(model,
if backend == 'torch':
    if speaker_id is not None:
        speaker_id = id_to_torch(speaker_id, cuda=use_cuda)

    if speaker_embedding is not None:
        speaker_embedding = embedding_to_torch(speaker_embedding, cuda=use_cuda)

    if not isinstance(style_mel, dict):
        style_mel = numpy_to_torch(style_mel, torch.float, cuda=use_cuda)
    inputs = numpy_to_torch(inputs, torch.long, cuda=use_cuda)

@@ -216,7 +230,7 @@ def synthesis(model,
# synthesize voice
if backend == 'torch':
    decoder_output, postnet_output, alignments, stop_tokens = run_model_torch(
        model, inputs, CONFIG, truncated, speaker_id, style_mel)
        model, inputs, CONFIG, truncated, speaker_id, style_mel, speaker_embeddings=speaker_embedding)
    postnet_output, decoder_output, alignment, stop_tokens = parse_outputs_torch(
        postnet_output, decoder_output, alignments, stop_tokens)
elif backend == 'tf':
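The new `embedding_to_torch` helper is the bridge between an external embedding (a plain list or array, e.g. loaded from an external speaker embedding file) and the tensor `synthesis()` forwards to the model. A hedged sketch; the 256-dim vector is illustrative:

```python
import numpy as np

# e.g. a d-vector read from an external speaker embedding file
speaker_embedding = np.random.rand(256).tolist()

embedding_tensor = embedding_to_torch(speaker_embedding, cuda=False)
print(embedding_tensor.shape)  # torch.Size([1, 256]) -- batch dim added by unsqueeze(0)
```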
@@ -4,10 +4,11 @@ import re
from packaging import version
import phonemizer
from phonemizer.phonemize import phonemize
from TTS.utils.text import cleaners
from TTS.utils.text.symbols import make_symbols, symbols, phonemes, _phoneme_punctuations, _bos, \
from mozilla_voice_tts.tts.utils.text import cleaners
from mozilla_voice_tts.tts.utils.text.symbols import make_symbols, symbols, phonemes, _phoneme_punctuations, _bos, \
    _eos

# pylint: disable=unnecessary-comprehension
# Mappings from symbol to numeric ID and vice versa:
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
_id_to_symbol = {i: s for i, s in enumerate(symbols)}

@@ -67,15 +67,16 @@ def remove_aux_symbols(text):
text = re.sub(r'[\<\>\(\)\[\]\"]+', '', text)
return text


def replace_symbols(text):
def replace_symbols(text, lang='en'):
    text = text.replace(';', ',')
    text = text.replace('-', ' ')
    text = text.replace(':', ',')
    text = text.replace('&', 'and')
    text = text.replace(':', ' ')
    if lang == 'en':
        text = text.replace('&', 'and')
    elif lang == 'pt':
        text = text.replace('&', ' e ')
    return text


def basic_cleaners(text):
    '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
    text = lowercase(text)

@@ -106,7 +107,6 @@ def basic_turkish_cleaners(text):
text = collapse_whitespace(text)
return text

def english_cleaners(text):
    '''Pipeline for English text, including number and abbreviation expansion.'''
    text = convert_to_ascii(text)

@@ -118,6 +118,14 @@ def english_cleaners(text):
text = collapse_whitespace(text)
return text

def portuguese_cleaners(text):
    '''Basic pipeline for Portuguese text. There is no need to expand abbreviations
    and numbers; the phonemizer already does that.'''
    text = lowercase(text)
    text = replace_symbols(text, lang='pt')
    text = remove_aux_symbols(text)
    text = collapse_whitespace(text)
    return text

def phoneme_cleaners(text):
    '''Pipeline for phonemes mode, including number and abbreviation expansion.'''

@@ -31,14 +31,13 @@ def _expand_dollars(m):
dollar_unit = 'dollar' if dollars == 1 else 'dollars'
cent_unit = 'cent' if cents == 1 else 'cents'
return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
elif dollars:
if dollars:
    dollar_unit = 'dollar' if dollars == 1 else 'dollars'
    return '%s %s' % (dollars, dollar_unit)
elif cents:
if cents:
    cent_unit = 'cent' if cents == 1 else 'cents'
    return '%s %s' % (cents, cent_unit)
else:
    return 'zero dollars'
return 'zero dollars'


def _expand_ordinal(m):
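A quick, hedged illustration of the new language-aware cleaner; the sample sentence is made up, and the exact output depends on the other cleaner helpers shown above.

```python
sample = "Olá; o custo é 10 & 20 -- teste"
print(portuguese_cleaners(sample))
# lowercases, maps '&' to ' e ', drops auxiliary symbols, and collapses whitespace
```

The design choice here is that number and abbreviation expansion stay out of the Portuguese pipeline because the phonemizer handles them, so the cleaner only normalizes symbols.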
@@ -3,10 +3,10 @@ import librosa
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from TTS.utils.text import phoneme_to_sequence, sequence_to_phoneme
from mozilla_voice_tts.tts.utils.text import phoneme_to_sequence, sequence_to_phoneme


def plot_alignment(alignment, info=None, fig_size=(16, 10), title=None):
def plot_alignment(alignment, info=None, fig_size=(16, 10), title=None, output_fig=False):
    if isinstance(alignment, torch.Tensor):
        alignment_ = alignment.detach().cpu().numpy().squeeze()
    else:

@@ -24,10 +24,12 @@ def plot_alignment(alignment, info=None, fig_size=(16, 10), title=None):
plt.tight_layout()
if title is not None:
    plt.title(title)
if not output_fig:
    plt.close()
return fig


def plot_spectrogram(spectrogram, ap=None, fig_size=(16, 10)):
def plot_spectrogram(spectrogram, ap=None, fig_size=(16, 10), output_fig=False):
    if isinstance(spectrogram, torch.Tensor):
        spectrogram_ = spectrogram.detach().cpu().numpy().squeeze().T
    else:

@@ -38,10 +40,12 @@ def plot_spectrogram(spectrogram, ap=None, fig_size=(16, 10)):
plt.imshow(spectrogram_, aspect="auto", origin="lower")
plt.colorbar()
plt.tight_layout()
if not output_fig:
    plt.close()
return fig


def visualize(alignment, postnet_output, stop_tokens, text, hop_length, CONFIG, decoder_output=None, output_path=None, figsize=(8, 24)):
def visualize(alignment, postnet_output, stop_tokens, text, hop_length, CONFIG, decoder_output=None, output_path=None, figsize=(8, 24), output_fig=False):
    if decoder_output is not None:
        num_plot = 4
    else:

@@ -91,3 +95,6 @@ def visualize(alignment, postnet_output, stop_tokens, text, hop_length, CONFIG,
print(output_path)
fig.savefig(output_path)
plt.close()

if not output_fig:
    plt.close()
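A hedged example of the new `output_fig` flag: keep the Matplotlib figure open for further use (e.g. in a notebook) instead of letting the helper close it; the random spectrogram and file name are illustrative.

```python
import numpy as np

mel = np.random.rand(80, 120)            # illustrative spectrogram, num_mels x frames
fig = plot_spectrogram(mel, output_fig=True)
fig.savefig('mel.png')                   # figure is still alive because output_fig=True
```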
@@ -1,10 +1,10 @@
import librosa
import soundfile as sf
import numpy as np
import scipy.io
import scipy.io.wavfile
import scipy.signal

from TTS.utils.data import StandardScaler
from mozilla_voice_tts.tts.utils.data import StandardScaler


class AudioProcessor(object):

@@ -52,7 +52,7 @@ class AudioProcessor(object):
self.mel_fmin = mel_fmin or 0
self.mel_fmax = mel_fmax
self.spec_gain = float(spec_gain)
self.stft_pad_mode = 'reflect'
self.stft_pad_mode = stft_pad_mode
self.max_norm = 1.0 if max_norm is None else float(max_norm)
self.clip_norm = clip_norm
self.do_trim_silence = do_trim_silence

@@ -123,7 +123,7 @@ class AudioProcessor(object):
if self.symmetric_norm:
    S_norm = ((2 * self.max_norm) * S_norm) - self.max_norm
    if self.clip_norm:
        S_norm = np.clip(S_norm, -self.max_norm, self.max_norm)
        S_norm = np.clip(S_norm, -self.max_norm, self.max_norm)  # pylint: disable=invalid-unary-operand-type
    return S_norm
else:
    S_norm = self.max_norm * S_norm

@@ -148,7 +148,7 @@ class AudioProcessor(object):
raise RuntimeError(' [!] Mean-Var stats do not match the given feature dimensions.')
if self.symmetric_norm:
    if self.clip_norm:
        S_denorm = np.clip(S_denorm, -self.max_norm, self.max_norm)
        S_denorm = np.clip(S_denorm, -self.max_norm, self.max_norm)  #pylint: disable=invalid-unary-operand-type
    S_denorm = ((S_denorm + self.max_norm) * -self.min_level_db / (2 * self.max_norm)) + self.min_level_db
    return S_denorm + self.ref_level_db
else:
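A quick numeric check of the symmetric denormalization line above, with illustrative values `max_norm=4`, `min_level_db=-100`, and `ref_level_db=20`: the normalized range `[-max_norm, max_norm]` maps linearly back to `[min_level_db, 0]` before the reference level is added.

```python
import numpy as np

max_norm, min_level_db, ref_level_db = 4.0, -100.0, 20.0
S_norm = np.array([-4.0, 0.0, 4.0])  # normalized input spanning the full range
S_db = ((S_norm + max_norm) * -min_level_db / (2 * max_norm)) + min_level_db
print(S_db + ref_level_db)           # [-80. -30.  20.]
```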
@@ -1,5 +1,5 @@
import datetime
from TTS.utils.io import AttrDict
from mozilla_voice_tts.utils.io import AttrDict


tcolors = AttrDict({

@@ -15,8 +15,8 @@ tcolors = AttrDict({


class ConsoleLogger():
    # TODO: merge this with TTS ConsoleLogger
    def __init__(self):
        # TODO: color code for value changes
        # use these to compare values between iterations
        self.old_train_loss_dict = None
        self.old_epoch_loss_dict = None

@@ -35,8 +35,7 @@ class ConsoleLogger():
    def print_train_start(self):
        print(f"\n{tcolors.BOLD} > TRAINING ({self.get_time()}) {tcolors.ENDC}")

    def print_train_step(self, batch_steps, step, global_step,
                         step_time, loader_time, lrG, lrD,
    def print_train_step(self, batch_steps, step, global_step, log_dict,
                         loss_dict, avg_loss_dict):
        indent = " | > "
        print()

@@ -48,7 +47,13 @@ class ConsoleLogger():
                log_text += "{}{}: {:.5f} ({:.5f})\n".format(indent, key, value, avg_loss_dict[f'avg_{key}'])
            else:
                log_text += "{}{}: {:.5f} \n".format(indent, key, value)
        log_text += f"{indent}step_time: {step_time:.2f}\n{indent}loader_time: {loader_time:.2f}\n{indent}lrG: {lrG}\n{indent}lrD: {lrD}"
        for idx, (key, value) in enumerate(log_dict.items()):
            if isinstance(value, list):
                log_text += f"{indent}{key}: {value[0]:.{value[1]}f}"
            else:
                log_text += f"{indent}{key}: {value}"
            if idx < len(log_dict)-1:
                log_text += "\n"
        print(log_text, flush=True)

    # pylint: disable=unused-argument
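The new `print_train_step` signature replaces the fixed `step_time`/`loader_time`/`lrG`/`lrD` arguments with a free-form `log_dict`. Judging from the format string above, a list value encodes `[value, print_precision]`, while plain values are printed as-is. A hedged sketch of the expected call (all values illustrative):

```python
logger = ConsoleLogger()
logger.print_train_step(
    batch_steps=120, step=35, global_step=14035,
    log_dict={"step_time": [0.42, 2],      # [value, print precision]
              "loader_time": [0.01, 2],
              "current_lr": 1e-4},         # printed without formatting
    loss_dict={"G_loss": 1.234, "D_loss": 0.567},
    avg_loss_dict={"avg_G_loss": 1.301, "avg_D_loss": 0.601})
```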
@@ -0,0 +1,156 @@
import os
import glob
import shutil
import datetime
import subprocess


def get_git_branch():
    try:
        out = subprocess.check_output(["git", "branch"]).decode("utf8")
        current = next(line for line in out.split("\n")
                       if line.startswith("*"))
        current = current.replace("* ", "")  # assign the result; str.replace() does not mutate in place
    except subprocess.CalledProcessError:
        current = "inside_docker"
    return current


def get_commit_hash():
    """https://stackoverflow.com/questions/14989858/get-the-current-git-hash-in-a-python-script"""
    # try:
    #     subprocess.check_output(['git', 'diff-index', '--quiet',
    #                              'HEAD'])  # Verify client is clean
    # except:
    #     raise RuntimeError(
    #         " !! Commit before training to get the commit hash.")
    try:
        commit = subprocess.check_output(
            ['git', 'rev-parse', '--short', 'HEAD']).decode().strip()
    # Not copying .git folder into docker container
    except subprocess.CalledProcessError:
        commit = "0000000"
    print(' > Git Hash: {}'.format(commit))
    return commit


def create_experiment_folder(root_path, model_name, debug):
    """ Create a folder with the current date and time """
    date_str = datetime.datetime.now().strftime("%B-%d-%Y_%I+%M%p")
    if debug:
        commit_hash = 'debug'
    else:
        commit_hash = get_commit_hash()
    output_folder = os.path.join(
        root_path, model_name + '-' + date_str + '-' + commit_hash)
    os.makedirs(output_folder, exist_ok=True)
    print(" > Experiment folder: {}".format(output_folder))
    return output_folder


def remove_experiment_folder(experiment_path):
    """Check folder if there is a checkpoint, otherwise remove the folder"""

    checkpoint_files = glob.glob(experiment_path + "/*.pth.tar")
    if not checkpoint_files:
        if os.path.exists(experiment_path):
            shutil.rmtree(experiment_path, ignore_errors=True)
            print(" ! Run is removed from {}".format(experiment_path))
    else:
        print(" ! Run is kept in {}".format(experiment_path))
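
# Usage sketch (illustrative path): create a run folder, then clean it up
# again; since no '*.pth.tar' checkpoint was written into it, it is deleted.
# run_folder = create_experiment_folder('/tmp/runs', 'melgan', debug=True)
# remove_experiment_folder(run_folder)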


def count_parameters(model):
    r"""Count number of trainable parameters in a network"""
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


def set_init_dict(model_dict, checkpoint_state, c):
    # Partial initialization: if there is a mismatch with new and old layer, it is skipped.
    for k, v in checkpoint_state.items():
        if k not in model_dict:
            print(" | > Layer missing in the model definition: {}".format(k))
    # 1. filter out unnecessary keys
    pretrained_dict = {
        k: v
        for k, v in checkpoint_state.items() if k in model_dict
    }
    # 2. filter out different size layers
    pretrained_dict = {
        k: v
        for k, v in pretrained_dict.items()
        if v.numel() == model_dict[k].numel()
    }
    # 3. skip reinit layers
    if c.reinit_layers is not None:
        for reinit_layer_name in c.reinit_layers:
            pretrained_dict = {
                k: v
                for k, v in pretrained_dict.items()
                if reinit_layer_name not in k
            }
    # 4. overwrite entries in the existing state dict
    model_dict.update(pretrained_dict)
    print(" | > {} / {} layers are restored.".format(len(pretrained_dict),
                                                     len(model_dict)))
    return model_dict
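
# Usage sketch for partial checkpoint restore (assumes `import torch`; the
# tensors and the 'reinit_layers' value are illustrative). Missing layers,
# size mismatches and 'reinit_layers' matches are skipped; the rest is copied.
# _model = torch.nn.Linear(4, 2)
# _ckpt = {'weight': torch.zeros(2, 4), 'bias': torch.zeros(2), 'stale': torch.zeros(3)}
# _c = type('C', (), {'reinit_layers': ['bias']})()
# _model.load_state_dict(set_init_dict(_model.state_dict(), _ckpt, _c))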


class KeepAverage():
    def __init__(self):
        self.avg_values = {}
        self.iters = {}

    def __getitem__(self, key):
        return self.avg_values[key]

    def items(self):
        return self.avg_values.items()

    def add_value(self, name, init_val=0, init_iter=0):
        self.avg_values[name] = init_val
        self.iters[name] = init_iter

    def update_value(self, name, value, weighted_avg=False):
        if name not in self.avg_values:
            # add the value if it does not exist yet
            self.add_value(name, init_val=value)
        else:
            # else update the existing value
            if weighted_avg:
                self.avg_values[name] = 0.99 * self.avg_values[name] + 0.01 * value
                self.iters[name] += 1
            else:
                self.avg_values[name] = self.avg_values[name] * \
                    self.iters[name] + value
                self.iters[name] += 1
                self.avg_values[name] /= self.iters[name]

    def add_values(self, name_dict):
        for key, value in name_dict.items():
            self.add_value(key, init_val=value)

    def update_values(self, value_dict):
        for key, value in value_dict.items():
            self.update_value(key, value)
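
# Usage sketch: cumulative running averages over training iterations.
# averages = KeepAverage()
# averages.add_values({'loss': 0.0})
# averages.update_values({'loss': 1.0})
# averages.update_values({'loss': 0.5})
# averages['loss']   # -> 0.75, the mean of the two updates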


def check_argument(name, c, enum_list=None, max_val=None, min_val=None, restricted=False, val_type=None, alternative=None):
    if alternative in c.keys() and c[alternative] is not None:
        return
    if restricted:
        assert name in c.keys(), f' [!] {name} not defined in config.json'
    if name in c.keys():
        if max_val:
            assert c[name] <= max_val, f' [!] {name} is larger than max value {max_val}'
        if min_val:
            assert c[name] >= min_val, f' [!] {name} is smaller than min value {min_val}'
        if enum_list:
            assert c[name].lower() in enum_list, f' [!] {name} is not a valid value'
        if isinstance(val_type, list):
            is_valid = False
            for typ in val_type:
                if isinstance(c[name], typ):
                    is_valid = True
            assert is_valid or c[name] is None, f' [!] {name} has wrong type - {type(c[name])} vs {val_type}'
        elif val_type:
            assert isinstance(c[name], val_type) or c[name] is None, f' [!] {name} has wrong type - {type(c[name])} vs {val_type}'
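A hedged sketch of how `check_argument` is meant to be used on a loaded config (the keys and bounds here are illustrative, not the project's canonical checks):

```python
c = {'num_mels': 80, 'sample_rate': 22050}

check_argument('num_mels', c, restricted=True, val_type=int, min_val=10, max_val=2056)
check_argument('sample_rate', c, restricted=True, val_type=int, min_val=512, max_val=100000)
# 'mel_fmax' is absent, but the check passes because the alternative key exists:
check_argument('mel_fmax', c, val_type=float, alternative='sample_rate')
```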
@@ -0,0 +1,32 @@
import re
import json


class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self


def load_config(config_path):
    config = AttrDict()
    with open(config_path, "r") as f:
        input_str = f.read()
    input_str = re.sub(r'\\\n', '', input_str)
    input_str = re.sub(r'//.*\n', '\n', input_str)
    data = json.loads(input_str)
    config.update(data)
    return config
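
# Usage sketch: load_config() strips '//' comments and '\'-style line
# continuations before json.loads(), which is why the commented config files
# below parse cleanly (and why they escape URLs as "tcp:\/\/..." so the
# comment regex does not eat them).
# cfg = load_config('config.json')   # illustrative path
# cfg.num_mels                       # AttrDict allows attribute access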


def copy_config_file(config_file, out_path, new_fields):
    config_lines = open(config_file, "r").readlines()
    # add extra information fields
    for key, value in new_fields.items():
        if isinstance(value, str):
            new_line = '"{}":"{}",\n'.format(key, value)
        else:
            new_line = '"{}":{},\n'.format(key, value)
        config_lines.insert(1, new_line)
    config_out_file = open(out_path, "w")
    config_out_file.writelines(config_lines)
    config_out_file.close()
@@ -2,7 +2,7 @@

import math
import torch
from torch.optim.optimizer import Optimizer, required
from torch.optim.optimizer import Optimizer


class RAdam(Optimizer):

@@ -25,7 +25,7 @@ class RAdam(Optimizer):
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, buffer=[[None, None, None] for _ in range(10)])
        super(RAdam, self).__init__(params, defaults)

    def __setstate__(self, state):
    def __setstate__(self, state):  # pylint: disable=useless-super-delegation
        super(RAdam, self).__setstate__(state)

    def step(self, closure=None):
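For reference, a minimal sketch of constructing the optimizer (the import path assumes the `mozilla_voice_tts` package layout introduced by this diff; hyperparameters are illustrative):

```python
import torch
from mozilla_voice_tts.utils.radam import RAdam  # assumed module path

model = torch.nn.Linear(10, 10)
optimizer = RAdam(model.parameters(), lr=1e-4, weight_decay=0.0)
```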
@@ -47,7 +47,7 @@ class TensorboardLogger(object):
        for key, value in audios.items():
            try:
                self.writer.add_audio('{}/{}'.format(scope_name, key), value, step, sample_rate=sample_rate)
            except:
            except RuntimeError:
                traceback.print_exc()

    def tb_train_iter_stats(self, step, stats):
@@ -13,13 +13,21 @@ def setup_torch_training_env(cudnn_enable, cudnn_benchmark):
    return use_cuda, num_gpus


def check_update(model, grad_clip, ignore_stopnet=False):
def check_update(model, grad_clip, ignore_stopnet=False, amp_opt_params=None):
    r'''Check model gradient against unexpected jumps and failures'''
    skip_flag = False
    if ignore_stopnet:
        grad_norm = torch.nn.utils.clip_grad_norm_([param for name, param in model.named_parameters() if 'stopnet' not in name], grad_clip)
        if not amp_opt_params:
            grad_norm = torch.nn.utils.clip_grad_norm_(
                [param for name, param in model.named_parameters() if 'stopnet' not in name], grad_clip)
        else:
            grad_norm = torch.nn.utils.clip_grad_norm_(amp_opt_params, grad_clip)
    else:
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        if not amp_opt_params:
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        else:
            grad_norm = torch.nn.utils.clip_grad_norm_(amp_opt_params, grad_clip)

    # compatibility with different torch versions
    if isinstance(grad_norm, float):
        if np.isinf(grad_norm):
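A hedged sketch of the training-step pattern `check_update` supports; with apex/amp, gradients live on the master parameters, which is what the new `amp_opt_params` argument passes through. This assumes the function returns `(grad_norm, skip_flag)`, which is not visible in the hunk above:

```python
import torch

model = torch.nn.Linear(8, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

loss = model(torch.randn(4, 8)).pow(2).mean()
loss.backward()
grad_norm, skip_flag = check_update(model, grad_clip=5.0)  # no amp: clips model.parameters()
if not skip_flag:
    optimizer.step()
```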
@@ -0,0 +1,39 @@
# Mozilla TTS Vocoders (Experimental)

Here are vocoder model implementations that can be combined with the other TTS models.

Currently, the following models are implemented:

- MelGAN
- MultiBand-MelGAN
- ParallelWaveGAN
- GAN-TTS (discriminator only)

It is also easy to adapt other vocoder models, as we provide a flexible and modular (but not too modular) framework.

## Training a model

You can see an example of training MelGAN with the LJSpeech dataset in this [Colab Notebook]() (coming soon).

In order to train a new model, you need to gather all wav files into a folder and set this folder as `data_path` in `config.json`.

You need to define the other relevant parameters in your `config.json` and then start training with the following command.

```CUDA_VISIBLE_DEVICES='0' python tts/bin/train_vocoder.py --config_path path/to/config.json```

Example config files can be found under the `tts/vocoder/configs/` folder.

You can continue a previous training run with the following command.

```CUDA_VISIBLE_DEVICES='0' python tts/bin/train_vocoder.py --continue_path path/to/your/model/folder```

You can fine-tune a pre-trained model with the following command.

```CUDA_VISIBLE_DEVICES='0' python tts/bin/train_vocoder.py --restore_path path/to/your/model.pth.tar```

Restoring a model starts a new training run in a different folder; it only restores model weights from the given checkpoint file. Continuing a training run, however, resumes in the same directory where the previous run left off.

You can also follow your training runs on TensorBoard as you do with our TTS models.

## Acknowledgement
Thanks to @kan-bayashi for his [repository](https://github.com/kan-bayashi/ParallelWaveGAN), which was the starting point of our work.
@@ -0,0 +1,144 @@
{
    "run_name": "multiband-melgan",
    "run_description": "multiband melgan mean-var scaling",

    // AUDIO PARAMETERS
    "audio":{
        "fft_size": 1024,        // number of stft frequency levels. Size of the linear spectrogram frame.
        "win_length": 1024,      // stft window length in ms.
        "hop_length": 256,       // stft window hop-length in ms.
        "frame_length_ms": null, // stft window length in ms. If null, 'win_length' is used.
        "frame_shift_ms": null,  // stft window hop-length in ms. If null, 'hop_length' is used.

        // Audio processing parameters
        "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
        "preemphasis": 0.0,   // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.
        "ref_level_db": 0,    // reference level db, theoretically 20db is the sound of air.

        // Silence trimming
        "do_trim_silence": true, // enable trimming of silence as you load the audio. LJSpeech (false), TWEB (false), Nancy (true)
        "trim_db": 60,           // threshold for trimming silence. Set this according to your dataset.

        // MelSpectrogram parameters
        "num_mels": 80,     // size of the mel spec frame.
        "mel_fmin": 50.0,   // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
        "mel_fmax": 7600.0, // maximum freq level for mel-spec. Tune for dataset!!
        "spec_gain": 1.0,   // scaler value applied after log transform of spectrogram.

        // Normalization parameters
        "signal_norm": true,    // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
        "min_level_db": -100,   // lower bound for normalization
        "symmetric_norm": true, // move normalization to range [-1, 1]
        "max_norm": 4.0,        // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
        "clip_norm": true,      // clip normalized values into the range.
        "stats_path": "/home/erogol/Data/MozillaMerged22050/scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored
    },

    // DISTRIBUTED TRAINING
    // "distributed":{
    //     "backend": "nccl",
    //     "url": "tcp:\/\/localhost:54321"
    // },

    // MODEL PARAMETERS
    "use_pqmf": true,

    // LOSS PARAMETERS
    "use_stft_loss": true,
    "use_subband_stft_loss": true,
    "use_mse_gan_loss": true,
    "use_hinge_gan_loss": false,
    "use_feat_match_loss": false, // use only with melgan discriminators

    // loss weights
    "stft_loss_weight": 0.5,
    "subband_stft_loss_weight": 0.5,
    "mse_G_loss_weight": 2.5,
    "hinge_G_loss_weight": 2.5,
    "feat_match_loss_weight": 25,

    // multiscale stft loss parameters
    "stft_loss_params": {
        "n_ffts": [1024, 2048, 512],
        "hop_lengths": [120, 240, 50],
        "win_lengths": [600, 1200, 240]
    },

    // subband multiscale stft loss parameters
    "subband_stft_loss_params":{
        "n_ffts": [384, 683, 171],
        "hop_lengths": [30, 60, 10],
        "win_lengths": [150, 300, 60]
    },

    "target_loss": "avg_G_loss", // loss value to pick the best model to save after each epoch

    // DISCRIMINATOR
    "discriminator_model": "melgan_multiscale_discriminator",
    "discriminator_model_params":{
        "base_channels": 16,
        "max_channels": 512,
        "downsample_factors": [4, 4, 4]
    },
    "steps_to_start_discriminator": 200000, // steps required to start GAN training.

    // GENERATOR
    "generator_model": "multiband_melgan_generator",
    "generator_model_params": {
        "upsample_factors": [8, 4, 2],
        "num_res_blocks": 4
    },

    // DATASET
    "data_path": "/home/erogol/Data/MozillaMerged22050/wavs/",
    "feature_path": null,
    "seq_len": 16384,
    "pad_short": 2000,
    "conv_pad": 0,
    "use_noise_augment": false,
    "use_cache": true,

    "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.

    // TRAINING
    "batch_size": 64, // Batch size for training. Values lower than 32 might make attention hard to learn. It is overwritten by 'gradual_training'.

    // VALIDATION
    "run_eval": true,
    "test_delay_epochs": 10,     // Until attention is aligned, testing only wastes computation time.
    "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.

    // OPTIMIZER
    "epochs": 10000,      // total number of epochs to train.
    "wd": 0.0,            // Weight decay weight.
    "gen_clip_grad": -1,  // Generator gradient clipping threshold. Apply gradient clipping if > 0
    "disc_clip_grad": -1, // Discriminator gradient clipping threshold.
    "lr_scheduler_gen": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
    "lr_scheduler_gen_params": {
        "gamma": 0.5,
        "milestones": [100000, 200000, 300000, 400000, 500000, 600000]
    },
    "lr_scheduler_disc": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
    "lr_scheduler_disc_params": {
        "gamma": 0.5,
        "milestones": [100000, 200000, 300000, 400000, 500000, 600000]
    },
    "lr_gen": 1e-4, // Initial learning rate. If Noam decay is active, maximum learning rate.
    "lr_disc": 1e-4,

    // TENSORBOARD and LOGGING
    "print_step": 25,    // Number of steps to log training on console.
    "print_eval": false, // If True, it prints loss values for each step in eval run.
    "save_step": 25000,  // Number of training steps expected to plot training stats on TB and save model checkpoints.
    "checkpoint": true,  // If true, it saves checkpoints per "save_step"
    "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.

    // DATA LOADING
    "num_loader_workers": 4,     // number of training data loader processes. Don't set it too big. 4-8 are good values.
    "num_val_loader_workers": 4, // number of evaluation data loader processes.
    "eval_split_size": 10,

    // PATHS
    "output_path": "/home/erogol/Models/Mozilla/"
}
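These commented JSON files are parsed with the `load_config` helper shown earlier, which strips the `//` comments before `json.loads`. A hedged sketch of reading this config (the file path and import path are assumptions):

```python
from mozilla_voice_tts.utils.io import load_config  # assumed module path

c = load_config('mozilla_voice_tts/vocoder/configs/multiband_melgan_config.json')  # illustrative path
print(c.generator_model)        # 'multiband_melgan_generator'
print(c.audio['sample_rate'])   # 22050
```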
@@ -0,0 +1,143 @@
{
    "run_name": "pwgan",
    "run_description": "parallel-wavegan training",

    // AUDIO PARAMETERS
    "audio":{
        "fft_size": 1024,        // number of stft frequency levels. Size of the linear spectrogram frame.
        "win_length": 1024,      // stft window length in ms.
        "hop_length": 256,       // stft window hop-length in ms.
        "frame_length_ms": null, // stft window length in ms. If null, 'win_length' is used.
        "frame_shift_ms": null,  // stft window hop-length in ms. If null, 'hop_length' is used.

        // Audio processing parameters
        "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
        "preemphasis": 0.0,   // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.
        "ref_level_db": 0,    // reference level db, theoretically 20db is the sound of air.

        // Silence trimming
        "do_trim_silence": true, // enable trimming of silence as you load the audio. LJSpeech (false), TWEB (false), Nancy (true)
        "trim_db": 60,           // threshold for trimming silence. Set this according to your dataset.

        // MelSpectrogram parameters
        "num_mels": 80,     // size of the mel spec frame.
        "mel_fmin": 50.0,   // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
        "mel_fmax": 7600.0, // maximum freq level for mel-spec. Tune for dataset!!
        "spec_gain": 1.0,   // scaler value applied after log transform of spectrogram.

        // Normalization parameters
        "signal_norm": true,    // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
        "min_level_db": -100,   // lower bound for normalization
        "symmetric_norm": true, // move normalization to range [-1, 1]
        "max_norm": 4.0,        // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
        "clip_norm": true,      // clip normalized values into the range.
        "stats_path": "/home/erogol/Data/LJSpeech-1.1/scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored
    },

    // DISTRIBUTED TRAINING
    // "distributed":{
    //     "backend": "nccl",
    //     "url": "tcp:\/\/localhost:54321"
    // },

    // MODEL PARAMETERS
    "use_pqmf": true,

    // LOSS PARAMETERS
    "use_stft_loss": true,
    "use_subband_stft_loss": false, // USE ONLY WITH MULTIBAND MODELS
    "use_mse_gan_loss": true,
    "use_hinge_gan_loss": false,
    "use_feat_match_loss": false, // use only with melgan discriminators

    // loss weights
    "stft_loss_weight": 0.5,
    "subband_stft_loss_weight": 0.5,
    "mse_G_loss_weight": 2.5,
    "hinge_G_loss_weight": 2.5,
    "feat_match_loss_weight": 25,

    // multiscale stft loss parameters
    "stft_loss_params": {
        "n_ffts": [1024, 2048, 512],
        "hop_lengths": [120, 240, 50],
        "win_lengths": [600, 1200, 240]
    },

    // subband multiscale stft loss parameters
    "subband_stft_loss_params":{
        "n_ffts": [384, 683, 171],
        "hop_lengths": [30, 60, 10],
        "win_lengths": [150, 300, 60]
    },

    "target_loss": "avg_G_loss", // loss value to pick the best model to save after each epoch

    // DISCRIMINATOR
    "discriminator_model": "parallel_wavegan_discriminator",
    "discriminator_model_params":{
        "num_layers": 10
    },
    "steps_to_start_discriminator": 200000, // steps required to start GAN training.

    // GENERATOR
    "generator_model": "parallel_wavegan_generator",
    "generator_model_params": {
        "upsample_factors": [4, 4, 4, 4],
        "stacks": 3,
        "num_res_blocks": 30
    },

    // DATASET
    "data_path": "/home/erogol/Data/LJSpeech-1.1/wavs/",
    "feature_path": null,
    "seq_len": 25600,
    "pad_short": 2000,
    "conv_pad": 0,
    "use_noise_augment": false,
    "use_cache": true,

    "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.

    // TRAINING
    "batch_size": 6, // Batch size for training. Values lower than 32 might make attention hard to learn. It is overwritten by 'gradual_training'.

    // VALIDATION
    "run_eval": true,
    "test_delay_epochs": 10,     // Until attention is aligned, testing only wastes computation time.
    "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.

    // OPTIMIZER
    "epochs": 10000,      // total number of epochs to train.
    "wd": 0.0,            // Weight decay weight.
    "gen_clip_grad": -1,  // Generator gradient clipping threshold. Apply gradient clipping if > 0
    "disc_clip_grad": -1, // Discriminator gradient clipping threshold.
    "lr_scheduler_gen": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
    "lr_scheduler_gen_params": {
        "gamma": 0.5,
        "milestones": [100000, 200000, 300000, 400000, 500000, 600000]
    },
    "lr_scheduler_disc": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
    "lr_scheduler_disc_params": {
        "gamma": 0.5,
        "milestones": [100000, 200000, 300000, 400000, 500000, 600000]
    },
    "lr_gen": 1e-4, // Initial learning rate. If Noam decay is active, maximum learning rate.
    "lr_disc": 1e-4,

    // TENSORBOARD and LOGGING
    "print_step": 25,    // Number of steps to log training on console.
    "print_eval": false, // If True, it prints loss values for each step in eval run.
    "save_step": 25000,  // Number of training steps expected to plot training stats on TB and save model checkpoints.
    "checkpoint": true,  // If true, it saves checkpoints per "save_step"
    "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.

    // DATA LOADING
    "num_loader_workers": 4,     // number of training data loader processes. Don't set it too big. 4-8 are good values.
    "num_val_loader_workers": 4, // number of evaluation data loader processes.
    "eval_split_size": 10,

    // PATHS
    "output_path": "/home/erogol/Models/LJSpeech/"
}
@@ -306,4 +306,4 @@ class DiscriminatorLoss(nn.Module):
            loss += hinge_D_loss

        return_dict['D_loss'] = loss
        return return_dict
        return return_dict
@@ -0,0 +1,87 @@
import torch
from torch.nn import functional as F


class ResidualBlock(torch.nn.Module):
    """Residual block module in WaveNet."""
    def __init__(self,
                 kernel_size=3,
                 res_channels=64,
                 gate_channels=128,
                 skip_channels=64,
                 aux_channels=80,
                 dropout=0.0,
                 dilation=1,
                 bias=True,
                 use_causal_conv=False):
        super(ResidualBlock, self).__init__()
        self.dropout = dropout
        # no future time stamps available
        if use_causal_conv:
            padding = (kernel_size - 1) * dilation
        else:
            assert (kernel_size -
                    1) % 2 == 0, "Even kernel sizes are not supported."
            padding = (kernel_size - 1) // 2 * dilation
        self.use_causal_conv = use_causal_conv

        # dilated conv
        self.conv = torch.nn.Conv1d(res_channels,
                                    gate_channels,
                                    kernel_size,
                                    padding=padding,
                                    dilation=dilation,
                                    bias=bias)

        # local conditioning
        if aux_channels > 0:
            self.conv1x1_aux = torch.nn.Conv1d(aux_channels,
                                               gate_channels,
                                               1,
                                               bias=False)
        else:
            self.conv1x1_aux = None

        # conv output is split into two groups
        gate_out_channels = gate_channels // 2
        self.conv1x1_out = torch.nn.Conv1d(gate_out_channels,
                                           res_channels,
                                           1,
                                           bias=bias)
        self.conv1x1_skip = torch.nn.Conv1d(gate_out_channels,
                                            skip_channels,
                                            1,
                                            bias=bias)

    def forward(self, x, c):
        """
        x: B x D_res x T
        c: B x D_aux x T
        """
        residual = x
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv(x)

        # remove future time steps if use_causal_conv is set
        x = x[:, :, :residual.size(-1)] if self.use_causal_conv else x

        # split into two parts for gated activation
        splitdim = 1
        xa, xb = x.split(x.size(splitdim) // 2, dim=splitdim)

        # local conditioning
        if c is not None:
            assert self.conv1x1_aux is not None
            c = self.conv1x1_aux(c)
            ca, cb = c.split(c.size(splitdim) // 2, dim=splitdim)
            xa, xb = xa + ca, xb + cb

        x = torch.tanh(xa) * torch.sigmoid(xb)

        # for skip connection
        s = self.conv1x1_skip(x)

        # for residual connection
        x = (self.conv1x1_out(x) + residual) * (0.5**2)

        return x, s
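A minimal sketch exercising the block with its default channel sizes; shapes follow the `forward` docstring (`x`: B x D_res x T, `c`: B x D_aux x T), and the time length is preserved:

```python
import torch

block = ResidualBlock(kernel_size=3, res_channels=64, gate_channels=128,
                      skip_channels=64, aux_channels=80)
x = torch.randn(2, 64, 100)     # residual-channel input
c = torch.randn(2, 80, 100)     # local conditioning (e.g. mel frames)
out, skip = block(x, c)
print(out.shape, skip.shape)    # torch.Size([2, 64, 100]) torch.Size([2, 64, 100])
```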
@@ -0,0 +1,101 @@
import torch
from torch.nn import functional as F


class Stretch2d(torch.nn.Module):
    def __init__(self, x_scale, y_scale, mode="nearest"):
        super(Stretch2d, self).__init__()
        self.x_scale = x_scale
        self.y_scale = y_scale
        self.mode = mode

    def forward(self, x):
        """
        x (Tensor): Input tensor (B, C, F, T).
        Tensor: Interpolated tensor (B, C, F * y_scale, T * x_scale)
        """
        return F.interpolate(
            x, scale_factor=(self.y_scale, self.x_scale), mode=self.mode)


class UpsampleNetwork(torch.nn.Module):
    # pylint: disable=dangerous-default-value
    def __init__(self,
                 upsample_factors,
                 nonlinear_activation=None,
                 nonlinear_activation_params={},
                 interpolate_mode="nearest",
                 freq_axis_kernel_size=1,
                 use_causal_conv=False,
                 ):
        super(UpsampleNetwork, self).__init__()
        self.use_causal_conv = use_causal_conv
        self.up_layers = torch.nn.ModuleList()
        for scale in upsample_factors:
            # interpolation layer
            stretch = Stretch2d(scale, 1, interpolate_mode)
            self.up_layers += [stretch]

            # conv layer
            assert (freq_axis_kernel_size - 1) % 2 == 0, "Even freq axis kernel sizes are not supported."
            freq_axis_padding = (freq_axis_kernel_size - 1) // 2
            kernel_size = (freq_axis_kernel_size, scale * 2 + 1)
            if use_causal_conv:
                padding = (freq_axis_padding, scale * 2)
            else:
                padding = (freq_axis_padding, scale)
            conv = torch.nn.Conv2d(1, 1, kernel_size=kernel_size, padding=padding, bias=False)
            self.up_layers += [conv]

            # nonlinear
            if nonlinear_activation is not None:
                nonlinear = getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params)
                self.up_layers += [nonlinear]

    def forward(self, c):
        """
        c : (B, C, T_in).
        Tensor: (B, C, T_upsample)
        """
        c = c.unsqueeze(1)  # (B, 1, C, T)
        for f in self.up_layers:
            c = f(c)
        return c.squeeze(1)  # (B, C, T')


class ConvUpsample(torch.nn.Module):
    # pylint: disable=dangerous-default-value
    def __init__(self,
                 upsample_factors,
                 nonlinear_activation=None,
                 nonlinear_activation_params={},
                 interpolate_mode="nearest",
                 freq_axis_kernel_size=1,
                 aux_channels=80,
                 aux_context_window=0,
                 use_causal_conv=False
                 ):
        super(ConvUpsample, self).__init__()
        self.aux_context_window = aux_context_window
        self.use_causal_conv = use_causal_conv and aux_context_window > 0
        # To capture wide-context information in conditional features
        kernel_size = aux_context_window + 1 if use_causal_conv else 2 * aux_context_window + 1
        # NOTE(kan-bayashi): Here do not use padding because the input is already padded
        self.conv_in = torch.nn.Conv1d(aux_channels, aux_channels, kernel_size=kernel_size, bias=False)
        self.upsample = UpsampleNetwork(
            upsample_factors=upsample_factors,
            nonlinear_activation=nonlinear_activation,
            nonlinear_activation_params=nonlinear_activation_params,
            interpolate_mode=interpolate_mode,
            freq_axis_kernel_size=freq_axis_kernel_size,
            use_causal_conv=use_causal_conv,
        )

    def forward(self, c):
        """
        c : (B, C, T_in).
        Tensor: (B, C, T_upsampled)
        """
        c_ = self.conv_in(c)
        c = c_[:, :, :-self.aux_context_window] if self.use_causal_conv else c_
        return self.upsample(c)
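A minimal sketch of the upsampler: with `upsample_factors=[4, 4, 4, 4]` the time axis is stretched by their product (256, i.e. the `hop_length` in the configs above), turning mel frames into sample-rate resolution:

```python
import torch

net = ConvUpsample(upsample_factors=[4, 4, 4, 4], aux_channels=80, aux_context_window=0)
c = torch.randn(2, 80, 10)   # (B, num_mels, frames)
print(net(c).shape)          # torch.Size([2, 80, 2560]) -> 10 frames * 256
```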
@@ -2,7 +2,7 @@ import torch
from torch import nn
from torch.nn.utils import weight_norm

from TTS.vocoder.layers.melgan import ResidualStack
from mozilla_voice_tts.vocoder.layers.melgan import ResidualStack


class MelganGenerator(nn.Module):

@@ -95,4 +95,3 @@ class MelganGenerator(nn.Module):
                nn.utils.remove_weight_norm(layer)
            except ValueError:
                layer.remove_weight_norm()
@@ -1,6 +1,6 @@
from torch import nn

from TTS.vocoder.models.melgan_discriminator import MelganDiscriminator
from mozilla_voice_tts.vocoder.models.melgan_discriminator import MelganDiscriminator


class MelganMultiscaleDiscriminator(nn.Module):

@@ -38,4 +38,4 @@ class MelganMultiscaleDiscriminator(nn.Module):
            scores.append(score)
            feats.append(feat)
            x = self.pooling(x)
        return scores, feats
        return scores, feats