mirror of https://github.com/coqui-ai/TTS.git
Merge remote-tracking branch 'TTS/dev' into dev
commit c96028f464
@@ -1,2 +1,5 @@
 linters:
     - pylint:
+        # pylintrc: pylintrc
+        filefilter: ['- test_*.py', '+ *.py', '- *.npy']
+        # exclude:
@@ -6,8 +6,9 @@ labels: ''
 assignees: ''

 ---
+<b>Questions</b> will not be answered here!!

-Please consider posting on [TTS Discourse](https://discourse.mozilla.org/c/tts) page, if your issue is not directly related to TTS development process.
+Please consider posting on [TTS Discourse](https://discourse.mozilla.org/c/tts) page if your issue is not directly related to TTS development (Bugs, code updates etc.).

 You can also check https://github.com/mozilla/TTS/wiki/FAQ for common questions and answers.

@@ -157,7 +157,8 @@ disable=missing-docstring,
         xreadlines-attribute,
         deprecated-sys-function,
         exception-escape,
-        comprehension-escape
+        comprehension-escape,
+        duplicate-code

 # Enable the message, report, category or checker with the given id(s). You can
 # either give multiple identifier separated by comma (,) or put this option
@@ -3,6 +3,10 @@ language: python
 git:
   quiet: true

+before_install:
+    - sudo apt-get update
+    - sudo apt-get -y install espeak
+
 matrix:
   include:
     - name: "Lint check"
@@ -12,7 +12,9 @@ fi
 if [[ "$TEST_SUITE" == "unittest" ]]; then
     # Run tests on all pushes
     pushd tts_namespace
-    python -m unittest
+    nosetests TTS.speaker_encoder.tests --nocapture
+    nosetests TTS.vocoder.tests --nocapture
+    nosetests TTS.tests --nocapture
    popd
    # Test server package
    ./tests/test_server_package.sh
README.md (64 changed lines)
@@ -3,25 +3,49 @@

 <img src="https://travis-ci.org/mozilla/TTS.svg?branch=dev"/>

-This project is a part of [Mozilla Common Voice](https://voice.mozilla.org/en). TTS aims a deep learning based Text2Speech engine, low in cost and high in quality. To begin with, you can hear a sample generated voice from [here](https://soundcloud.com/user-565970875/commonvoice-loc-sens-attn).
+This project is a part of [Mozilla Common Voice](https://voice.mozilla.org/en). TTS aims a deep learning based Text2Speech engine, low in cost and high in quality. To begin with, you can hear a sample synthesized voice from [here](https://soundcloud.com/user-565970875/commonvoice-loc-sens-attn).

-TTS includes two different model implementations which are based on [Tacotron](https://arxiv.org/abs/1703.10135) and [Tacotron2](https://arxiv.org/abs/1712.05884). Tacotron is smaller, efficient and easier to train but Tacotron2 provides better results, especially when it is combined with a Neural vocoder. Therefore, choose depending on your project requirements.

 If you are new, you can also find [here](http://www.erogol.com/text-speech-deep-learning-architectures/) a brief post about TTS architectures and their comparisons.

+[](https://sourcerer.io/fame/erogol/erogol/TTS/links/0)[](https://sourcerer.io/fame/erogol/erogol/TTS/links/1)[](https://sourcerer.io/fame/erogol/erogol/TTS/links/2)[](https://sourcerer.io/fame/erogol/erogol/TTS/links/3)[](https://sourcerer.io/fame/erogol/erogol/TTS/links/4)[](https://sourcerer.io/fame/erogol/erogol/TTS/links/5)[](https://sourcerer.io/fame/erogol/erogol/TTS/links/6)[](https://sourcerer.io/fame/erogol/erogol/TTS/links/7)

 ## TTS Performance
 <p align="center"><img src="https://camo.githubusercontent.com/9fa79f977015e55eb9ec7aa32045555f60d093d3/68747470733a2f2f646973636f757273652d706161732d70726f64756374696f6e2d636f6e74656e742e73332e6475616c737461636b2e75732d656173742d312e616d617a6f6e6177732e636f6d2f6f7074696d697a65642f33582f362f342f363432386639383065396563373531633234386535393134363038393566373838316165633063365f325f363930783339342e706e67"/></p>

 [Details...](https://github.com/mozilla/TTS/wiki/Mean-Opinion-Score-Results)

+## Provided Models and Methods
+Text-to-Spectrogram:
+- Tacotron: [paper](https://arxiv.org/abs/1703.10135)
+- Tacotron2: [paper](https://arxiv.org/abs/1712.05884)
+
+Attention Methods:
+- Guided Attention: [paper](https://arxiv.org/abs/1710.08969)
+- Forward Backward Decoding: [paper](https://arxiv.org/abs/1907.09006)
+- Graves Attention: [paper](https://arxiv.org/abs/1907.09006)
+- Double Decoder Consistency: [blog](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/)
+
+Speaker Encoder:
+- GE2E: [paper](https://arxiv.org/abs/1710.10467)
+
+Vocoders:
+- MelGAN: [paper](https://arxiv.org/abs/1710.10467)
+- MultiBandMelGAN: [paper](https://arxiv.org/abs/2005.05106)
+- GAN-TTS discriminators: [paper](https://arxiv.org/abs/1909.11646)
+
+You can also help us implement more models. Some TTS related work can be found [here](https://github.com/erogol/TTS-papers).
+
 ## Features
-- High performance Text2Speech models on Torch and Tensorflow 2.0.
-- High performance Speaker Encoder to compute speaker embeddings efficiently.
-- Integration with various Neural Vocoders (PWGAN, MelGAN, WaveRNN)
-- Released trained models.
-- Efficient training codes for PyTorch. (soon for Tensorflow 2.0)
-- Codes to convert Torch models to Tensorflow 2.0.
-- Detailed training anlaysis on console and Tensorboard.
+- High performance Deep Learning models for Text2Speech related tasks.
+- Text2Speech models (Tacotron, Tacotron2).
+- Speaker Encoder to compute speaker embeddings efficiently.
+- Vocoder models (MelGAN, Multiband-MelGAN, GAN-TTS)
+- Support for multi-speaker TTS training.
+- Support for Multi-GPUs training.
+- Ability to convert Torch models to Tensorflow 2.0 for inference.
+- Released pre-trained models.
+- Fast and efficient model training.
+- Detailed training logs on console and Tensorboard.
 - Tools to curate Text2Speech datasets under```dataset_analysis```.
 - Demo server for model testing.
 - Notebooks for extensive model benchmarking.
@@ -45,6 +69,22 @@ Or you can use ```requirements.txt``` to install the requirements only.

 ```pip install -r requirements.txt```

+### Directory Structure
+```
+|- TTS/
+| |- train.py (train your TTS model.)
+| |- distribute.py (train your TTS model using Multiple GPUs)
+| |- config.json (TTS model configuration file)
+| |- tf/ (Tensorflow 2 utilities and model implementations)
+| |- layers/ (model layer definitions)
+| |- models/ (model definitions)
+| |- notebooks/ (Jupyter Notebooks for model evaluation and parameter selection)
+| |- data_analysis/ (TTS Dataset analysis tools and notebooks.)
+| |- utils/ (TTS utilities -io, visualization, data processing etc.-)
+| |- speaker_encoder/ (Speaker Encoder implementation with the same folder structure.)
+| |- vocoder/ (Vocoder implementations with the same folder structure.)
+```
+
 ### Docker
 A barebone `Dockerfile` exists at the root of the project, which should let you quickly setup the environment. By default, it will start the server and let you query it. Make sure to use `nvidia-docker` to use your GPUs. Make sure you follow the instructions in the [`server README`](server/README.md) before you build your image so that the server can find the model within the image.

@@ -61,7 +101,7 @@ Below you see Tacotron model state after 16K iterations with batch-size 32 with

 > "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase the grey matter in the parts of the brain responsible for emotional regulation and learning."

-Audio examples: [https://soundcloud.com/user-565970875](https://soundcloud.com/user-565970875)
+Audio examples: [soundcloud](https://soundcloud.com/user-565970875/pocket-article-wavernn-and-tacotron2)

 <img src="images/example_model_output.png?raw=true" alt="example_output" width="400"/>

@@ -82,7 +122,7 @@ Audio length is approximately 6 secs.


 ## Datasets and Data-Loading
-TTS provides a generic dataloder easy to use for new datasets. You need to write an preprocessor function to integrate your own dataset.Check ```datasets/preprocess.py``` to see some examples. After the function, you need to set ```dataset``` field in ```config.json```. Do not forget other data related fields too.
+TTS provides a generic dataloader easy to use for new datasets. You need to write an preprocessor function to integrate your own dataset.Check ```datasets/preprocess.py``` to see some examples. After the function, you need to set ```dataset``` field in ```config.json```. Do not forget other data related fields too.

 Some of the open-sourced datasets that we successfully applied TTS, are linked below.

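(For readers integrating a new dataset: below is a minimal sketch of such a preprocessor, modeled on the functions in ```datasets/preprocess.py``` that appear further down in this commit. The dataset name, folder layout and metadata format are hypothetical; only the returned ```[text, wav_file, speaker_name]``` item layout is taken from the shipped preprocessors.)

```python
import os

def my_dataset(root_path, meta_file):
    """Hypothetical preprocessor for a '<wav_id>|<transcript>' metadata file."""
    items = []
    speaker_name = "my_speaker"
    with open(os.path.join(root_path, meta_file), 'r', encoding='utf-8') as f:
        for line in f:
            cols = line.strip().split('|')
            wav_file = os.path.join(root_path, 'wavs', cols[0] + '.wav')
            text = cols[1].strip()
            items.append([text, wav_file, speaker_name])
    return items
```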
@@ -8,7 +8,7 @@ import numpy as np
 from tqdm import tqdm

 from TTS.datasets.preprocess import load_meta_data
-from TTS.utils.generic_utils import load_config
+from TTS.utils.io import load_config
 from TTS.utils.audio import AudioProcessor

 def main():
@@ -63,6 +63,11 @@ def main():
     stats['linear_mean'] = linear_mean
     stats['linear_std'] = linear_scale

+    print(f' > Avg mel spec mean: {mel_mean.mean()}')
+    print(f' > Avg mel spec scale: {mel_scale.mean()}')
+    print(f' > Avg linear spec mean: {linear_mean.mean()}')
+    print(f' > Avg lienar spec scale: {linear_scale.mean()}')
+
     # set default config values for mean-var scaling
     CONFIG.audio['stats_path'] = output_file_path
     CONFIG.audio['signal_norm'] = True
@@ -73,6 +78,7 @@ def main():
     del CONFIG.audio['clip_norm']
     stats['audio_config'] = CONFIG.audio
     np.save(output_file_path, stats, allow_pickle=True)
+    print(f' > scale_stats.npy is saved to {output_file_path}')


 if __name__ == "__main__":
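(A minimal sketch of how the saved statistics file might later be consumed for mean-variance scaling. The ```linear_mean```/```linear_std``` keys are set in the hunk above; the ```mel_mean```/```mel_std``` keys are assumed to be written the same way, and the actual loading logic lives in the audio processor rather than in this script.)

```python
import numpy as np

stats = np.load('scale_stats.npy', allow_pickle=True).item()
mel_mean, mel_std = stats['mel_mean'], stats['mel_std']  # assumed keys, mirroring 'linear_mean'/'linear_std'

def normalize_mel(mel_spec):
    # mean-var scaling: zero mean, unit variance per frequency bin
    return (mel_spec - mel_mean) / mel_std

def denormalize_mel(norm_mel_spec):
    # inverse transform, e.g. before Griffin-Lim or a neural vocoder
    return norm_mel_spec * mel_std + mel_mean
```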
config.json (25 changed lines)
@@ -1,24 +1,24 @@
 {
     "model": "Tacotron2",
-    "run_name": "ljspeech",
-    "run_description": "tacotron2",
+    "run_name": "ljspeech-ddc-bn",
+    "run_description": "tacotron2 with ddc and batch-normalization",

     // AUDIO PARAMETERS
     "audio":{
         // stft parameters
-        "num_freq": 513, // number of stft frequency levels. Size of the linear spectogram frame.
+        "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame.
         "win_length": 1024, // stft window length in ms.
         "hop_length": 256, // stft window hop-lengh in ms.
         "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used.
         "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used.

         // Audio processing parameters
-        "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
+        "sample_rate": 22050, // DATASET-RELATED: wav sample-rate.
         "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
         "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.

         // Silence trimming
-        "do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
+        "do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (true), TWEB (false), Nancy (true)
         "trim_db": 60, // threshold for timming silence. Set this according to your dataset.

         // Griffin-Lim
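(Aside: the renamed ```fft_size``` key is consistent with the value it replaces, since a 1024-point FFT produces fft_size // 2 + 1 = 513 linear-spectrogram bins, which is exactly the old ```num_freq``` value.)

```python
fft_size = 1024
num_freq = fft_size // 2 + 1  # = 513 linear spectrogram bins, matching the removed "num_freq" key
```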
@@ -83,27 +83,30 @@

     // TACOTRON PRENET
     "memory_size": -1, // ONLY TACOTRON - size of the memory queue used fro storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame.
-    "prenet_type": "original", // "original" or "bn".
-    "prenet_dropout": true, // enable/disable dropout at prenet.
+    "prenet_type": "bn", // "original" or "bn".
+    "prenet_dropout": false, // enable/disable dropout at prenet.

     // ATTENTION
     "attention_type": "original", // 'original' or 'graves'
     "attention_heads": 4, // number of attention heads (only for 'graves')
-    "attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
+    "attention_norm": "sigmoid", // softmax or sigmoid.
     "windowing": false, // Enables attention windowing. Used only in eval mode.
     "use_forward_attn": false, // if it uses forward attention. In general, it aligns faster.
     "forward_attn_mask": false, // Additional masking forcing monotonicity only in eval mode.
     "transition_agent": false, // enable/disable transition agent of forward attention.
     "location_attn": true, // enable_disable location sensitive attention. It is enabled for TACOTRON by default.
     "bidirectional_decoder": false, // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset.
+    "double_decoder_consistency": true, // use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/
+    "ddc_r": 7, // reduction rate for coarse decoder.

     // STOPNET
     "stopnet": true, // Train stopnet predicting the end of synthesis.
     "separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER.

     // TENSORBOARD and LOGGING
-    "print_step": 25, // Number of steps to log traning on console.
-    "print_eval": false, // If True, it prints loss values in evalulation.
+    "print_step": 25, // Number of steps to log training on console.
+    "tb_plot_step:": 100, // Number of steps to plot TB training figures.
+    "print_eval": false, // If True, it prints intermediate loss values in evalulation.
     "save_step": 10000, // Number of training steps expected to save traninpg stats and checkpoints.
     "checkpoint": true, // If true, it saves checkpoints per "save_step"
     "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
@@ -122,7 +125,7 @@

     // PHONEMES
     "phoneme_cache_path": "/media/erogol/data_ssd2/mozilla_us_phonemes_3", // phoneme computation is slow, therefore, it caches results in the given folder.
-    "use_phonemes": false, // use phonemes instead of raw characters. It is suggested for better pronounciation.
+    "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation.
     "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages

     // MULTI-SPEAKER and GST
@@ -1,134 +0,0 @@
-{
-    "model": "Tacotron2", // one of the model in models/
-    "run_name": "ljspeech-stft_params",
-    "run_description": "tacotron2 cosntant stf parameters",
-
-    // AUDIO PARAMETERS
-    "audio":{
-        // Audio processing parameters
-        "num_mels": 80, // size of the mel spec frame.
-        "num_freq": 513, // number of stft frequency levels. Size of the linear spectogram frame.
-        "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
-        "win_length": 1024, // stft window length in ms.
-        "hop_length": 256, // stft window hop-lengh in ms.
-        "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
-        "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used.
-        "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used.
-        "min_level_db": -100, // normalization range
-        "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
-        "power": 1.5, // value to sharpen wav signals after GL algorithm.
-        "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
-        // Normalization parameters
-        "signal_norm": true, // normalize the spec values in range [0, 1]
-        "symmetric_norm": true, // move normalization to range [-1, 1]
-        "max_norm": 1.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
-        "clip_norm": true, // clip normalized values into the range.
-        "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
-        "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
-        "do_trim_silence": true, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
-        "trim_db": 60 // threshold for timming silence. Set this according to your dataset.
-    },
-
-    // VOCABULARY PARAMETERS
-    // if custom character set is not defined,
-    // default set in symbols.py is used
-    "characters":{
-        "pad": "_",
-        "eos": "~",
-        "bos": "^",
-        "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
-        "punctuations":"!'(),-.:;? ",
-        "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ"
-    },
-
-    // DISTRIBUTED TRAINING
-    "distributed":{
-        "backend": "nccl",
-        "url": "tcp:\/\/localhost:54321"
-    },
-
-    "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
-
-    // TRAINING
-    "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
-    "eval_batch_size":16,
-    "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
-    "gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed.
-    "loss_masking": true, // enable / disable loss masking against the sequence padding.
-    "ga_alpha": 10.0, // weight for guided attention loss. If > 0, guided attention is enabled.
-
-    // VALIDATION
-    "run_eval": true,
-    "test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time.
-    "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
-
-    // OPTIMIZER
-    "noam_schedule": false, // use noam warmup and lr schedule.
-    "grad_clip": 1.0, // upper limit for gradients for clipping.
-    "epochs": 1000, // total number of epochs to train.
-    "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
-    "wd": 0.000001, // Weight decay weight.
-    "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
-    "seq_len_norm": false, // Normalize eash sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths.
-
-    // TACOTRON PRENET
-    "memory_size": -1, // ONLY TACOTRON - size of the memory queue used fro storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame.
-    "prenet_type": "original", // "original" or "bn".
-    "prenet_dropout": true, // enable/disable dropout at prenet.
-
-    // ATTENTION
-    "attention_type": "original", // 'original' or 'graves'
-    "attention_heads": 4, // number of attention heads (only for 'graves')
-    "attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
-    "windowing": false, // Enables attention windowing. Used only in eval mode.
-    "use_forward_attn": false, // if it uses forward attention. In general, it aligns faster.
-    "forward_attn_mask": false, // Additional masking forcing monotonicity only in eval mode.
-    "transition_agent": false, // enable/disable transition agent of forward attention.
-    "location_attn": true, // enable_disable location sensitive attention. It is enabled for TACOTRON by default.
-    "bidirectional_decoder": false, // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset.
-
-    // STOPNET
-    "stopnet": true, // Train stopnet predicting the end of synthesis.
-    "separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER.
-
-    // TENSORBOARD and LOGGING
-    "print_step": 25, // Number of steps to log traning on console.
-    "save_step": 10000, // Number of training steps expected to save traninpg stats and checkpoints.
-    "checkpoint": true, // If true, it saves checkpoints per "save_step"
-    "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
-
-    // DATA LOADING
-    "text_cleaner": "phoneme_cleaners",
-    "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
-    "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values.
-    "num_val_loader_workers": 4, // number of evaluation data loader processes.
-    "batch_group_size": 0, //Number of batches to shuffle after bucketing.
-    "min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training
-    "max_seq_len": 153, // DATASET-RELATED: maximum text length
-
-    // PATHS
-    "output_path": "/data4/rw/home/Trainings/",
-
-    // PHONEMES
-    "phoneme_cache_path": "mozilla_us_phonemes_3", // phoneme computation is slow, therefore, it caches results in the given folder.
-    "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation.
-    "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
-
-    // MULTI-SPEAKER and GST
-    "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning.
-    "style_wav_for_test": null, // path to style wav file to be used in TacotronGST inference.
-    "use_gst": false, // TACOTRON ONLY: use global style tokens
-
-    // DATASETS
-    "datasets": // List of datasets. They all merged and they get different speaker_ids.
-        [
-            {
-                "name": "ljspeech",
-                "path": "/root/LJSpeech-1.1/",
-                "meta_file_train": "metadata.csv",
-                "meta_file_val": null
-            }
-        ]
-
-}
-
@@ -92,7 +92,7 @@ class MyDataset(Dataset):
         return phonemes

     def _load_or_generate_phoneme_sequence(self, wav_file, text):
-        file_name = os.path.basename(wav_file).split('.')[0]
+        file_name = os.path.splitext(os.path.basename(wav_file))[0]
         cache_path = os.path.join(self.phoneme_cache_path,
                                   file_name + '_phoneme.npy')
         try:
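(The change matters for wav files whose base name itself contains dots; a small illustration with a hypothetical path:)

```python
import os

wav_file = '/data/wavs/speaker1.chapter2.0042.wav'
os.path.basename(wav_file).split('.')[0]         # 'speaker1' - truncated at the first dot
os.path.splitext(os.path.basename(wav_file))[0]  # 'speaker1.chapter2.0042' - only the extension is dropped
```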
@@ -120,7 +120,7 @@ def mailabs(root_path, meta_files=None):
             text = cols[1].strip()
             items.append([text, wav_file, speaker_name])
         else:
-            raise RuntimeError("> File %s is not exist!"%(wav_file))
+            raise RuntimeError("> File %s does not exist!"%(wav_file))
     return items

@@ -185,7 +185,7 @@ def libri_tts(root_path, meta_files=None):
             text = cols[1]
             items.append([text, wav_file, speaker_name])
     for item in items:
-        assert os.path.exists(item[1]), f" [!] wav file is not exist - {item[1]}"
+        assert os.path.exists(item[1]), f" [!] wav files don't exist - {item[1]}"
     return items

@@ -203,5 +203,5 @@ def custom_turkish(root_path, meta_file):
             continue
         text = cols[1].strip()
         items.append([text, wav_file, speaker_name])
-    print(f" [!] {len(skipped_files)} files skipped. They are not exist...")
+    print(f" [!] {len(skipped_files)} files skipped. They don't exist...")
     return items
@@ -1,24 +0,0 @@
-# coding: utf-8
-# import torch
-# from torch import nn
-
-# class StopProjection(nn.Module):
-#     r""" Simple projection layer to predict the "stop token"
-
-#     Args:
-#         in_features (int): size of the input vector
-#         out_features (int or list): size of each output vector. aka number
-#             of predicted frames.
-#     """
-
-#     def __init__(self, in_features, out_features):
-#         super(StopProjection, self).__init__()
-#         self.linear = nn.Linear(in_features, out_features)
-#         self.dropout = nn.Dropout(0.5)
-#         self.sigmoid = nn.Sigmoid()
-
-#     def forward(self, inputs):
-#         out = self.dropout(inputs)
-#         out = self.linear(out)
-#         out = self.sigmoid(out)
-#         return out
@@ -184,7 +184,7 @@ class TacotronLoss(torch.nn.Module):

     def forward(self, postnet_output, decoder_output, mel_input, linear_input,
                 stopnet_output, stopnet_target, output_lens, decoder_b_output,
-                alignments, alignment_lens, input_lens):
+                alignments, alignment_lens, alignments_backwards, input_lens):

         return_dict = {}
         # decoder and postnet losses
@@ -226,6 +226,15 @@ class TacotronLoss(torch.nn.Module):
             return_dict['decoder_b_loss'] = decoder_b_loss
             return_dict['decoder_c_loss'] = decoder_c_loss

+        # double decoder consistency loss (if enabled)
+        if self.config.double_decoder_consistency:
+            decoder_b_loss = self.criterion(decoder_b_output, mel_input, output_lens)
+            # decoder_c_loss = torch.nn.functional.l1_loss(decoder_b_output, decoder_output)
+            attention_c_loss = torch.nn.functional.l1_loss(alignments, alignments_backwards)
+            loss += decoder_b_loss + attention_c_loss
+            return_dict['decoder_coarse_loss'] = decoder_b_loss
+            return_dict['decoder_ddc_loss'] = attention_c_loss
+
         # guided attention loss (if enabled)
         if self.config.ga_alpha > 0:
             ga_loss = self.criterion_ga(alignments, input_lens, alignment_lens)
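(In short, when ```double_decoder_consistency``` is enabled the total objective gains two terms, using the names in the hunk above: the regular spectrogram criterion applied to the coarse decoder output, ```self.criterion(decoder_b_output, mel_input, output_lens)```, plus an L1 penalty pulling the fine decoder's alignments toward the coarse ones, ```l1_loss(alignments, alignments_backwards)```. Both terms are also reported separately as ```decoder_coarse_loss``` and ```decoder_ddc_loss```.)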
@@ -1,23 +1,21 @@
 # coding: utf-8
 import torch
-import copy
 from torch import nn
-from TTS.layers.tacotron import Encoder, Decoder, PostCBHG
-from TTS.utils.generic_utils import sequence_mask
 from TTS.layers.gst_layers import GST
+from TTS.layers.tacotron import Decoder, Encoder, PostCBHG
+from TTS.models.tacotron_abstract import TacotronAbstract


-class Tacotron(nn.Module):
+class Tacotron(TacotronAbstract):
     def __init__(self,
                  num_chars,
                  num_speakers,
                  r=5,
                  postnet_output_dim=1025,
                  decoder_output_dim=80,
-                 memory_size=5,
                  attn_type='original',
                  attn_win=False,
-                 gst=False,
                  attn_norm="sigmoid",
                  prenet_type="original",
                  prenet_dropout=True,
@@ -27,38 +25,41 @@ class Tacotron(nn.Module):
                  location_attn=True,
                  attn_K=5,
                  separate_stopnet=True,
-                 bidirectional_decoder=False):
-        super(Tacotron, self).__init__()
-        self.r = r
-        self.decoder_output_dim = decoder_output_dim
-        self.postnet_output_dim = postnet_output_dim
-        self.gst = gst
-        self.num_speakers = num_speakers
-        self.bidirectional_decoder = bidirectional_decoder
-        decoder_dim = 512 if num_speakers > 1 else 256
-        encoder_dim = 512 if num_speakers > 1 else 256
-        proj_speaker_dim = 80 if num_speakers > 1 else 0
-        # embedding layer
-        self.embedding = nn.Embedding(num_chars, 256, padding_idx=0)
-        self.embedding.weight.data.normal_(0, 0.3)
-        # boilerplate model
-        self.encoder = Encoder(encoder_dim)
-        self.decoder = Decoder(decoder_dim, decoder_output_dim, r, memory_size, attn_type, attn_win,
+                 bidirectional_decoder=False,
+                 double_decoder_consistency=False,
+                 ddc_r=None,
+                 gst=False,
+                 memory_size=5):
+        super(Tacotron,
+              self).__init__(num_chars, num_speakers, r, postnet_output_dim,
+                             decoder_output_dim, attn_type, attn_win,
                              attn_norm, prenet_type, prenet_dropout,
                              forward_attn, trans_agent, forward_attn_mask,
                              location_attn, attn_K, separate_stopnet,
-                             proj_speaker_dim)
-        if self.bidirectional_decoder:
-            self.decoder_backward = copy.deepcopy(self.decoder)
+                             bidirectional_decoder, double_decoder_consistency,
+                             ddc_r, gst)
+        decoder_in_features = 512 if num_speakers > 1 else 256
+        encoder_in_features = 512 if num_speakers > 1 else 256
+        speaker_embedding_dim = 256
+        proj_speaker_dim = 80 if num_speakers > 1 else 0
+        # base model layers
+        self.embedding = nn.Embedding(num_chars, 256, padding_idx=0)
+        self.embedding.weight.data.normal_(0, 0.3)
+        self.encoder = Encoder(encoder_in_features)
+        self.decoder = Decoder(decoder_in_features, decoder_output_dim, r,
+                               memory_size, attn_type, attn_win, attn_norm,
+                               prenet_type, prenet_dropout, forward_attn,
+                               trans_agent, forward_attn_mask, location_attn,
+                               attn_K, separate_stopnet, proj_speaker_dim)
         self.postnet = PostCBHG(decoder_output_dim)
         self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2,
                                      postnet_output_dim)
         # speaker embedding layers
         if num_speakers > 1:
-            self.speaker_embedding = nn.Embedding(num_speakers, 256)
+            self.speaker_embedding = nn.Embedding(num_speakers, speaker_embedding_dim)
             self.speaker_embedding.weight.data.normal_(0, 0.3)
             self.speaker_project_mel = nn.Sequential(
-                nn.Linear(256, proj_speaker_dim), nn.Tanh())
+                nn.Linear(speaker_embedding_dim, proj_speaker_dim), nn.Tanh())
             self.speaker_embeddings = None
             self.speaker_embeddings_projected = None
         # global style token layers
@@ -68,28 +69,19 @@ class Tacotron(nn.Module):
                 num_heads=4,
                 num_style_tokens=10,
                 embedding_dim=gst_embedding_dim)
+        # backward pass decoder
+        if self.bidirectional_decoder:
+            self._init_backward_decoder()
+        # setup DDC
+        if self.double_decoder_consistency:
+            self.coarse_decoder = Decoder(
+                decoder_in_features, decoder_output_dim, ddc_r, memory_size,
+                attn_type, attn_win, attn_norm, prenet_type, prenet_dropout,
+                forward_attn, trans_agent, forward_attn_mask, location_attn,
+                attn_K, separate_stopnet, proj_speaker_dim)

-    def _init_states(self):
-        self.speaker_embeddings = None
-        self.speaker_embeddings_projected = None
-
-    def compute_speaker_embedding(self, speaker_ids):
-        if hasattr(self, "speaker_embedding") and speaker_ids is None:
-            raise RuntimeError(
-                " [!] Model has speaker embedding layer but speaker_id is not provided"
-            )
-        if hasattr(self, "speaker_embedding") and speaker_ids is not None:
-            self.speaker_embeddings = self._compute_speaker_embedding(
-                speaker_ids)
-            self.speaker_embeddings_projected = self.speaker_project_mel(
-                self.speaker_embeddings).squeeze(1)
-
-    def compute_gst(self, inputs, mel_specs):
-        gst_outputs = self.gst_layer(mel_specs)
-        inputs = self._add_speaker_embedding(inputs, gst_outputs)
-        return inputs
-
-    def forward(self, characters, text_lengths, mel_specs, speaker_ids=None):
+    def forward(self, characters, text_lengths, mel_specs, mel_lengths=None, speaker_ids=None):
         """
         Shapes:
             - characters: B x T_in
@@ -98,37 +90,50 @@ class Tacotron(nn.Module):
            - speaker_ids: B x 1
        """
        self._init_states()
-       mask = sequence_mask(text_lengths).to(characters.device)
+       input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths)
        # B x T_in x embed_dim
        inputs = self.embedding(characters)
        # B x speaker_embed_dim
+       if speaker_ids is not None:
            self.compute_speaker_embedding(speaker_ids)
        if self.num_speakers > 1:
            # B x T_in x embed_dim + speaker_embed_dim
            inputs = self._concat_speaker_embedding(inputs,
                                                    self.speaker_embeddings)
-       # B x T_in x encoder_dim
+       # B x T_in x encoder_in_features
        encoder_outputs = self.encoder(inputs)
+       # sequence masking
+       encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs)
+       # global style token
        if self.gst:
            # B x gst_dim
            encoder_outputs = self.compute_gst(encoder_outputs, mel_specs)
        if self.num_speakers > 1:
            encoder_outputs = self._concat_speaker_embedding(
                encoder_outputs, self.speaker_embeddings)
-       # decoder_outputs: B x decoder_dim x T_out
-       # alignments: B x T_in x encoder_dim
+       # decoder_outputs: B x decoder_in_features x T_out
+       # alignments: B x T_in x encoder_in_features
        # stop_tokens: B x T_in
        decoder_outputs, alignments, stop_tokens = self.decoder(
-           encoder_outputs, mel_specs, mask,
+           encoder_outputs, mel_specs, input_mask,
            self.speaker_embeddings_projected)
-       # B x T_out x decoder_dim
+       # sequence masking
+       if output_mask is not None:
+           decoder_outputs = decoder_outputs * output_mask.unsqueeze(1).expand_as(decoder_outputs)
+       # B x T_out x decoder_in_features
        postnet_outputs = self.postnet(decoder_outputs)
+       # sequence masking
+       if output_mask is not None:
+           postnet_outputs = postnet_outputs * output_mask.unsqueeze(2).expand_as(postnet_outputs)
        # B x T_out x posnet_dim
        postnet_outputs = self.last_linear(postnet_outputs)
-       # B x T_out x decoder_dim
+       # B x T_out x decoder_in_features
        decoder_outputs = decoder_outputs.transpose(1, 2).contiguous()
        if self.bidirectional_decoder:
-           decoder_outputs_backward, alignments_backward = self._backward_inference(mel_specs, encoder_outputs, mask)
+           decoder_outputs_backward, alignments_backward = self._backward_pass(mel_specs, encoder_outputs, input_mask)
+           return decoder_outputs, postnet_outputs, alignments, stop_tokens, decoder_outputs_backward, alignments_backward
+       if self.double_decoder_consistency:
+           decoder_outputs_backward, alignments_backward = self._coarse_decoder_pass(mel_specs, encoder_outputs, alignments, input_mask)
            return decoder_outputs, postnet_outputs, alignments, stop_tokens, decoder_outputs_backward, alignments_backward
        return decoder_outputs, postnet_outputs, alignments, stop_tokens

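(The masking calls above rely on boolean masks built from the per-example lengths. Below is a minimal sketch of such a length mask; the actual helper lives in ```TacotronAbstract.compute_masks``` / ```sequence_mask```, whose exact signatures are not shown in this diff.)

```python
import torch

def sequence_mask(lengths, max_len=None):
    # lengths: (B,) integer tensor of valid lengths; returns a (B, max_len) boolean mask
    max_len = max_len or int(lengths.max())
    steps = torch.arange(max_len, device=lengths.device)
    return steps.unsqueeze(0) < lengths.unsqueeze(1)

# e.g. zero out padded encoder frames:
# encoder_outputs = encoder_outputs * sequence_mask(text_lengths).unsqueeze(2).expand_as(encoder_outputs)
```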
@@ -136,6 +141,7 @@ class Tacotron(nn.Module):
    def inference(self, characters, speaker_ids=None, style_mel=None):
        inputs = self.embedding(characters)
        self._init_states()
+       if speaker_ids is not None:
            self.compute_speaker_embedding(speaker_ids)
        if self.num_speakers > 1:
            inputs = self._concat_speaker_embedding(inputs,
@@ -152,28 +158,3 @@ class Tacotron(nn.Module):
        postnet_outputs = self.last_linear(postnet_outputs)
        decoder_outputs = decoder_outputs.transpose(1, 2)
        return decoder_outputs, postnet_outputs, alignments, stop_tokens
-
-    def _backward_inference(self, mel_specs, encoder_outputs, mask):
-        decoder_outputs_b, alignments_b, _ = self.decoder_backward(
-            encoder_outputs, torch.flip(mel_specs, dims=(1,)), mask,
-            self.speaker_embeddings_projected)
-        decoder_outputs_b = decoder_outputs_b.transpose(1, 2).contiguous()
-        return decoder_outputs_b, alignments_b
-
-    def _compute_speaker_embedding(self, speaker_ids):
-        speaker_embeddings = self.speaker_embedding(speaker_ids)
-        return speaker_embeddings.unsqueeze_(1)
-
-    @staticmethod
-    def _add_speaker_embedding(outputs, speaker_embeddings):
-        speaker_embeddings_ = speaker_embeddings.expand(
-            outputs.size(0), outputs.size(1), -1)
-        outputs = outputs + speaker_embeddings_
-        return outputs
-
-    @staticmethod
-    def _concat_speaker_embedding(outputs, speaker_embeddings):
-        speaker_embeddings_ = speaker_embeddings.expand(
-            outputs.size(0), outputs.size(1), -1)
-        outputs = torch.cat([outputs, speaker_embeddings_], dim=-1)
-        return outputs
@@ -1,13 +1,15 @@
-import copy
-import torch
 from math import sqrt

+import torch
 from torch import nn
-from TTS.layers.tacotron2 import Encoder, Decoder, Postnet
-from TTS.utils.generic_utils import sequence_mask
+from TTS.layers.gst_layers import GST
+from TTS.layers.tacotron2 import Decoder, Encoder, Postnet
+from TTS.models.tacotron_abstract import TacotronAbstract


 # TODO: match function arguments with tacotron
-class Tacotron2(nn.Module):
+class Tacotron2(TacotronAbstract):
     def __init__(self,
                  num_chars,
                  num_speakers,
@@ -25,37 +27,48 @@ class Tacotron2(nn.Module):
                  location_attn=True,
                  attn_K=5,
                  separate_stopnet=True,
-                 bidirectional_decoder=False):
-        super(Tacotron2, self).__init__()
-        self.postnet_output_dim = postnet_output_dim
-        self.decoder_output_dim = decoder_output_dim
-        self.r = r
-        self.bidirectional_decoder = bidirectional_decoder
-        decoder_dim = 512 if num_speakers > 1 else 512
-        encoder_dim = 512 if num_speakers > 1 else 512
+                 bidirectional_decoder=False,
+                 double_decoder_consistency=False,
+                 ddc_r=None,
+                 gst=False):
+        super(Tacotron2,
+              self).__init__(num_chars, num_speakers, r, postnet_output_dim,
+                             decoder_output_dim, attn_type, attn_win,
+                             attn_norm, prenet_type, prenet_dropout,
+                             forward_attn, trans_agent, forward_attn_mask,
+                             location_attn, attn_K, separate_stopnet,
+                             bidirectional_decoder, double_decoder_consistency,
+                             ddc_r, gst)
+        decoder_in_features = 512 if num_speakers > 1 else 512
+        encoder_in_features = 512 if num_speakers > 1 else 512
         proj_speaker_dim = 80 if num_speakers > 1 else 0
-        # embedding layer
+        # base layers
         self.embedding = nn.Embedding(num_chars, 512, padding_idx=0)
-        std = sqrt(2.0 / (num_chars + 512))
-        val = sqrt(3.0) * std  # uniform bounds for std
-        self.embedding.weight.data.uniform_(-val, val)
         if num_speakers > 1:
             self.speaker_embedding = nn.Embedding(num_speakers, 512)
             self.speaker_embedding.weight.data.normal_(0, 0.3)
-            self.speaker_embeddings = None
-            self.speaker_embeddings_projected = None
-        self.encoder = Encoder(encoder_dim)
-        self.decoder = Decoder(decoder_dim, self.decoder_output_dim, r, attn_type, attn_win,
+        self.encoder = Encoder(encoder_in_features)
+        self.decoder = Decoder(decoder_in_features, self.decoder_output_dim, r, attn_type, attn_win,
                                attn_norm, prenet_type, prenet_dropout,
                                forward_attn, trans_agent, forward_attn_mask,
                                location_attn, attn_K, separate_stopnet, proj_speaker_dim)
-        if self.bidirectional_decoder:
-            self.decoder_backward = copy.deepcopy(self.decoder)
         self.postnet = Postnet(self.postnet_output_dim)
-
-    def _init_states(self):
-        self.speaker_embeddings = None
-        self.speaker_embeddings_projected = None
+        # global style token layers
+        if self.gst:
+            gst_embedding_dim = encoder_in_features
+            self.gst_layer = GST(num_mel=80,
+                                 num_heads=4,
+                                 num_style_tokens=10,
+                                 embedding_dim=gst_embedding_dim)
+        # backward pass decoder
+        if self.bidirectional_decoder:
+            self._init_backward_decoder()
+        # setup DDC
+        if self.double_decoder_consistency:
+            self.coarse_decoder = Decoder(decoder_in_features, self.decoder_output_dim, ddc_r, attn_type, attn_win,
+                                          attn_norm, prenet_type, prenet_dropout,
+                                          forward_attn, trans_agent, forward_attn_mask,
+                                          location_attn, attn_K, separate_stopnet, proj_speaker_dim)

     @staticmethod
     def shape_outputs(mel_outputs, mel_outputs_postnet, alignments):
@ -63,22 +76,49 @@ class Tacotron2(nn.Module):
|
||||||
mel_outputs_postnet = mel_outputs_postnet.transpose(1, 2)
|
mel_outputs_postnet = mel_outputs_postnet.transpose(1, 2)
|
||||||
return mel_outputs, mel_outputs_postnet, alignments
|
return mel_outputs, mel_outputs_postnet, alignments
|
||||||
|
|
||||||
-    def forward(self, text, text_lengths, mel_specs=None, speaker_ids=None):
+    def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, speaker_ids=None):
         self._init_states()
         # compute mask for padding
-        mask = sequence_mask(text_lengths).to(text.device)
+        # B x T_in_max (boolean)
+        input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths)
+        # B x D_embed x T_in_max
         embedded_inputs = self.embedding(text).transpose(1, 2)
+        # B x T_in_max x D_en
         encoder_outputs = self.encoder(embedded_inputs, text_lengths)
+        # adding speaker embeddding to encoder output
+        # TODO: multi-speaker
+        # B x speaker_embed_dim
+        if speaker_ids is not None:
+            self.compute_speaker_embedding(speaker_ids)
+        if self.num_speakers > 1:
+            # B x T_in x embed_dim + speaker_embed_dim
             encoder_outputs = self._add_speaker_embedding(encoder_outputs,
-                                                          speaker_ids)
+                                                          self.speaker_embeddings)
+        encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs)
+        # global style token
+        if self.gst:
+            # B x gst_dim
+            encoder_outputs = self.compute_gst(encoder_outputs, mel_specs)
+        # B x mel_dim x T_out -- B x T_out//r x T_in -- B x T_out//r
         decoder_outputs, alignments, stop_tokens = self.decoder(
-            encoder_outputs, mel_specs, mask)
+            encoder_outputs, mel_specs, input_mask)
+        # sequence masking
+        if mel_lengths is not None:
+            decoder_outputs = decoder_outputs * output_mask.unsqueeze(1).expand_as(decoder_outputs)
+        # B x mel_dim x T_out
         postnet_outputs = self.postnet(decoder_outputs)
         postnet_outputs = decoder_outputs + postnet_outputs
+        # sequence masking
+        if output_mask is not None:
+            postnet_outputs = postnet_outputs * output_mask.unsqueeze(1).expand_as(postnet_outputs)
+        # B x T_out x mel_dim -- B x T_out x mel_dim -- B x T_out//r x T_in
         decoder_outputs, postnet_outputs, alignments = self.shape_outputs(
             decoder_outputs, postnet_outputs, alignments)
         if self.bidirectional_decoder:
-            decoder_outputs_backward, alignments_backward = self._backward_inference(mel_specs, encoder_outputs, mask)
+            decoder_outputs_backward, alignments_backward = self._backward_pass(mel_specs, encoder_outputs, input_mask)
+            return decoder_outputs, postnet_outputs, alignments, stop_tokens, decoder_outputs_backward, alignments_backward
+        if self.double_decoder_consistency:
+            decoder_outputs_backward, alignments_backward = self._coarse_decoder_pass(mel_specs, encoder_outputs, alignments, input_mask)
             return decoder_outputs, postnet_outputs, alignments, stop_tokens, decoder_outputs_backward, alignments_backward
         return decoder_outputs, postnet_outputs, alignments, stop_tokens
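Note: the refactored forward pass masks both the encoder and decoder outputs against their padded lengths. A minimal standalone sketch of the output-masking step added above, with made-up shapes and no project helpers assumed:

import torch

# hypothetical shapes: batch of 2, 80 mel bins, 6 decoder frames
decoder_outputs = torch.randn(2, 80, 6)      # B x mel_dim x T_out
mel_lengths = torch.tensor([6, 4])           # true frame count per sample

# boolean mask: True for valid frames, False for padding (B x T_out)
output_mask = torch.arange(6).unsqueeze(0) < mel_lengths.unsqueeze(1)

# broadcast the mask over the mel dimension, zeroing padded frames
decoder_outputs = decoder_outputs * output_mask.unsqueeze(1).expand_as(decoder_outputs)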
@@ -86,15 +126,18 @@ class Tacotron2(nn.Module):

     def inference(self, text, speaker_ids=None):
         embedded_inputs = self.embedding(text).transpose(1, 2)
         encoder_outputs = self.encoder.inference(embedded_inputs)
+        if speaker_ids is not None:
+            self.compute_speaker_embedding(speaker_ids)
+        if self.num_speakers > 1:
             encoder_outputs = self._add_speaker_embedding(encoder_outputs,
-                                                          speaker_ids)
+                                                          self.speaker_embeddings)
-        mel_outputs, alignments, stop_tokens = self.decoder.inference(
+        decoder_outputs, alignments, stop_tokens = self.decoder.inference(
             encoder_outputs)
-        mel_outputs_postnet = self.postnet(mel_outputs)
-        mel_outputs_postnet = mel_outputs + mel_outputs_postnet
-        mel_outputs, mel_outputs_postnet, alignments = self.shape_outputs(
-            mel_outputs, mel_outputs_postnet, alignments)
-        return mel_outputs, mel_outputs_postnet, alignments, stop_tokens
+        postnet_outputs = self.postnet(decoder_outputs)
+        postnet_outputs = decoder_outputs + postnet_outputs
+        decoder_outputs, postnet_outputs, alignments = self.shape_outputs(
+            decoder_outputs, postnet_outputs, alignments)
+        return decoder_outputs, postnet_outputs, alignments, stop_tokens

     def inference_truncated(self, text, speaker_ids=None):
         """

@@ -112,22 +155,16 @@ class Tacotron2(nn.Module):
             mel_outputs, mel_outputs_postnet, alignments)
         return mel_outputs, mel_outputs_postnet, alignments, stop_tokens

-    def _backward_inference(self, mel_specs, encoder_outputs, mask):
-        decoder_outputs_b, alignments_b, _ = self.decoder_backward(
-            encoder_outputs, torch.flip(mel_specs, dims=(1,)), mask,
-            self.speaker_embeddings_projected)
-        decoder_outputs_b = decoder_outputs_b.transpose(1, 2)
-        return decoder_outputs_b, alignments_b
-
-    def _add_speaker_embedding(self, encoder_outputs, speaker_ids):
-        if hasattr(self, "speaker_embedding") and speaker_ids is None:
-            raise RuntimeError(" [!] Model has speaker embedding layer but speaker_id is not provided")
-        if hasattr(self, "speaker_embedding") and speaker_ids is not None:
-            speaker_embeddings = self.speaker_embedding(speaker_ids)
-            speaker_embeddings.unsqueeze_(1)
-            speaker_embeddings = speaker_embeddings.expand(encoder_outputs.size(0),
-                                                           encoder_outputs.size(1),
-                                                           -1)
-            encoder_outputs = encoder_outputs + speaker_embeddings
-        return encoder_outputs
+    def _speaker_embedding_pass(self, encoder_outputs, speaker_ids):
+        # TODO: multi-speaker
+        # if hasattr(self, "speaker_embedding") and speaker_ids is None:
+        #     raise RuntimeError(" [!] Model has speaker embedding layer but speaker_id is not provided")
+        # if hasattr(self, "speaker_embedding") and speaker_ids is not None:
+
+        #     speaker_embeddings = speaker_embeddings.expand(encoder_outputs.size(0),
+        #                                                    encoder_outputs.size(1),
+        #                                                    -1)
+        #     encoder_outputs = encoder_outputs + speaker_embeddings
+        #     return encoder_outputs
+        pass

@@ -0,0 +1,180 @@
+import copy
+from abc import ABC, abstractmethod
+
+import torch
+from torch import nn
+
+from TTS.utils.generic_utils import sequence_mask
+
+
+class TacotronAbstract(ABC, nn.Module):
+    def __init__(self,
+                 num_chars,
+                 num_speakers,
+                 r,
+                 postnet_output_dim=80,
+                 decoder_output_dim=80,
+                 attn_type='original',
+                 attn_win=False,
+                 attn_norm="softmax",
+                 prenet_type="original",
+                 prenet_dropout=True,
+                 forward_attn=False,
+                 trans_agent=False,
+                 forward_attn_mask=False,
+                 location_attn=True,
+                 attn_K=5,
+                 separate_stopnet=True,
+                 bidirectional_decoder=False,
+                 double_decoder_consistency=False,
+                 ddc_r=None,
+                 gst=False):
+        """ Abstract Tacotron class """
+        super().__init__()
+        self.num_chars = num_chars
+        self.r = r
+        self.decoder_output_dim = decoder_output_dim
+        self.postnet_output_dim = postnet_output_dim
+        self.gst = gst
+        self.num_speakers = num_speakers
+        self.bidirectional_decoder = bidirectional_decoder
+        self.double_decoder_consistency = double_decoder_consistency
+        self.ddc_r = ddc_r
+        self.attn_type = attn_type
+        self.attn_win = attn_win
+        self.attn_norm = attn_norm
+        self.prenet_type = prenet_type
+        self.prenet_dropout = prenet_dropout
+        self.forward_attn = forward_attn
+        self.trans_agent = trans_agent
+        self.forward_attn_mask = forward_attn_mask
+        self.location_attn = location_attn
+        self.attn_K = attn_K
+        self.separate_stopnet = separate_stopnet
+
+        # layers
+        self.embedding = None
+        self.encoder = None
+        self.decoder = None
+        self.postnet = None
+
+        # global style token
+        if self.gst:
+            self.gst_layer = None
+
+        # model states
+        self.speaker_embeddings = None
+        self.speaker_embeddings_projected = None
+
+        # additional layers
+        self.decoder_backward = None
+        self.coarse_decoder = None
+
+    #############################
+    # INIT FUNCTIONS
+    #############################
+
+    def _init_states(self):
+        self.speaker_embeddings = None
+        self.speaker_embeddings_projected = None
+
+    def _init_backward_decoder(self):
+        self.decoder_backward = copy.deepcopy(self.decoder)
+
+    def _init_coarse_decoder(self):
+        self.coarse_decoder = copy.deepcopy(self.decoder)
+        self.coarse_decoder.r_init = self.ddc_r
+        self.coarse_decoder.set_r(self.ddc_r)
+
+    #############################
+    # CORE FUNCTIONS
+    #############################
+
+    @abstractmethod
+    def forward(self):
+        pass
+
+    @abstractmethod
+    def inference(self):
+        pass
+
+    #############################
+    # COMMON COMPUTE FUNCTIONS
+    #############################
+
+    def compute_masks(self, text_lengths, mel_lengths):
+        """Compute masks against sequence paddings."""
+        # B x T_in_max (boolean)
+        device = text_lengths.device
+        input_mask = sequence_mask(text_lengths).to(device)
+        output_mask = None
+        if mel_lengths is not None:
+            max_len = mel_lengths.max()
+            r = self.decoder.r
+            max_len = max_len + (r - (max_len % r)) if max_len % r > 0 else max_len
+            output_mask = sequence_mask(mel_lengths, max_len=max_len).to(device)
+        return input_mask, output_mask
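Note: compute_masks pads the mel mask length up to the next multiple of the decoder's reduction factor r, so the mask lines up with the decoder's grouped output frames. A minimal standalone sketch of the same idea, using a local sequence_mask helper rather than the project's own:

import torch

def sequence_mask(lengths, max_len=None):
    # True for valid positions, False for padding
    max_len = max_len or int(lengths.max())
    return torch.arange(max_len, device=lengths.device).unsqueeze(0) < lengths.unsqueeze(1)

mel_lengths = torch.tensor([95, 120])
r = 7                                   # decoder reduction factor
max_len = int(mel_lengths.max())        # 120
if max_len % r > 0:
    max_len += r - (max_len % r)        # round up to a multiple of r -> 126
output_mask = sequence_mask(mel_lengths, max_len=max_len)
print(output_mask.shape)                # torch.Size([2, 126])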
+    def _backward_pass(self, mel_specs, encoder_outputs, mask):
+        """ Run backwards decoder """
+        decoder_outputs_b, alignments_b, _ = self.decoder_backward(
+            encoder_outputs, torch.flip(mel_specs, dims=(1,)), mask,
+            self.speaker_embeddings_projected)
+        decoder_outputs_b = decoder_outputs_b.transpose(1, 2).contiguous()
+        return decoder_outputs_b, alignments_b
+
+    def _coarse_decoder_pass(self, mel_specs, encoder_outputs, alignments,
+                             input_mask):
+        """ Double Decoder Consistency """
+        T = mel_specs.shape[1]
+        if T % self.coarse_decoder.r > 0:
+            padding_size = self.coarse_decoder.r - (T % self.coarse_decoder.r)
+            mel_specs = torch.nn.functional.pad(mel_specs,
+                                                (0, 0, 0, padding_size, 0, 0))
+        decoder_outputs_backward, alignments_backward, _ = self.coarse_decoder(
+            encoder_outputs.detach(), mel_specs, input_mask)
+        # scale_factor = self.decoder.r_init / self.decoder.r
+        alignments_backward = torch.nn.functional.interpolate(
+            alignments_backward.transpose(1, 2),
+            size=alignments.shape[1],
+            mode='nearest').transpose(1, 2)
+        decoder_outputs_backward = decoder_outputs_backward.transpose(1, 2)
+        decoder_outputs_backward = decoder_outputs_backward[:, :T, :]
+        return decoder_outputs_backward, alignments_backward
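Note: the coarse decoder runs with a larger reduction factor, so its alignment matrix has fewer decoder steps than the fine decoder's; interpolate stretches it back to the same number of steps before the consistency loss compares the two. A standalone sketch of that resizing, with made-up shapes:

import torch
import torch.nn.functional as F

coarse_align = torch.rand(1, 18, 50)   # B x T_coarse x T_in (e.g. coarse r=7)
fine_steps = 63                        # decoder steps of the fine decoder (e.g. r=2)

# interpolate over the decoder-step axis so both alignments have the same length
stretched = F.interpolate(coarse_align.transpose(1, 2),    # B x T_in x T_coarse
                          size=fine_steps,
                          mode='nearest').transpose(1, 2)  # B x fine_steps x T_in
print(stretched.shape)  # torch.Size([1, 63, 50])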
+    #############################
+    # EMBEDDING FUNCTIONS
+    #############################
+
+    def compute_speaker_embedding(self, speaker_ids):
+        """ Compute speaker embedding vectors """
+        if hasattr(self, "speaker_embedding") and speaker_ids is None:
+            raise RuntimeError(
+                " [!] Model has speaker embedding layer but speaker_id is not provided"
+            )
+        if hasattr(self, "speaker_embedding") and speaker_ids is not None:
+            self.speaker_embeddings = self.speaker_embedding(speaker_ids).unsqueeze(1)
+        if hasattr(self, "speaker_project_mel") and speaker_ids is not None:
+            self.speaker_embeddings_projected = self.speaker_project_mel(
+                self.speaker_embeddings).squeeze(1)
+
+    def compute_gst(self, inputs, mel_specs):
+        """ Compute global style token """
+        # pylint: disable=not-callable
+        gst_outputs = self.gst_layer(mel_specs)
+        inputs = self._add_speaker_embedding(inputs, gst_outputs)
+        return inputs
+
+    @staticmethod
+    def _add_speaker_embedding(outputs, speaker_embeddings):
+        speaker_embeddings_ = speaker_embeddings.expand(
+            outputs.size(0), outputs.size(1), -1)
+        outputs = outputs + speaker_embeddings_
+        return outputs
+
+    @staticmethod
+    def _concat_speaker_embedding(outputs, speaker_embeddings):
+        speaker_embeddings_ = speaker_embeddings.expand(
+            outputs.size(0), outputs.size(1), -1)
+        outputs = torch.cat([outputs, speaker_embeddings_], dim=-1)
+        return outputs
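Note: both helpers expand a per-utterance embedding of shape B x 1 x D across the time axis of the encoder outputs; the first adds it element-wise (keeping the channel size), the second concatenates it (growing the channel size). A small standalone illustration with made-up dimensions:

import torch

encoder_outputs = torch.randn(2, 50, 256)   # B x T_in x D_en
speaker_emb = torch.randn(2, 1, 256)        # B x 1 x speaker_embed_dim

expanded = speaker_emb.expand(encoder_outputs.size(0), encoder_outputs.size(1), -1)

added = encoder_outputs + expanded                             # B x 50 x 256
concatenated = torch.cat([encoder_outputs, expanded], dim=-1)  # B x 50 x 512
print(added.shape, concatenated.shape)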
@@ -1,5 +1,5 @@
 numpy>=1.16.0
-torch>=0.4.1
+torch>=1.5
 librosa>=0.5.1
 Unidecode>=0.4.20
 tensorboard
@@ -1,4 +1,5 @@
 numpy>=1.16.0
+numba==0.48
 torch>=0.4.1
 tensorflow>=2.2
 librosa>=0.5.1
@@ -13,3 +14,4 @@ tqdm
 soundfile
 phonemizer
 bokeh==1.4.0
+nose
@@ -21,6 +21,8 @@ def create_argparser():
     parser.add_argument('--pwgan_lib_path', type=str, default=None, help='path to ParallelWaveGAN project folder to be imported. If this is not passed, model uses Griffin-Lim for synthesis.')
     parser.add_argument('--pwgan_file', type=str, default=None, help='path to ParallelWaveGAN checkpoint file.')
    parser.add_argument('--pwgan_config', type=str, default=None, help='path to ParallelWaveGAN config file.')
+    parser.add_argument('--vocoder_config', type=str, default=None, help='path to TTS.vocoder config file.')
+    parser.add_argument('--vocoder_checkpoint', type=str, default=None, help='path to TTS.vocoder checkpoint file.')
     parser.add_argument('--port', type=int, default=5002, help='port to listen on.')
     parser.add_argument('--use_cuda', type=convert_boolean, default=False, help='true to use CUDA.')
     parser.add_argument('--debug', type=convert_boolean, default=False, help='true to enable Flask debug mode.')
@@ -35,6 +37,11 @@ embedded_tts_folder = os.path.join(embedded_models_folder, 'tts')
 tts_checkpoint_file = os.path.join(embedded_tts_folder, 'checkpoint.pth.tar')
 tts_config_file = os.path.join(embedded_tts_folder, 'config.json')

+embedded_vocoder_folder = os.path.join(embedded_models_folder, 'vocoder')
+vocoder_checkpoint_file = os.path.join(embedded_vocoder_folder, 'checkpoint.pth.tar')
+vocoder_config_file = os.path.join(embedded_vocoder_folder, 'config.json')
+
+# These models are soon to be deprecated
 embedded_wavernn_folder = os.path.join(embedded_models_folder, 'wavernn')
 wavernn_checkpoint_file = os.path.join(embedded_wavernn_folder, 'checkpoint.pth.tar')
 wavernn_config_file = os.path.join(embedded_wavernn_folder, 'config.json')
@@ -50,6 +57,11 @@ if not args.tts_checkpoint and os.path.isfile(tts_checkpoint_file):
     args.tts_checkpoint = tts_checkpoint_file
 if not args.tts_config and os.path.isfile(tts_config_file):
     args.tts_config = tts_config_file
+if not args.vocoder_checkpoint and os.path.isfile(tts_checkpoint_file):
+    args.tts_checkpoint = tts_checkpoint_file
+if not args.vocoder_config and os.path.isfile(tts_config_file):
+    args.tts_config = tts_config_file
+
 if not args.wavernn_file and os.path.isfile(wavernn_checkpoint_file):
     args.wavernn_file = wavernn_checkpoint_file
 if not args.wavernn_config and os.path.isfile(wavernn_config_file):
@@ -76,5 +88,9 @@ def tts():
     return send_file(data, mimetype='audio/wav')


-if __name__ == '__main__':
+def main():
     app.run(debug=args.debug, host='0.0.0.0', port=args.port)
+
+
+if __name__ == '__main__':
+    main()
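Note: with main() factored out, the Flask app can be started from the new console script as well as directly. As a rough, hypothetical client sketch: it assumes the server is already running on the default port 5002 and that the synthesis endpoint takes a text query parameter; the actual route is not part of this hunk, so check server.py before relying on it.

import urllib.parse
import urllib.request

text = "Hello world."
# hypothetical endpoint path; verify against the routes defined in server.py
url = "http://localhost:5002/api/tts?" + urllib.parse.urlencode({"text": text})

with urllib.request.urlopen(url) as response:
    with open("tts_output.wav", "wb") as f:
        f.write(response.read())   # the endpoint returns audio/wav bytes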
@@ -1,6 +1,7 @@
 import io
 import re
 import sys
+import time

 import numpy as np
 import torch
@@ -10,6 +11,7 @@ from TTS.utils.audio import AudioProcessor
 from TTS.utils.io import load_config
 from TTS.utils.generic_utils import setup_model
 from TTS.utils.speakers import load_speaker_mapping
+from TTS.vocoder.utils.generic_utils import setup_generator
 # pylint: disable=unused-wildcard-import
 # pylint: disable=wildcard-import
 from TTS.utils.synthesis import *
@@ -34,10 +36,12 @@ class Synthesizer(object):
             assert torch.cuda.is_available(), "CUDA is not availabe on this machine."
         self.load_tts(self.config.tts_checkpoint, self.config.tts_config,
                       self.config.use_cuda)
+        if self.config.vocoder_checkpoint:
+            self.load_vocoder(self.config.vocoder_checkpoint, self.config.vocoder_config, self.config.use_cuda)
         if self.config.wavernn_lib_path:
             self.load_wavernn(self.config.wavernn_lib_path, self.config.wavernn_file,
                               self.config.wavernn_config, self.config.use_cuda)
-        if self.config.pwgan_lib_path:
+        if self.config.pwgan_file:
             self.load_pwgan(self.config.pwgan_lib_path, self.config.pwgan_file,
                             self.config.pwgan_config, self.config.use_cuda)

@@ -77,6 +81,19 @@ class Synthesizer(object):
             self.tts_model.decoder.max_decoder_steps = 3000
         if 'r' in cp:
             self.tts_model.decoder.set_r(cp['r'])
+            print(f" > model reduction factor: {cp['r']}")
+
+    def load_vocoder(self, model_file, model_config, use_cuda):
+        self.vocoder_config = load_config(model_config)
+        self.vocoder_model = setup_generator(self.vocoder_config)
+        self.vocoder_model.load_state_dict(torch.load(model_file, map_location="cpu")["model"])
+        self.vocoder_model.remove_weight_norm()
+        self.vocoder_model.inference_padding = 0
+        self.vocoder_config = load_config(model_config)
+
+        if use_cuda:
+            self.vocoder_model.cuda()
+        self.vocoder_model.eval()

     def load_wavernn(self, lib_path, model_file, model_config, use_cuda):
         # TODO: set a function in wavernn code base for model setup and call it here.
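Note: the new load_vocoder follows the usual PyTorch serving pattern — deserialize the checkpoint on CPU, push the weights into a freshly built module, then switch to eval mode (and GPU if requested). A generic, hedged sketch of the same pattern with a stand-in module instead of the project's setup_generator():

import torch
from torch import nn

def load_generator(model_file, use_cuda=False):
    # stand-in generator; the real project builds it from a config via setup_generator()
    model = nn.Sequential(nn.Linear(80, 80))
    checkpoint = torch.load(model_file, map_location="cpu")  # keep loading device-agnostic
    model.load_state_dict(checkpoint["model"])               # assumes the weights live under "model"
    if use_cuda:
        model.cuda()
    model.eval()                                             # disable dropout / batch-norm updates
    return model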
@@ -113,9 +130,14 @@ class Synthesizer(object):
         self.wavernn.eval()

     def load_pwgan(self, lib_path, model_file, model_config, use_cuda):
-        sys.path.append(lib_path) # set this if ParallelWaveGAN is not installed globally
+        if lib_path:
+            # set this if ParallelWaveGAN is not installed globally
+            sys.path.append(lib_path)
+        try:
             #pylint: disable=import-outside-toplevel
             from parallel_wavegan.models import ParallelWaveGANGenerator
+        except ImportError as e:
+            raise RuntimeError(f"cannot import parallel-wavegan, either install it or set its directory using the --pwgan_lib_path command line argument: {e}")
         print(" > Loading PWGAN model ...")
         print(" | > model config: ", model_config)
         print(" | > model file: ", model_file)
@@ -166,6 +188,7 @@ class Synthesizer(object):
         return sentences

     def tts(self, text, speaker_id=None):
+        start_time = time.time()
         wavs = []
         sens = self.split_into_sentences(text)
         print(sens)
@@ -179,24 +202,25 @@ class Synthesizer(object):
             inputs = numpy_to_torch(inputs, torch.long, cuda=self.use_cuda)
             inputs = inputs.unsqueeze(0)
             # synthesize voice
-            decoder_output, postnet_output, alignments, stop_tokens = run_model_torch(
-                self.tts_model, inputs, self.tts_config, False, speaker_id, None)
+            decoder_output, postnet_output, alignments, stop_tokens = run_model_torch(self.tts_model, inputs, self.tts_config, False, speaker_id, None)
             # convert outputs to numpy
-            postnet_output, decoder_output, _, _ = parse_outputs_torch(
-                postnet_output, decoder_output, alignments, stop_tokens)
-
-            if self.pwgan:
-                vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0)
+            if self.vocoder_model:
+                vocoder_input = postnet_output[0].transpose(0, 1).unsqueeze(0)
+                wav = self.vocoder_model.inference(vocoder_input)
                 if self.use_cuda:
-                    vocoder_input.cuda()
-                wav = self.pwgan.inference(vocoder_input, hop_size=self.ap.hop_length)
+                    wav = wav.cpu().numpy()
+                else:
+                    wav = wav.numpy()
+                wav = wav.flatten()
             elif self.wavernn:
-                vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0)
+                vocoder_input = None
+                if self.tts_config.model == "Tacotron":
+                    vocoder_input = torch.FloatTensor(self.ap.out_linear_to_mel(linear_spec=postnet_output.T).T).T.unsqueeze(0)
+                else:
+                    vocoder_input = postnet_output[0].transpose(0, 1).unsqueeze(0)
                 if self.use_cuda:
                     vocoder_input.cuda()
                 wav = self.wavernn.generate(vocoder_input, batched=self.config.is_wavernn_batched, target=11000, overlap=550)
-            else:
-                wav = inv_spectrogram(postnet_output, self.ap, self.tts_config)
             # trim silence
             wav = trim_silence(wav, self.ap)

@@ -205,4 +229,10 @@ class Synthesizer(object):

         out = io.BytesIO()
         self.save_wav(wavs, out)
+
+        # compute stats
+        process_time = time.time() - start_time
+        audio_time = len(wavs) / self.tts_config.audio['sample_rate']
+        print(f" > Processing time: {process_time}")
+        print(f" > Real-time factor: {process_time / audio_time}")
         return out
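Note: the new stats lines report a real-time factor — wall-clock synthesis time divided by the duration of the produced audio (samples / sample rate). A tiny standalone illustration of the same arithmetic, with made-up numbers:

# suppose synthesis of 66150 samples at 22050 Hz took 1.2 s of wall-clock time
process_time = 1.2
num_samples = 66150
sample_rate = 22050

audio_time = num_samples / sample_rate      # 3.0 seconds of audio
real_time_factor = process_time / audio_time
print(real_time_factor)                     # 0.4 -> faster than real time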
@@ -8,7 +8,7 @@
     <meta name="description" content="">
     <meta name="author" content="">

-    <title>Mozillia - Text2Speech engine</title>
+    <title>Mozilla - Text2Speech engine</title>

     <!-- Bootstrap core CSS -->
     <link href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.1/css/bootstrap.min.css"
setup.py
@@ -19,7 +19,7 @@ args, unknown_args = parser.parse_known_args()
 # Remove our arguments from argv so that setuptools doesn't see them
 sys.argv = [sys.argv[0]] + unknown_args

-version = '0.0.1'
+version = '0.0.3'

 # Adapted from https://github.com/pytorch/pytorch
 cwd = os.path.dirname(os.path.abspath(__file__))
@@ -75,6 +75,11 @@ setup(
     url='https://github.com/mozilla/TTS',
     description='Text to Speech with Deep Learning',
     license='MPL-2.0',
+    entry_points={
+        'console_scripts': [
+            'tts-server = TTS.server.server:main'
+        ]
+    },
     package_dir={'': 'tts_namespace'},
     packages=find_packages('tts_namespace'),
     package_data={
@@ -92,8 +97,8 @@ setup(
     },
     install_requires=[
         "scipy>=0.19.0",
-        "torch>=0.4.1",
-        "numpy==1.15.4",
+        "torch>=1.5",
+        "numpy>=1.16.0",
         "librosa==0.6.2",
         "unidecode==0.4.20",
         "attrdict",
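Note: the new entry_points block makes pip generate a tts-server executable that imports and calls TTS.server.server:main. The generated wrapper behaves roughly like this hedged sketch (the actual script pip writes differs in detail):

# rough equivalent of what the generated "tts-server" command does
import sys

from TTS.server.server import main

if __name__ == '__main__':
    sys.exit(main())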
@@ -1,4 +1,4 @@
-### Speaker embedding (Experimental)
+### Speaker Encoder

 This is an implementation of https://arxiv.org/abs/1710.10467. This model can be used for voice and speaker embedding.

@@ -0,0 +1,2 @@
+umap-learn
+numpy>=1.17.0
@@ -13,12 +13,11 @@ from TTS.speaker_encoder.model import SpeakerEncoder
 from TTS.speaker_encoder.visual import plot_embeddings
 from TTS.speaker_encoder.generic_utils import save_best_model
 from TTS.utils.audio import AudioProcessor
-from TTS.utils.generic_utils import (NoamLR, check_update, copy_config_file,
-                                     count_parameters,
-                                     create_experiment_folder, get_git_branch,
-                                     load_config,
+from TTS.utils.generic_utils import (create_experiment_folder, get_git_branch,
                                      remove_experiment_folder, set_init_dict)
-from TTS.utils.logger import Logger
+from TTS.utils.io import load_config, copy_config_file
+from TTS.utils.training import check_update, NoamLR
+from TTS.utils.tensorboard_logger import TensorboardLogger
 from TTS.utils.radam import RAdam

 torch.backends.cudnn.enabled = True
@@ -237,7 +236,7 @@ if __name__ == '__main__':
                      new_fields)

     LOG_DIR = OUT_PATH
-    tb_logger = Logger(LOG_DIR)
+    tb_logger = TensorboardLogger(LOG_DIR)

     try:
         main(args)

@@ -7,7 +7,8 @@ import json
 import string

 from TTS.utils.synthesis import synthesis
-from TTS.utils.generic_utils import load_config, setup_model
+from TTS.utils.generic_utils import setup_model
+from TTS.utils.io import load_config
 from TTS.utils.text.symbols import make_symbols, symbols, phonemes
 from TTS.utils.audio import AudioProcessor

Binary file not shown.
@@ -55,6 +55,8 @@
     "separate_stopnet": true,       // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER.
     "tb_model_param_stats": false,     // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
     "use_gst": false,
+    "double_decoder_consistency": true,  // use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/
+    "ddc_r": 7,                          // reduction rate for coarse decoder.

     "batch_size": 32,       // Batch size for training. Lower values than 32 might cause hard to learn attention.
     "eval_batch_size":16,
@@ -51,7 +51,7 @@ class TacotronTrainTest(unittest.TestCase):
         optimizer = optim.Adam(model.parameters(), lr=c.lr)
         for i in range(5):
             mel_out, mel_postnet_out, align, stop_tokens = model.forward(
-                input, input_lengths, mel_spec, speaker_ids)
+                input, input_lengths, mel_spec, mel_lengths, speaker_ids)
             assert torch.sigmoid(stop_tokens).data.max() <= 1.0
             assert torch.sigmoid(stop_tokens).data.min() >= 0.0
             optimizer.zero_grad()
@@ -66,7 +66,7 @@ class TacotronTrainTest(unittest.TestCase):
         optimizer = optim.Adam(model.parameters(), lr=c.lr)
         for _ in range(5):
             mel_out, linear_out, align, stop_tokens = model.forward(
-                input_dummy, input_lengths, mel_spec, speaker_ids)
+                input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids)
             optimizer.zero_grad()
             loss = criterion(mel_out, mel_spec, mel_lengths)
             stop_loss = criterion_st(stop_tokens, stop_targets)
@@ -95,6 +95,7 @@ class TacotronGSTTrainTest(unittest.TestCase):
         mel_spec = torch.rand(8, 120, c.audio['num_mels']).to(device)
         linear_spec = torch.rand(8, 120, c.audio['num_freq']).to(device)
         mel_lengths = torch.randint(20, 120, (8, )).long().to(device)
+        mel_lengths[-1] = 120
         stop_targets = torch.zeros(8, 120, 1).float().to(device)
         speaker_ids = torch.randint(0, 5, (8, )).long().to(device)

@@ -130,7 +131,7 @@ class TacotronGSTTrainTest(unittest.TestCase):
         optimizer = optim.Adam(model.parameters(), lr=c.lr)
         for _ in range(10):
             mel_out, linear_out, align, stop_tokens = model.forward(
-                input_dummy, input_lengths, mel_spec, speaker_ids)
+                input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids)
             optimizer.zero_grad()
             loss = criterion(mel_out, mel_spec, mel_lengths)
             stop_loss = criterion_st(stop_tokens, stop_targets)

train.py
@@ -20,7 +20,8 @@ from TTS.utils.generic_utils import (count_parameters, create_experiment_folder,
 from TTS.utils.io import (save_best_model, save_checkpoint,
                           load_config, copy_config_file)
 from TTS.utils.training import (NoamLR, check_update, adam_weight_decay,
-                                gradual_training_scheduler, set_weight_decay)
+                                gradual_training_scheduler, set_weight_decay,
+                                setup_torch_training_env)
 from TTS.utils.tensorboard_logger import TensorboardLogger
 from TTS.utils.console_logger import ConsoleLogger
 from TTS.utils.speakers import load_speaker_mapping, save_speaker_mapping, \
@@ -32,13 +33,8 @@ from TTS.datasets.preprocess import load_meta_data
 from TTS.utils.radam import RAdam
 from TTS.utils.measures import alignment_diagonal_score

-torch.backends.cudnn.enabled = True
-torch.backends.cudnn.benchmark = False
-torch.manual_seed(54321)
-use_cuda = torch.cuda.is_available()
-num_gpus = torch.cuda.device_count()
-print(" > Using CUDA: ", use_cuda)
-print(" > Number of GPUs: ", num_gpus)
+use_cuda, num_gpus = setup_torch_training_env(True, False)


 def setup_loader(ap, r, is_val=False, verbose=False):
@@ -123,21 +119,7 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,
                               verbose=(epoch == 0))
     model.train()
     epoch_time = 0
-    train_values = {
-        'avg_postnet_loss': 0,
-        'avg_decoder_loss': 0,
-        'avg_stopnet_loss': 0,
-        'avg_align_error': 0,
-        'avg_step_time': 0,
-        'avg_loader_time': 0
-    }
-    if c.bidirectional_decoder:
-        train_values['avg_decoder_b_loss'] = 0  # decoder backward loss
-        train_values['avg_decoder_c_loss'] = 0  # decoder consistency loss
-    if c.ga_alpha > 0:
-        train_values['avg_ga_loss'] = 0  # guidede attention loss
     keep_avg = KeepAverage()
-    keep_avg.add_values(train_values)
     if use_cuda:
         batch_n_iter = int(
             len(data_loader.dataset) / (c.batch_size * num_gpus))
@@ -162,13 +144,14 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,
             optimizer_st.zero_grad()

         # forward pass model
-        if c.bidirectional_decoder:
+        if c.bidirectional_decoder or c.double_decoder_consistency:
             decoder_output, postnet_output, alignments, stop_tokens, decoder_backward_output, alignments_backward = model(
-                text_input, text_lengths, mel_input, speaker_ids=speaker_ids)
+                text_input, text_lengths, mel_input, mel_lengths, speaker_ids=speaker_ids)
         else:
             decoder_output, postnet_output, alignments, stop_tokens = model(
-                text_input, text_lengths, mel_input, speaker_ids=speaker_ids)
+                text_input, text_lengths, mel_input, mel_lengths, speaker_ids=speaker_ids)
             decoder_backward_output = None
+            alignments_backward = None

         # set the alignment lengths wrt reduction factor for guided attention
         if mel_lengths.max() % model.decoder.r != 0:
@@ -180,12 +163,8 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,
         loss_dict = criterion(postnet_output, decoder_output, mel_input,
                               linear_input, stop_tokens, stop_targets,
                               mel_lengths, decoder_backward_output,
-                              alignments, alignment_lengths, text_lengths)
-        if c.bidirectional_decoder:
-            keep_avg.update_values({'avg_decoder_b_loss': loss_dict['decoder_backward_loss'].item(),
-                                    'avg_decoder_c_loss': loss_dict['decoder_c_loss'].item()})
-        if c.ga_alpha > 0:
-            keep_avg.update_values({'avg_ga_loss': loss_dict['ga_loss'].item()})
+                              alignments, alignment_lengths, alignments_backward,
+                              text_lengths)

         # backward pass
         loss_dict['loss'].backward()
@@ -195,7 +174,6 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,

         # compute alignment error (the lower the better )
         align_error = 1 - alignment_diagonal_score(alignments)
-        keep_avg.update_value('avg_align_error', align_error)
         loss_dict['align_error'] = align_error

         # backpass and check the grad norm for stop loss
@@ -210,23 +188,6 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,
         step_time = time.time() - start_time
         epoch_time += step_time

-        # update avg stats
-        update_train_values = {
-            'avg_postnet_loss': float(loss_dict['postnet_loss'].item()),
-            'avg_decoder_loss': float(loss_dict['decoder_loss'].item()),
-            'avg_stopnet_loss': loss_dict['stopnet_loss'].item() \
-                if isinstance(loss_dict['stopnet_loss'], float) else float(loss_dict['stopnet_loss'].item()),
-            'avg_step_time': step_time,
-            'avg_loader_time': loader_time
-        }
-        keep_avg.update_values(update_train_values)
-
-        if global_step % c.print_step == 0:
-            c_logger.print_train_step(batch_n_iter, num_iter, global_step,
-                                      avg_spec_length, avg_text_length,
-                                      step_time, loader_time, current_lr,
-                                      loss_dict, keep_avg.avg_values)
-
         # aggregate losses from processes
         if num_gpus > 1:
             loss_dict['postnet_loss'] = reduce_tensor(loss_dict['postnet_loss'].data, num_gpus)
@@ -234,18 +195,41 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,
             loss_dict['loss'] = reduce_tensor(loss_dict['loss'] .data, num_gpus)
             loss_dict['stopnet_loss'] = reduce_tensor(loss_dict['stopnet_loss'].data, num_gpus) if c.stopnet else loss_dict['stopnet_loss']

+        # detach loss values
+        loss_dict_new = dict()
+        for key, value in loss_dict.items():
+            if isinstance(value, (int, float)):
+                loss_dict_new[key] = value
+            else:
+                loss_dict_new[key] = value.item()
+        loss_dict = loss_dict_new
+
+        # update avg stats
+        update_train_values = dict()
+        for key, value in loss_dict.items():
+            update_train_values['avg_' + key] = value
+        update_train_values['avg_loader_time'] = loader_time
+        update_train_values['avg_step_time'] = step_time
+        keep_avg.update_values(update_train_values)
+
+        # print training progress
+        if global_step % c.print_step == 0:
+            c_logger.print_train_step(batch_n_iter, num_iter, global_step,
+                                      avg_spec_length, avg_text_length,
+                                      step_time, loader_time, current_lr,
+                                      loss_dict, keep_avg.avg_values)

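Note: instead of hard-coding which losses get averaged, the loop now detaches every entry in loss_dict to a plain number and feeds the whole dictionary to the running-average tracker with an avg_ prefix. A self-contained sketch of that pattern; the project's own KeepAverage is replaced here by a hypothetical minimal version:

import torch

class KeepAverage:
    """Hypothetical minimal running-average tracker, keyed by name."""
    def __init__(self):
        self.sums, self.counts = {}, {}

    def update_values(self, values):
        for name, value in values.items():
            self.sums[name] = self.sums.get(name, 0.0) + value
            self.counts[name] = self.counts.get(name, 0) + 1

    @property
    def avg_values(self):
        return {k: self.sums[k] / self.counts[k] for k in self.sums}

keep_avg = KeepAverage()
loss_dict = {'postnet_loss': torch.tensor(0.8), 'decoder_loss': torch.tensor(1.1), 'stopnet_loss': 0.3}

# detach tensors to plain floats, then prefix every key with 'avg_'
loss_dict = {k: (v if isinstance(v, (int, float)) else v.item()) for k, v in loss_dict.items()}
keep_avg.update_values({'avg_' + k: v for k, v in loss_dict.items()})
print(keep_avg.avg_values)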
         if args.rank == 0:
             # Plot Training Iter Stats
             # reduce TB load
-            if global_step % 10 == 0:
+            if global_step % c.tb_plot_step == 0:
                 iter_stats = {
-                    "loss_posnet": loss_dict['postnet_loss'].item(),
-                    "loss_decoder": loss_dict['decoder_loss'].item(),
                     "lr": current_lr,
                     "grad_norm": grad_norm,
                     "grad_norm_st": grad_norm_st,
                     "step_time": step_time
                 }
+                iter_stats.update(loss_dict)
                 tb_logger.tb_train_iter_stats(global_step, iter_stats)

             if global_step % c.save_step == 0:
@@ -253,7 +237,7 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,
                     # save model
                     save_checkpoint(model, optimizer, global_step, epoch, model.decoder.r, OUT_PATH,
                                     optimizer_st=optimizer_st,
-                                    model_loss=loss_dict['postnet_loss'].item())
+                                    model_loss=loss_dict['postnet_loss'])

                 # Diagnostic visualizations
                 const_spec = postnet_output[0].data.cpu().numpy()
@@ -268,7 +252,7 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,
                     "alignment": plot_alignment(align_img),
                 }

-                if c.bidirectional_decoder:
+                if c.bidirectional_decoder or c.double_decoder_consistency:
                     figures["alignment_backward"] = plot_alignment(alignments_backward[0].data.cpu().numpy())

                 tb_logger.tb_train_figures(global_step, figures)
@@ -288,16 +272,8 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,

     # Plot Epoch Stats
     if args.rank == 0:
-        # Plot Training Epoch Stats
-        epoch_stats = {
-            "loss_postnet": keep_avg['avg_postnet_loss'],
-            "loss_decoder": keep_avg['avg_decoder_loss'],
-            "stopnet_loss": keep_avg['avg_stopnet_loss'],
-            "alignment_score": keep_avg['avg_align_error'],
-            "epoch_time": epoch_time
-        }
-        if c.ga_alpha > 0:
-            epoch_stats['guided_attention_loss'] = keep_avg['avg_ga_loss']
+        epoch_stats = {"epoch_time": epoch_time}
+        epoch_stats.update(keep_avg.avg_values)
         tb_logger.tb_train_epoch_stats(global_step, epoch_stats)
         if c.tb_model_param_stats:
             tb_logger.tb_model_weights(model, global_step)
@@ -309,20 +285,7 @@ def evaluate(model, criterion, ap, global_step, epoch):
     data_loader = setup_loader(ap, model.decoder.r, is_val=True)
     model.eval()
     epoch_time = 0
-    eval_values_dict = {
-        'avg_postnet_loss': 0,
-        'avg_decoder_loss': 0,
-        'avg_stopnet_loss': 0,
-        'avg_align_error': 0
-    }
-    if c.bidirectional_decoder:
-        eval_values_dict['avg_decoder_b_loss'] = 0  # decoder backward loss
-        eval_values_dict['avg_decoder_c_loss'] = 0  # decoder consistency loss
-    if c.ga_alpha > 0:
-        eval_values_dict['avg_ga_loss'] = 0  # guidede attention loss
     keep_avg = KeepAverage()
-    keep_avg.add_values(eval_values_dict)

     c_logger.print_eval_start()
     if data_loader is not None:
         for num_iter, data in enumerate(data_loader):
@@ -333,13 +296,14 @@ def evaluate(model, criterion, ap, global_step, epoch):
             assert mel_input.shape[1] % model.decoder.r == 0

             # forward pass model
-            if c.bidirectional_decoder:
+            if c.bidirectional_decoder or c.double_decoder_consistency:
                 decoder_output, postnet_output, alignments, stop_tokens, decoder_backward_output, alignments_backward = model(
                     text_input, text_lengths, mel_input, speaker_ids=speaker_ids)
             else:
                 decoder_output, postnet_output, alignments, stop_tokens = model(
                     text_input, text_lengths, mel_input, speaker_ids=speaker_ids)
                 decoder_backward_output = None
+                alignments_backward = None

             # set the alignment lengths wrt reduction factor for guided attention
             if mel_lengths.max() % model.decoder.r != 0:
@@ -351,12 +315,8 @@ def evaluate(model, criterion, ap, global_step, epoch):
             loss_dict = criterion(postnet_output, decoder_output, mel_input,
                                   linear_input, stop_tokens, stop_targets,
                                   mel_lengths, decoder_backward_output,
-                                  alignments, alignment_lengths, text_lengths)
-            if c.bidirectional_decoder:
-                keep_avg.update_values({'avg_decoder_b_loss': loss_dict['decoder_b_loss'].item(),
-                                        'avg_decoder_c_loss': loss_dict['decoder_c_loss'].item()})
-            if c.ga_alpha > 0:
-                keep_avg.update_values({'avg_ga_loss': loss_dict['ga_loss'].item()})
+                                  alignments, alignment_lengths, alignments_backward,
+                                  text_lengths)

             # step time
             step_time = time.time() - start_time
@@ -364,7 +324,7 @@ def evaluate(model, criterion, ap, global_step, epoch):

             # compute alignment score
             align_error = 1 - alignment_diagonal_score(alignments)
-            keep_avg.update_value('avg_align_error', align_error)
+            loss_dict['align_error'] = align_error

             # aggregate losses from processes
             if num_gpus > 1:
@@ -373,14 +333,20 @@ def evaluate(model, criterion, ap, global_step, epoch):
                 if c.stopnet:
                     loss_dict['stopnet_loss'] = reduce_tensor(loss_dict['stopnet_loss'].data, num_gpus)

-            keep_avg.update_values({
-                'avg_postnet_loss':
-                float(loss_dict['postnet_loss'].item()),
-                'avg_decoder_loss':
-                float(loss_dict['decoder_loss'].item()),
-                'avg_stopnet_loss':
-                float(loss_dict['stopnet_loss'].item()),
-            })
+            # detach loss values
+            loss_dict_new = dict()
+            for key, value in loss_dict.items():
+                if isinstance(value, (int, float)):
+                    loss_dict_new[key] = value
+                else:
+                    loss_dict_new[key] = value.item()
+            loss_dict = loss_dict_new
+
+            # update avg stats
+            update_train_values = dict()
+            for key, value in loss_dict.items():
+                update_train_values['avg_' + key] = value
+            keep_avg.update_values(update_train_values)

             if c.print_eval:
                 c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values)
@@ -409,20 +375,11 @@ def evaluate(model, criterion, ap, global_step, epoch):
                             c.audio["sample_rate"])

         # Plot Validation Stats
-        epoch_stats = {
-            "loss_postnet": keep_avg['avg_postnet_loss'],
-            "loss_decoder": keep_avg['avg_decoder_loss'],
-            "stopnet_loss": keep_avg['avg_stopnet_loss'],
-            "alignment_score": keep_avg['avg_align_error'],
-        }

-        if c.bidirectional_decoder:
-            epoch_stats['loss_decoder_backward'] = keep_avg['avg_decoder_b_loss']
+        if c.bidirectional_decoder or c.double_decoder_consistency:
             align_b_img = alignments_backward[idx].data.cpu().numpy()
-            eval_figures['alignment_backward'] = plot_alignment(align_b_img)
-        if c.ga_alpha > 0:
-            epoch_stats['guided_attention_loss'] = keep_avg['avg_ga_loss']
-        tb_logger.tb_eval_stats(global_step, epoch_stats)
+            eval_figures['alignment2'] = plot_alignment(align_b_img)
+        tb_logger.tb_eval_stats(global_step, keep_avg.avg_values)
         tb_logger.tb_eval_figures(global_step, eval_figures)

     if args.rank == 0 and epoch > c.test_delay_epochs:
@@ -431,7 +388,8 @@ def evaluate(model, criterion, ap, global_step, epoch):
             "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
             "Be a voice, not an echo.",
             "I'm sorry Dave. I'm afraid I can't do that.",
-            "This cake is great. It's so delicious and moist."
+            "This cake is great. It's so delicious and moist.",
+            "Prior to November 22, 1963."
         ]
     else:
         with open(c.test_sentences_file, "r") as f:
@@ -516,8 +474,6 @@ def main(args):  # pylint: disable=redefined-outer-name

     model = setup_model(num_chars, num_speakers, c)

-    print(" | > Num output units : {}".format(ap.num_freq), flush=True)
-
     params = set_weight_decay(model, c.wd)
     optimizer = RAdam(params, lr=c.lr, weight_decay=0)
     if c.stopnet and c.separate_stopnet:
@@ -542,7 +498,7 @@ def main(args):  # pylint: disable=redefined-outer-name
         except:
             print(" > Partial model initialization.")
             model_dict = model.state_dict()
-            model_dict = set_init_dict(model_dict, checkpoint, c)
+            model_dict = set_init_dict(model_dict, checkpoint['model'], c)
             model.load_state_dict(model_dict)
             del model_dict
     for group in optimizer.param_groups:
@@ -585,7 +541,6 @@ def main(args):  # pylint: disable=redefined-outer-name
         if c.bidirectional_decoder:
             model.decoder_backward.set_r(r)
         print("\n > Number of output frames:", model.decoder.r)

         train_avg_loss_dict, global_step = train(model, criterion, optimizer,
                                                  optimizer_st, scheduler, ap,
                                                  global_step, epoch)
@@ -667,7 +622,7 @@ if __name__ == '__main__':
     os.chmod(OUT_PATH, 0o775)

     LOG_DIR = OUT_PATH
-    tb_logger = TensorboardLogger(LOG_DIR)
+    tb_logger = TensorboardLogger(LOG_DIR, model_name='TTS')

     # write model desc to tensorboard
     tb_logger.tb_add_text('model-description', c['run_description'], 0)
@@ -17,7 +17,7 @@ class AudioProcessor(object):
                 hop_length=None,
                 win_length=None,
                 ref_level_db=None,
-                num_freq=None,
+                fft_size=1024,
                 power=None,
                 preemphasis=0.0,
                 signal_norm=None,
@@ -25,6 +25,8 @@ class AudioProcessor(object):
                 max_norm=None,
                 mel_fmin=None,
                 mel_fmax=None,
+                spec_gain=20,
+                stft_pad_mode='reflect',
                 clip_norm=True,
                 griffin_lim_iters=None,
                 do_trim_silence=False,
@@ -41,7 +43,7 @@ class AudioProcessor(object):
        self.frame_shift_ms = frame_shift_ms
        self.frame_length_ms = frame_length_ms
        self.ref_level_db = ref_level_db
-       self.num_freq = num_freq
+       self.fft_size = fft_size
        self.power = power
        self.preemphasis = preemphasis
        self.griffin_lim_iters = griffin_lim_iters
@@ -49,6 +51,8 @@ class AudioProcessor(object):
        self.symmetric_norm = symmetric_norm
        self.mel_fmin = mel_fmin or 0
        self.mel_fmax = mel_fmax
+       self.spec_gain = float(spec_gain)
+       self.stft_pad_mode = 'reflect'
        self.max_norm = 1.0 if max_norm is None else float(max_norm)
        self.clip_norm = clip_norm
        self.do_trim_silence = do_trim_silence
@@ -57,11 +61,12 @@ class AudioProcessor(object):
        self.stats_path = stats_path
        # setup stft parameters
        if hop_length is None:
-           self.n_fft, self.hop_length, self.win_length = self._stft_parameters()
+           # compute stft parameters from given time values
+           self.hop_length, self.win_length = self._stft_parameters()
        else:
+           # use stft parameters from config file
            self.hop_length = hop_length
            self.win_length = win_length
-           self.n_fft = (self.num_freq - 1) * 2
        assert min_level_db != 0.0, " [!] min_level_db is 0"
        members = vars(self)
        for key, value in members.items():
@@ -84,19 +89,18 @@ class AudioProcessor(object):
        assert self.mel_fmax <= self.sample_rate // 2
        return librosa.filters.mel(
            self.sample_rate,
-           self.n_fft,
+           self.fft_size,
            n_mels=self.num_mels,
            fmin=self.mel_fmin,
            fmax=self.mel_fmax)

    def _stft_parameters(self, ):
        """Compute necessary stft parameters with given time values"""
-       n_fft = (self.num_freq - 1) * 2
        factor = self.frame_length_ms / self.frame_shift_ms
        assert (factor).is_integer(), " [!] frame_shift_ms should divide frame_length_ms"
        hop_length = int(self.frame_shift_ms / 1000.0 * self.sample_rate)
        win_length = int(hop_length * factor)
-       return n_fft, hop_length, win_length
+       return hop_length, win_length

    ### normalization ###
    def _normalize(self, S):
@@ -108,7 +112,7 @@ class AudioProcessor(object):
        if hasattr(self, 'mel_scaler'):
            if S.shape[0] == self.num_mels:
                return self.mel_scaler.transform(S.T).T
-           elif S.shape[0] == self.n_fft / 2:
+           elif S.shape[0] == self.fft_size / 2:
                return self.linear_scaler.transform(S.T).T
            else:
                raise RuntimeError(' [!] Mean-Var stats does not match the given feature dimensions.')
@@ -137,7 +141,7 @@ class AudioProcessor(object):
        if hasattr(self, 'mel_scaler'):
            if S_denorm.shape[0] == self.num_mels:
                return self.mel_scaler.inverse_transform(S_denorm.T).T
-           elif S_denorm.shape[0] == self.n_fft / 2:
+           elif S_denorm.shape[0] == self.fft_size / 2:
                return self.linear_scaler.inverse_transform(S_denorm.T).T
            else:
                raise RuntimeError(' [!] Mean-Var stats does not match the given feature dimensions.')
@@ -182,11 +186,11 @@ class AudioProcessor(object):
    ### DB and AMP conversion ###
    # pylint: disable=no-self-use
    def _amp_to_db(self, x):
-       return 20 * np.log10(np.maximum(1e-5, x))
+       return self.spec_gain * np.log10(np.maximum(1e-5, x))

    # pylint: disable=no-self-use
    def _db_to_amp(self, x):
-       return np.power(10.0, x * 0.05)
+       return np.power(10.0, x / self.spec_gain)

    ### Preemphasis ###
    def apply_preemphasis(self, x):
@@ -252,10 +256,10 @@ class AudioProcessor(object):
    def _stft(self, y):
        return librosa.stft(
            y=y,
-           n_fft=self.n_fft,
+           n_fft=self.fft_size,
            hop_length=self.hop_length,
            win_length=self.win_length,
-           pad_mode='constant'
+           pad_mode=self.stft_pad_mode,
        )

    def _istft(self, y):
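The `_amp_to_db` / `_db_to_amp` pair above now uses the configurable `spec_gain` instead of the hard-coded `20` and `0.05` factors, so the default `spec_gain=20` reproduces the old behaviour while vocoder configs can pick a different scale. A minimal standalone sketch of the round trip (assuming only `numpy`; the plain function names here are illustrative stand-ins for the class methods):

```python
import numpy as np

def amp_to_db(x, spec_gain=20.0):
    # with spec_gain=20 this matches the old hard-coded 20 * log10(...)
    return spec_gain * np.log10(np.maximum(1e-5, x))

def db_to_amp(x, spec_gain=20.0):
    # inverse of amp_to_db; the old 0.05 factor was simply 1 / 20
    return np.power(10.0, x / spec_gain)

mag = np.abs(np.random.randn(5)) + 1e-3
assert np.allclose(db_to_amp(amp_to_db(mag)), mag)             # default gain, old behaviour
assert np.allclose(db_to_amp(amp_to_db(mag, 1.0), 1.0), mag)   # vocoder-style spec_gain=1.0
```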
@@ -87,7 +87,7 @@ class ConsoleLogger():
            diff = 0
            if self.old_eval_loss_dict is not None:
                diff = value - self.old_eval_loss_dict[key]
-           if diff < 0:
+           if diff <= 0:
                color = tcolors.OKGREEN
                sign = ''
            log_text += "{}{}:{} {:.5f} {}({}{:.5f})\n".format(indent, key, color, value, tcolors.ENDC, sign, diff)
@@ -75,6 +75,7 @@ def split_dataset(items):
    is_multi_speaker = len(set(speakers)) > 1
    eval_split_size = 500 if len(items) * 0.01 > 500 else int(
        len(items) * 0.01)
+   assert eval_split_size > 0, " [!] You do not have enough samples to train. You need at least 100 samples."
    np.random.seed(0)
    np.random.shuffle(items)
    if is_multi_speaker:
@@ -106,15 +107,15 @@ def sequence_mask(sequence_length, max_len=None):
    return seq_range_expand < seq_length_expand


-def set_init_dict(model_dict, checkpoint, c):
+def set_init_dict(model_dict, checkpoint_state, c):
    # Partial initialization: if there is a mismatch with new and old layer, it is skipped.
-   for k, v in checkpoint['model'].items():
+   for k, v in checkpoint_state.items():
        if k not in model_dict:
            print(" | > Layer missing in the model definition: {}".format(k))
    # 1. filter out unnecessary keys
    pretrained_dict = {
        k: v
-       for k, v in checkpoint['model'].items() if k in model_dict
+       for k, v in checkpoint_state.items() if k in model_dict
    }
    # 2. filter out different size layers
    pretrained_dict = {
@@ -145,7 +146,7 @@ def setup_model(num_chars, num_speakers, c):
        model = MyModel(num_chars=num_chars,
                        num_speakers=num_speakers,
                        r=c.r,
-                       postnet_output_dim=c.audio['num_freq'],
+                       postnet_output_dim=int(c.audio['fft_size'] / 2 + 1),
                        decoder_output_dim=c.audio['num_mels'],
                        gst=c.use_gst,
                        memory_size=c.memory_size,
@@ -160,13 +161,16 @@ def setup_model(num_chars, num_speakers, c):
                        location_attn=c.location_attn,
                        attn_K=c.attention_heads,
                        separate_stopnet=c.separate_stopnet,
-                       bidirectional_decoder=c.bidirectional_decoder)
+                       bidirectional_decoder=c.bidirectional_decoder,
+                       double_decoder_consistency=c.double_decoder_consistency,
+                       ddc_r=c.ddc_r)
    elif c.model.lower() == "tacotron2":
        model = MyModel(num_chars=num_chars,
                        num_speakers=num_speakers,
                        r=c.r,
                        postnet_output_dim=c.audio['num_mels'],
                        decoder_output_dim=c.audio['num_mels'],
+                       gst=c.use_gst,
                        attn_type=c.attention_type,
                        attn_win=c.windowing,
                        attn_norm=c.attention_norm,
@@ -178,7 +182,9 @@ def setup_model(num_chars, num_speakers, c):
                        location_attn=c.location_attn,
                        attn_K=c.attention_heads,
                        separate_stopnet=c.separate_stopnet,
-                       bidirectional_decoder=c.bidirectional_decoder)
+                       bidirectional_decoder=c.bidirectional_decoder,
+                       double_decoder_consistency=c.double_decoder_consistency,
+                       ddc_r=c.ddc_r)
    return model


class KeepAverage():
@@ -197,6 +203,11 @@ class KeepAverage():
        self.iters[name] = init_iter

    def update_value(self, name, value, weighted_avg=False):
+       if name not in self.avg_values:
+           # add value if not exist before
+           self.add_value(name, init_val=value)
+       else:
+           # else update existing value
            if weighted_avg:
                self.avg_values[name] = 0.99 * self.avg_values[name] + 0.01 * value
                self.iters[name] += 1
@@ -241,7 +252,7 @@ def check_config(c):

    # audio processing parameters
    _check_argument('num_mels', c['audio'], restricted=True, val_type=int, min_val=10, max_val=2056)
-   _check_argument('num_freq', c['audio'], restricted=True, val_type=int, min_val=128, max_val=4058)
+   _check_argument('fft_size', c['audio'], restricted=True, val_type=int, min_val=128, max_val=4058)
    _check_argument('sample_rate', c['audio'], restricted=True, val_type=int, min_val=512, max_val=100000)
    _check_argument('frame_length_ms', c['audio'], restricted=True, val_type=float, min_val=10, max_val=1000, alternative='win_length')
    _check_argument('frame_shift_ms', c['audio'], restricted=True, val_type=float, min_val=1, max_val=1000, alternative='hop_length')
@@ -267,6 +278,7 @@ def check_config(c):
    _check_argument('clip_norm', c['audio'], restricted=True, val_type=bool)
    _check_argument('mel_fmin', c['audio'], restricted=True, val_type=float, min_val=0.0, max_val=1000)
    _check_argument('mel_fmax', c['audio'], restricted=True, val_type=float, min_val=500.0)
+   _check_argument('spec_gain', c['audio'], restricted=True, val_type=float, min_val=1, max_val=100)
    _check_argument('do_trim_silence', c['audio'], restricted=True, val_type=bool)
    _check_argument('trim_db', c['audio'], restricted=True, val_type=int)

@@ -308,6 +320,8 @@ def check_config(c):
    _check_argument('transition_agent', c, restricted=True, val_type=bool)
    _check_argument('location_attn', c, restricted=True, val_type=bool)
    _check_argument('bidirectional_decoder', c, restricted=True, val_type=bool)
+   _check_argument('double_decoder_consistency', c, restricted=True, val_type=bool)
+   _check_argument('ddc_r', c, restricted='double_decoder_consistency' in c.keys(), min_val=1, max_val=7, val_type=int)

    # stopnet
    _check_argument('stopnet', c, restricted=True, val_type=bool)
@@ -315,6 +329,7 @@ def check_config(c):

    # tensorboard
    _check_argument('print_step', c, restricted=True, val_type=int, min_val=1)
+   _check_argument('tb_plot_step', c, restricted=True, val_type=int, min_val=1)
    _check_argument('save_step', c, restricted=True, val_type=int, min_val=1)
    _check_argument('checkpoint', c, restricted=True, val_type=bool)
    _check_argument('tb_model_param_stats', c, restricted=True, val_type=bool)
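The `set_init_dict` change above passes the checkpoint's state dict (`checkpoint['model']`) directly and filters it against the new model, so layers that were renamed or resized are simply skipped. A minimal sketch of that partial-restore idea, with hypothetical layer names and shapes rather than the actual model keys:

```python
import torch

def partial_init(model_dict, checkpoint_state):
    # keep only layers that exist in the new model and whose shapes still match
    pretrained = {
        k: v for k, v in checkpoint_state.items()
        if k in model_dict and v.shape == model_dict[k].shape
    }
    model_dict.update(pretrained)
    return model_dict

# hypothetical example: 'decoder.attn' changed shape, 'encoder.emb' is reused
model_dict = {'encoder.emb': torch.zeros(4, 8), 'decoder.attn': torch.zeros(2, 2)}
checkpoint_state = {'encoder.emb': torch.ones(4, 8), 'decoder.attn': torch.ones(3, 3)}
model_dict = partial_init(model_dict, checkpoint_state)
print(model_dict['encoder.emb'].sum())   # 32.0 -> restored from the checkpoint
print(model_dict['decoder.attn'].sum())  # 0.0  -> skipped, shape mismatch
```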
@@ -3,7 +3,8 @@ from tensorboardX import SummaryWriter


class TensorboardLogger(object):
-   def __init__(self, log_dir):
+   def __init__(self, log_dir, model_name):
+       self.model_name = model_name
        self.writer = SummaryWriter(log_dir)
        self.train_stats = {}
        self.eval_stats = {}
@@ -50,31 +51,31 @@ class TensorboardLogger(object):
            traceback.print_exc()

    def tb_train_iter_stats(self, step, stats):
-       self.dict_to_tb_scalar("TrainIterStats", stats, step)
+       self.dict_to_tb_scalar(f"{self.model_name}_TrainIterStats", stats, step)

    def tb_train_epoch_stats(self, step, stats):
-       self.dict_to_tb_scalar("TrainEpochStats", stats, step)
+       self.dict_to_tb_scalar(f"{self.model_name}_TrainEpochStats", stats, step)

    def tb_train_figures(self, step, figures):
-       self.dict_to_tb_figure("TrainFigures", figures, step)
+       self.dict_to_tb_figure(f"{self.model_name}_TrainFigures", figures, step)

    def tb_train_audios(self, step, audios, sample_rate):
-       self.dict_to_tb_audios("TrainAudios", audios, step, sample_rate)
+       self.dict_to_tb_audios(f"{self.model_name}_TrainAudios", audios, step, sample_rate)

    def tb_eval_stats(self, step, stats):
-       self.dict_to_tb_scalar("EvalStats", stats, step)
+       self.dict_to_tb_scalar(f"{self.model_name}_EvalStats", stats, step)

    def tb_eval_figures(self, step, figures):
-       self.dict_to_tb_figure("EvalFigures", figures, step)
+       self.dict_to_tb_figure(f"{self.model_name}_EvalFigures", figures, step)

    def tb_eval_audios(self, step, audios, sample_rate):
-       self.dict_to_tb_audios("EvalAudios", audios, step, sample_rate)
+       self.dict_to_tb_audios(f"{self.model_name}_EvalAudios", audios, step, sample_rate)

    def tb_test_audios(self, step, audios, sample_rate):
-       self.dict_to_tb_audios("TestAudios", audios, step, sample_rate)
+       self.dict_to_tb_audios(f"{self.model_name}_TestAudios", audios, step, sample_rate)

    def tb_test_figures(self, step, figures):
-       self.dict_to_tb_figure("TestFigures", figures, step)
+       self.dict_to_tb_figure(f"{self.model_name}_TestFigures", figures, step)

    def tb_add_text(self, title, text, step):
        self.writer.add_text(title, text, step)
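The new `model_name` argument above only prefixes every scalar, figure and audio group, so TTS and vocoder runs end up under separate namespaces in the same Tensorboard log directory. A minimal sketch of the effect, assuming `tensorboardX` is installed (the `MiniLogger` class is illustrative, not the project API):

```python
from tensorboardX import SummaryWriter

class MiniLogger:
    def __init__(self, log_dir, model_name):
        self.model_name = model_name
        self.writer = SummaryWriter(log_dir)

    def scalars(self, scope, stats, step):
        # every group is prefixed, e.g. "TTS_TrainIterStats/loss"
        for key, value in stats.items():
            self.writer.add_scalar(f"{self.model_name}_{scope}/{key}", value, step)

logger = MiniLogger("/tmp/tb_demo", model_name="TTS")
logger.scalars("TrainIterStats", {"loss": 0.42, "grad_norm": 1.3}, step=100)
```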
@@ -77,7 +77,6 @@ def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=
    _phonemes_to_id = {s: i for i, s in enumerate(_phonemes)}

    sequence = []
-   text = text.replace(":", "")
    clean_text = _clean_text(text, cleaner_names)
    to_phonemes = text2phone(clean_text, language)
    if to_phonemes is None:

@@ -71,7 +71,7 @@ def remove_aux_symbols(text):
def replace_symbols(text):
    text = text.replace(';', ',')
    text = text.replace('-', ' ')
-   text = text.replace(':', ' ')
+   text = text.replace(':', ',')
    text = text.replace('&', 'and')
    return text
@ -1,51 +1,18 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
""" from https://github.com/keithito/tacotron """
|
||||||
|
|
||||||
|
import inflect
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
_inflect = inflect.engine()
|
||||||
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
|
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
|
||||||
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
|
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
|
||||||
_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
|
_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
|
||||||
_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
|
_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
|
||||||
_ordinal_re = re.compile(r'([0-9]+)(st|nd|rd|th)')
|
_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
|
||||||
_number_re = re.compile(r'[0-9]+')
|
_number_re = re.compile(r'[0-9]+')
|
||||||
|
|
||||||
_units = [
|
|
||||||
'', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine',
|
|
||||||
'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen',
|
|
||||||
'seventeen', 'eighteen', 'nineteen'
|
|
||||||
]
|
|
||||||
|
|
||||||
_tens = [
|
|
||||||
'',
|
|
||||||
'ten',
|
|
||||||
'twenty',
|
|
||||||
'thirty',
|
|
||||||
'forty',
|
|
||||||
'fifty',
|
|
||||||
'sixty',
|
|
||||||
'seventy',
|
|
||||||
'eighty',
|
|
||||||
'ninety',
|
|
||||||
]
|
|
||||||
|
|
||||||
_digit_groups = [
|
|
||||||
'',
|
|
||||||
'thousand',
|
|
||||||
'million',
|
|
||||||
'billion',
|
|
||||||
'trillion',
|
|
||||||
'quadrillion',
|
|
||||||
]
|
|
||||||
|
|
||||||
_ordinal_suffixes = [
|
|
||||||
('one', 'first'),
|
|
||||||
('two', 'second'),
|
|
||||||
('three', 'third'),
|
|
||||||
('five', 'fifth'),
|
|
||||||
('eight', 'eighth'),
|
|
||||||
('nine', 'ninth'),
|
|
||||||
('twelve', 'twelfth'),
|
|
||||||
('ty', 'tieth'),
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
def _remove_commas(m):
|
def _remove_commas(m):
|
||||||
return m.group(1).replace(',', '')
|
return m.group(1).replace(',', '')
|
||||||
|
@ -66,55 +33,33 @@ def _expand_dollars(m):
|
||||||
dollar_unit = 'dollar' if dollars == 1 else 'dollars'
|
dollar_unit = 'dollar' if dollars == 1 else 'dollars'
|
||||||
cent_unit = 'cent' if cents == 1 else 'cents'
|
cent_unit = 'cent' if cents == 1 else 'cents'
|
||||||
return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
|
return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
|
||||||
if dollars:
|
elif dollars:
|
||||||
dollar_unit = 'dollar' if dollars == 1 else 'dollars'
|
dollar_unit = 'dollar' if dollars == 1 else 'dollars'
|
||||||
return '%s %s' % (dollars, dollar_unit)
|
return '%s %s' % (dollars, dollar_unit)
|
||||||
if cents:
|
elif cents:
|
||||||
cent_unit = 'cent' if cents == 1 else 'cents'
|
cent_unit = 'cent' if cents == 1 else 'cents'
|
||||||
return '%s %s' % (cents, cent_unit)
|
return '%s %s' % (cents, cent_unit)
|
||||||
|
else:
|
||||||
return 'zero dollars'
|
return 'zero dollars'
|
||||||
|
|
||||||
|
|
||||||
def _standard_number_to_words(n, digit_group):
|
def _expand_ordinal(m):
|
||||||
parts = []
|
return _inflect.number_to_words(m.group(0))
|
||||||
if n >= 1000:
|
|
||||||
# Format next higher digit group.
|
|
||||||
parts.append(_standard_number_to_words(n // 1000, digit_group + 1))
|
|
||||||
n = n % 1000
|
|
||||||
|
|
||||||
if n >= 100:
|
|
||||||
parts.append('%s hundred' % _units[n // 100])
|
|
||||||
if n % 100 >= len(_units):
|
|
||||||
parts.append(_tens[(n % 100) // 10])
|
|
||||||
parts.append(_units[(n % 100) % 10])
|
|
||||||
else:
|
|
||||||
parts.append(_units[n % 100])
|
|
||||||
if n > 0:
|
|
||||||
parts.append(_digit_groups[digit_group])
|
|
||||||
return ' '.join([x for x in parts if x])
|
|
||||||
|
|
||||||
|
|
||||||
def _number_to_words(n):
|
|
||||||
# Handle special cases first, then go to the standard case:
|
|
||||||
if n >= 1000000000000000000:
|
|
||||||
return str(n) # Too large, just return the digits
|
|
||||||
if n == 0:
|
|
||||||
return 'zero'
|
|
||||||
if n % 100 == 0 and n % 1000 != 0 and n < 3000:
|
|
||||||
return _standard_number_to_words(n // 100, 0) + ' hundred'
|
|
||||||
return _standard_number_to_words(n, 0)
|
|
||||||
|
|
||||||
|
|
||||||
def _expand_number(m):
|
def _expand_number(m):
|
||||||
return _number_to_words(int(m.group(0)))
|
num = int(m.group(0))
|
||||||
|
if num > 1000 and num < 3000:
|
||||||
|
if num == 2000:
|
||||||
def _expand_ordinal(m):
|
return 'two thousand'
|
||||||
num = _number_to_words(int(m.group(1)))
|
elif num > 2000 and num < 2010:
|
||||||
for suffix, replacement in _ordinal_suffixes:
|
return 'two thousand ' + _inflect.number_to_words(num % 100)
|
||||||
if num.endswith(suffix):
|
elif num % 100 == 0:
|
||||||
return num[:-len(suffix)] + replacement
|
return _inflect.number_to_words(num // 100) + ' hundred'
|
||||||
return num + 'th'
|
else:
|
||||||
|
return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
|
||||||
|
else:
|
||||||
|
return _inflect.number_to_words(num, andword='')
|
||||||
|
|
||||||
|
|
||||||
def normalize_numbers(text):
|
def normalize_numbers(text):
|
||||||
|
|
|
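The number-normalization hunk above replaces the hand-written number-to-words tables with `inflect` and adds special handling for year-like values. A small standalone sketch that reuses the `_expand_number` logic shown in the hunk (the exact output strings depend on the installed `inflect` version):

```python
import re
import inflect

_inflect = inflect.engine()
_number_re = re.compile(r'[0-9]+')

def _expand_number(m):
    num = int(m.group(0))
    if 1000 < num < 3000:
        if num == 2000:
            return 'two thousand'
        if 2000 < num < 2010:
            return 'two thousand ' + _inflect.number_to_words(num % 100)
        if num % 100 == 0:
            return _inflect.number_to_words(num // 100) + ' hundred'
        # years such as 1999 are read in two-digit groups, e.g. "nineteen ninety-nine"
        return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
    return _inflect.number_to_words(num, andword='')

for text in ["in 1999", "since 2008", "costs 1500", "call 42"]:
    print(_number_re.sub(_expand_number, text))
```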
@@ -2,6 +2,17 @@ import torch
import numpy as np


+def setup_torch_training_env(cudnn_enable, cudnn_benchmark):
+    torch.backends.cudnn.enabled = cudnn_enable
+    torch.backends.cudnn.benchmark = cudnn_benchmark
+    torch.manual_seed(54321)
+    use_cuda = torch.cuda.is_available()
+    num_gpus = torch.cuda.device_count()
+    print(" > Using CUDA: ", use_cuda)
+    print(" > Number of GPUs: ", num_gpus)
+    return use_cuda, num_gpus
+
+
def check_update(model, grad_clip, ignore_stopnet=False):
    r'''Check model gradient against unexpected jumps and failures'''
    skip_flag = False
@@ -9,6 +20,12 @@ def check_update(model, grad_clip, ignore_stopnet=False):
        grad_norm = torch.nn.utils.clip_grad_norm_([param for name, param in model.named_parameters() if 'stopnet' not in name], grad_clip)
    else:
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
+   # compatibility with different torch versions
+   if isinstance(grad_norm, float):
+       if np.isinf(grad_norm):
+           print(" | > Gradient is INF !!")
+           skip_flag = True
+   else:
        if torch.isinf(grad_norm):
            print(" | > Gradient is INF !!")
            skip_flag = True
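A hedged usage sketch of the two helpers above; it assumes they are importable from the training utilities module and that `check_update` returns the clipped gradient norm together with the skip flag, as its body suggests:

```python
import torch
# module path assumed; adjust to wherever training.py lives in your checkout
from TTS.utils.training import setup_torch_training_env, check_update

use_cuda, num_gpus = setup_torch_training_env(cudnn_enable=True, cudnn_benchmark=False)

model = torch.nn.Linear(10, 2)          # toy model just to exercise the helpers
loss = model(torch.randn(4, 10)).sum()
loss.backward()

grad_norm, skip_flag = check_update(model, grad_clip=5.0)
if not skip_flag:
    print("clipped grad norm:", grad_norm)
```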
@@ -27,14 +27,15 @@ def plot_alignment(alignment, info=None, fig_size=(16, 10), title=None):
    return fig


-def plot_spectrogram(linear_output, audio, fig_size=(16, 10)):
-   if isinstance(linear_output, torch.Tensor):
-       linear_output_ = linear_output.detach().cpu().numpy().squeeze()
+def plot_spectrogram(spectrogram, ap=None, fig_size=(16, 10)):
+   if isinstance(spectrogram, torch.Tensor):
+       spectrogram_ = spectrogram.detach().cpu().numpy().squeeze().T
    else:
-       linear_output_ = linear_output
-   spectrogram = audio._denormalize(linear_output_.T)  # pylint: disable=protected-access
+       spectrogram_ = spectrogram.T
+   if ap is not None:
+       spectrogram_ = ap._denormalize(spectrogram_)  # pylint: disable=protected-access
    fig = plt.figure(figsize=fig_size)
-   plt.imshow(spectrogram, aspect="auto", origin="lower")
+   plt.imshow(spectrogram_, aspect="auto", origin="lower")
    plt.colorbar()
    plt.tight_layout()
    return fig
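A short usage sketch of the reworked `plot_spectrogram` signature; `mel` stands for any `[T, C]` model output and `ap` for an `AudioProcessor` instance (both hypothetical here):

```python
import numpy as np

mel = np.random.rand(400, 80)          # frames x mel channels, as returned by the model

fig = plot_spectrogram(mel)            # plot the values as-is
fig = plot_spectrogram(mel, ap=ap)     # denormalize through the AudioProcessor first
fig.savefig("/tmp/mel.png")
```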
@@ -0,0 +1,38 @@
# Mozilla TTS Vocoders (Experimental)

Here we provide different vocoder implementations which can be combined with our TTS models to enable a "FASTER THAN REAL-TIME" end-to-end TTS stack.

Currently, there are implementations of the following models.

- MelGAN
- MultiBand-MelGAN
- GAN-TTS (discriminator only)

It is also very easy to adapt different vocoder models, as we provide a flexible and modular (but not too modular) framework here.

## Training a model

You can see an example [Colab Notebook]() (coming soon) training MelGAN with the LJSpeech dataset.

In order to train a new model, you need to collect all your wav files under a common parent folder and give this path to the `data_path` field in `config.json`.

You need to define the other relevant parameters in your `config.json` and then start training with the following command from the Mozilla TTS root path.

```CUDA_VISIBLE_DEVICES='1' python vocoder/train.py --config_path path/to/config.json```

Example config files can be found under the `vocoder/configs/` folder.

You can continue a previous training run with the following command.

```CUDA_VISIBLE_DEVICES='1' python vocoder/train.py --continue_path path/to/your/model/folder```

You can fine-tune a pre-trained model with the following command.

```CUDA_VISIBLE_DEVICES='1' python vocoder/train.py --restore_path path/to/your/model.pth.tar```

Restoring a model starts a new training run in a different output folder; it only restores the model weights from the given checkpoint file. Continuing a training run, on the other hand, resumes from the exact conditions the previous run left off at.

You can also follow your training runs on Tensorboard, as you do with our TTS models.

## Acknowledgement
Thanks to @kan-bayashi for his [repository](https://github.com/kan-bayashi/ParallelWaveGAN), which was the starting point of our work.
@ -0,0 +1,151 @@
|
||||||
|
{
|
||||||
|
"run_name": "multiband-melgan-rwd",
|
||||||
|
"run_description": "multiband melgan with random window discriminator from https://arxiv.org/pdf/1909.11646.pdf",
|
||||||
|
|
||||||
|
// AUDIO PARAMETERS
|
||||||
|
"audio":{
|
||||||
|
// stft parameters
|
||||||
|
"num_freq": 513, // number of stft frequency levels. Size of the linear spectogram frame.
|
||||||
|
"win_length": 1024, // stft window length in ms.
|
||||||
|
"hop_length": 256, // stft window hop-lengh in ms.
|
||||||
|
"frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used.
|
||||||
|
"frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used.
|
||||||
|
|
||||||
|
// Audio processing parameters
|
||||||
|
"sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
|
||||||
|
"preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
|
||||||
|
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
|
||||||
|
|
||||||
|
// Silence trimming
|
||||||
|
"do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
|
||||||
|
"trim_db": 60, // threshold for timming silence. Set this according to your dataset.
|
||||||
|
|
||||||
|
// Griffin-Lim
|
||||||
|
"power": 1.5, // value to sharpen wav signals after GL algorithm.
|
||||||
|
"griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
|
||||||
|
|
||||||
|
// MelSpectrogram parameters
|
||||||
|
"num_mels": 80, // size of the mel spec frame.
|
||||||
|
"mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
|
||||||
|
"mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
|
||||||
|
|
||||||
|
// Normalization parameters
|
||||||
|
"signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
|
||||||
|
"min_level_db": -100, // lower bound for normalization
|
||||||
|
"symmetric_norm": true, // move normalization to range [-1, 1]
|
||||||
|
"max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
|
||||||
|
"clip_norm": true, // clip normalized values into the range.
|
||||||
|
"stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
|
||||||
|
},
|
||||||
|
|
||||||
|
// DISTRIBUTED TRAINING
|
||||||
|
// "distributed":{
|
||||||
|
// "backend": "nccl",
|
||||||
|
// "url": "tcp:\/\/localhost:54321"
|
||||||
|
// },
|
||||||
|
|
||||||
|
// MODEL PARAMETERS
|
||||||
|
"use_pqmf": true,
|
||||||
|
|
||||||
|
// LOSS PARAMETERS
|
||||||
|
"use_stft_loss": true,
|
||||||
|
"use_subband_stft_loss": true,
|
||||||
|
"use_mse_gan_loss": true,
|
||||||
|
"use_hinge_gan_loss": false,
|
||||||
|
"use_feat_match_loss": false, // use only with melgan discriminators
|
||||||
|
|
||||||
|
// loss weights
|
||||||
|
"stft_loss_weight": 0.5,
|
||||||
|
"subband_stft_loss_weight": 0.5,
|
||||||
|
"mse_G_loss_weight": 2.5,
|
||||||
|
"hinge_G_loss_weight": 2.5,
|
||||||
|
"feat_match_loss_weight": 25,
|
||||||
|
|
||||||
|
// multiscale stft loss parameters
|
||||||
|
"stft_loss_params": {
|
||||||
|
"n_ffts": [1024, 2048, 512],
|
||||||
|
"hop_lengths": [120, 240, 50],
|
||||||
|
"win_lengths": [600, 1200, 240]
|
||||||
|
},
|
||||||
|
|
||||||
|
// subband multiscale stft loss parameters
|
||||||
|
"subband_stft_loss_params":{
|
||||||
|
"n_ffts": [384, 683, 171],
|
||||||
|
"hop_lengths": [30, 60, 10],
|
||||||
|
"win_lengths": [150, 300, 60]
|
||||||
|
},
|
||||||
|
|
||||||
|
"target_loss": "avg_G_loss", // loss value to pick the best model to save after each epoch
|
||||||
|
|
||||||
|
// DISCRIMINATOR
|
||||||
|
"discriminator_model": "random_window_discriminator",
|
||||||
|
"discriminator_model_params":{
|
||||||
|
"uncond_disc_donwsample_factors": [8, 4],
|
||||||
|
"cond_disc_downsample_factors": [[8, 4, 2, 2, 2], [8, 4, 2, 2], [8, 4, 2], [8, 4], [4, 2, 2]],
|
||||||
|
"cond_disc_out_channels": [[128, 128, 256, 256], [128, 256, 256], [128, 256], [256], [128, 256]],
|
||||||
|
"window_sizes": [512, 1024, 2048, 4096, 8192]
|
||||||
|
},
|
||||||
|
"steps_to_start_discriminator": 200000, // steps required to start GAN trainining.1
|
||||||
|
|
||||||
|
// GENERATOR
|
||||||
|
"generator_model": "multiband_melgan_generator",
|
||||||
|
"generator_model_params": {
|
||||||
|
"upsample_factors":[8, 4, 2],
|
||||||
|
"num_res_blocks": 4
|
||||||
|
},
|
||||||
|
|
||||||
|
// DATASET
|
||||||
|
"data_path": "/home/erogol/Data/LJSpeech-1.1/wavs/",
|
||||||
|
"seq_len": 16384,
|
||||||
|
"pad_short": 2000,
|
||||||
|
"conv_pad": 0,
|
||||||
|
"use_noise_augment": false,
|
||||||
|
"use_cache": true,
|
||||||
|
|
||||||
|
"reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
|
||||||
|
|
||||||
|
// TRAINING
|
||||||
|
"batch_size": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
|
||||||
|
|
||||||
|
// VALIDATION
|
||||||
|
"run_eval": true,
|
||||||
|
"test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time.
|
||||||
|
"test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
|
||||||
|
|
||||||
|
// OPTIMIZER
|
||||||
|
"noam_schedule": false, // use noam warmup and lr schedule.
|
||||||
|
"warmup_steps_gen": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
|
||||||
|
"warmup_steps_disc": 4000,
|
||||||
|
"epochs": 10000, // total number of epochs to train.
|
||||||
|
"wd": 0.0, // Weight decay weight.
|
||||||
|
"gen_clip_grad": -1, // Generator gradient clipping threshold. Apply gradient clipping if > 0
|
||||||
|
"disc_clip_grad": -1, // Discriminator gradient clipping threshold.
|
||||||
|
"lr_scheduler_gen": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
|
||||||
|
"lr_scheduler_gen_params": {
|
||||||
|
"gamma": 0.5,
|
||||||
|
"milestones": [100000, 200000, 300000, 400000, 500000, 600000]
|
||||||
|
},
|
||||||
|
"lr_scheduler_disc": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
|
||||||
|
"lr_scheduler_disc_params": {
|
||||||
|
"gamma": 0.5,
|
||||||
|
"milestones": [100000, 200000, 300000, 400000, 500000, 600000]
|
||||||
|
},
|
||||||
|
"lr_gen": 1e-4, // Initial learning rate. If Noam decay is active, maximum learning rate.
|
||||||
|
"lr_disc": 1e-4,
|
||||||
|
|
||||||
|
// TENSORBOARD and LOGGING
|
||||||
|
"print_step": 25, // Number of steps to log traning on console.
|
||||||
|
"print_eval": false, // If True, it prints loss values for each step in eval run.
|
||||||
|
"save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints.
|
||||||
|
"checkpoint": true, // If true, it saves checkpoints per "save_step"
|
||||||
|
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
|
||||||
|
|
||||||
|
// DATA LOADING
|
||||||
|
"num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values.
|
||||||
|
"num_val_loader_workers": 4, // number of evaluation data loader processes.
|
||||||
|
"eval_split_size": 10,
|
||||||
|
|
||||||
|
// PATHS
|
||||||
|
"output_path": "/home/erogol/Models/LJSpeech/"
|
||||||
|
}
|
||||||
|
|
|
@ -0,0 +1,144 @@
|
||||||
|
{
|
||||||
|
"run_name": "multiband-melgan",
|
||||||
|
"run_description": "multiband melgan mean-var scaling",
|
||||||
|
|
||||||
|
// AUDIO PARAMETERS
|
||||||
|
"audio":{
|
||||||
|
"fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame.
|
||||||
|
"win_length": 1024, // stft window length in ms.
|
||||||
|
"hop_length": 256, // stft window hop-lengh in ms.
|
||||||
|
"frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used.
|
||||||
|
"frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used.
|
||||||
|
|
||||||
|
// Audio processing parameters
|
||||||
|
"sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
|
||||||
|
"preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
|
||||||
|
"ref_level_db": 0, // reference level db, theoretically 20db is the sound of air.
|
||||||
|
|
||||||
|
// Silence trimming
|
||||||
|
"do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
|
||||||
|
"trim_db": 60, // threshold for timming silence. Set this according to your dataset.
|
||||||
|
|
||||||
|
// MelSpectrogram parameters
|
||||||
|
"num_mels": 80, // size of the mel spec frame.
|
||||||
|
"mel_fmin": 50.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
|
||||||
|
"mel_fmax": 7600.0, // maximum freq level for mel-spec. Tune for dataset!!
|
||||||
|
"spec_gain": 1.0, // scaler value appplied after log transform of spectrogram.
|
||||||
|
|
||||||
|
// Normalization parameters
|
||||||
|
"signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
|
||||||
|
"min_level_db": -100, // lower bound for normalization
|
||||||
|
"symmetric_norm": true, // move normalization to range [-1, 1]
|
||||||
|
"max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
|
||||||
|
"clip_norm": true, // clip normalized values into the range.
|
||||||
|
"stats_path": "/home/erogol/Data/LJSpeech-1.1/scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
|
||||||
|
},
|
||||||
|
|
||||||
|
// DISTRIBUTED TRAINING
|
||||||
|
// "distributed":{
|
||||||
|
// "backend": "nccl",
|
||||||
|
// "url": "tcp:\/\/localhost:54321"
|
||||||
|
// },
|
||||||
|
|
||||||
|
// MODEL PARAMETERS
|
||||||
|
"use_pqmf": true,
|
||||||
|
|
||||||
|
// LOSS PARAMETERS
|
||||||
|
"use_stft_loss": true,
|
||||||
|
"use_subband_stft_loss": true,
|
||||||
|
"use_mse_gan_loss": true,
|
||||||
|
"use_hinge_gan_loss": false,
|
||||||
|
"use_feat_match_loss": false, // use only with melgan discriminators
|
||||||
|
|
||||||
|
// loss weights
|
||||||
|
"stft_loss_weight": 0.5,
|
||||||
|
"subband_stft_loss_weight": 0.5,
|
||||||
|
"mse_G_loss_weight": 2.5,
|
||||||
|
"hinge_G_loss_weight": 2.5,
|
||||||
|
"feat_match_loss_weight": 25,
|
||||||
|
|
||||||
|
// multiscale stft loss parameters
|
||||||
|
"stft_loss_params": {
|
||||||
|
"n_ffts": [1024, 2048, 512],
|
||||||
|
"hop_lengths": [120, 240, 50],
|
||||||
|
"win_lengths": [600, 1200, 240]
|
||||||
|
},
|
||||||
|
|
||||||
|
// subband multiscale stft loss parameters
|
||||||
|
"subband_stft_loss_params":{
|
||||||
|
"n_ffts": [384, 683, 171],
|
||||||
|
"hop_lengths": [30, 60, 10],
|
||||||
|
"win_lengths": [150, 300, 60]
|
||||||
|
},
|
||||||
|
|
||||||
|
"target_loss": "avg_G_loss", // loss value to pick the best model to save after each epoch
|
||||||
|
|
||||||
|
// DISCRIMINATOR
|
||||||
|
"discriminator_model": "melgan_multiscale_discriminator",
|
||||||
|
"discriminator_model_params":{
|
||||||
|
"base_channels": 16,
|
||||||
|
"max_channels":512,
|
||||||
|
"downsample_factors":[4, 4, 4]
|
||||||
|
},
|
||||||
|
"steps_to_start_discriminator": 200000, // steps required to start GAN trainining.1
|
||||||
|
|
||||||
|
// GENERATOR
|
||||||
|
"generator_model": "multiband_melgan_generator",
|
||||||
|
"generator_model_params": {
|
||||||
|
"upsample_factors":[8, 4, 2],
|
||||||
|
"num_res_blocks": 4
|
||||||
|
},
|
||||||
|
|
||||||
|
// DATASET
|
||||||
|
"data_path": "/home/erogol/Data/LJSpeech-1.1/wavs/",
|
||||||
|
"feature_path": null,
|
||||||
|
"seq_len": 16384,
|
||||||
|
"pad_short": 2000,
|
||||||
|
"conv_pad": 0,
|
||||||
|
"use_noise_augment": false,
|
||||||
|
"use_cache": true,
|
||||||
|
|
||||||
|
"reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
|
||||||
|
|
||||||
|
// TRAINING
|
||||||
|
"batch_size": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
|
||||||
|
|
||||||
|
// VALIDATION
|
||||||
|
"run_eval": true,
|
||||||
|
"test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time.
|
||||||
|
"test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
|
||||||
|
|
||||||
|
// OPTIMIZER
|
||||||
|
"epochs": 10000, // total number of epochs to train.
|
||||||
|
"wd": 0.0, // Weight decay weight.
|
||||||
|
"gen_clip_grad": -1, // Generator gradient clipping threshold. Apply gradient clipping if > 0
|
||||||
|
"disc_clip_grad": -1, // Discriminator gradient clipping threshold.
|
||||||
|
"lr_scheduler_gen": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
|
||||||
|
"lr_scheduler_gen_params": {
|
||||||
|
"gamma": 0.5,
|
||||||
|
"milestones": [100000, 200000, 300000, 400000, 500000, 600000]
|
||||||
|
},
|
||||||
|
"lr_scheduler_disc": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
|
||||||
|
"lr_scheduler_disc_params": {
|
||||||
|
"gamma": 0.5,
|
||||||
|
"milestones": [100000, 200000, 300000, 400000, 500000, 600000]
|
||||||
|
},
|
||||||
|
"lr_gen": 1e-4, // Initial learning rate. If Noam decay is active, maximum learning rate.
|
||||||
|
"lr_disc": 1e-4,
|
||||||
|
|
||||||
|
// TENSORBOARD and LOGGING
|
||||||
|
"print_step": 25, // Number of steps to log traning on console.
|
||||||
|
"print_eval": false, // If True, it prints loss values for each step in eval run.
|
||||||
|
"save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints.
|
||||||
|
"checkpoint": true, // If true, it saves checkpoints per "save_step"
|
||||||
|
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
|
||||||
|
|
||||||
|
// DATA LOADING
|
||||||
|
"num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values.
|
||||||
|
"num_val_loader_workers": 4, // number of evaluation data loader processes.
|
||||||
|
"eval_split_size": 10,
|
||||||
|
|
||||||
|
// PATHS
|
||||||
|
"output_path": "/home/erogol/Models/LJSpeech/"
|
||||||
|
}
|
||||||
|
|
|
@ -0,0 +1,126 @@
|
||||||
|
import os
|
||||||
|
import glob
|
||||||
|
import torch
|
||||||
|
import random
|
||||||
|
import numpy as np
|
||||||
|
from torch.utils.data import Dataset
|
||||||
|
from multiprocessing import Manager
|
||||||
|
|
||||||
|
|
||||||
|
class GANDataset(Dataset):
|
||||||
|
"""
|
||||||
|
GAN Dataset searches for all the wav files under the root path
|
||||||
|
and converts them to acoustic features on the fly and returns
|
||||||
|
random segments of (audio, feature) couples.
|
||||||
|
"""
|
||||||
|
def __init__(self,
|
||||||
|
ap,
|
||||||
|
items,
|
||||||
|
seq_len,
|
||||||
|
hop_len,
|
||||||
|
pad_short,
|
||||||
|
conv_pad=2,
|
||||||
|
is_training=True,
|
||||||
|
return_segments=True,
|
||||||
|
use_noise_augment=False,
|
||||||
|
use_cache=False,
|
||||||
|
verbose=False):
|
||||||
|
|
||||||
|
self.ap = ap
|
||||||
|
self.item_list = items
|
||||||
|
self.compute_feat = not isinstance(items[0], (tuple, list))
|
||||||
|
self.seq_len = seq_len
|
||||||
|
self.hop_len = hop_len
|
||||||
|
self.pad_short = pad_short
|
||||||
|
self.conv_pad = conv_pad
|
||||||
|
self.is_training = is_training
|
||||||
|
self.return_segments = return_segments
|
||||||
|
self.use_cache = use_cache
|
||||||
|
self.use_noise_augment = use_noise_augment
|
||||||
|
self.verbose = verbose
|
||||||
|
|
||||||
|
assert seq_len % hop_len == 0, " [!] seq_len has to be a multiple of hop_len."
|
||||||
|
self.feat_frame_len = seq_len // hop_len + (2 * conv_pad)
|
||||||
|
|
||||||
|
# map G and D instances
|
||||||
|
self.G_to_D_mappings = list(range(len(self.item_list)))
|
||||||
|
self.shuffle_mapping()
|
||||||
|
|
||||||
|
# cache acoustic features
|
||||||
|
if use_cache:
|
||||||
|
self.create_feature_cache()
|
||||||
|
|
||||||
|
def create_feature_cache(self):
|
||||||
|
self.manager = Manager()
|
||||||
|
self.cache = self.manager.list()
|
||||||
|
self.cache += [None for _ in range(len(self.item_list))]
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def find_wav_files(path):
|
||||||
|
return glob.glob(os.path.join(path, '**', '*.wav'), recursive=True)
|
||||||
|
|
||||||
|
def __len__(self):
|
||||||
|
return len(self.item_list)
|
||||||
|
|
||||||
|
def __getitem__(self, idx):
|
||||||
|
""" Return different items for Generator and Discriminator and
|
||||||
|
cache acoustic features """
|
||||||
|
if self.return_segments:
|
||||||
|
idx2 = self.G_to_D_mappings[idx]
|
||||||
|
item1 = self.load_item(idx)
|
||||||
|
item2 = self.load_item(idx2)
|
||||||
|
return item1, item2
|
||||||
|
item1 = self.load_item(idx)
|
||||||
|
return item1
|
||||||
|
|
||||||
|
def shuffle_mapping(self):
|
||||||
|
random.shuffle(self.G_to_D_mappings)
|
||||||
|
|
||||||
|
def load_item(self, idx):
|
||||||
|
""" load (audio, feat) couple """
|
||||||
|
if self.compute_feat:
|
||||||
|
# compute features from wav
|
||||||
|
wavpath = self.item_list[idx]
|
||||||
|
# print(wavpath)
|
||||||
|
|
||||||
|
if self.use_cache and self.cache[idx] is not None:
|
||||||
|
audio, mel = self.cache[idx]
|
||||||
|
else:
|
||||||
|
audio = self.ap.load_wav(wavpath)
|
||||||
|
mel = self.ap.melspectrogram(audio)
|
||||||
|
else:
|
||||||
|
|
||||||
|
# load precomputed features
|
||||||
|
wavpath, feat_path = self.item_list[idx]
|
||||||
|
|
||||||
|
if self.use_cache and self.cache[idx] is not None:
|
||||||
|
audio, mel = self.cache[idx]
|
||||||
|
else:
|
||||||
|
audio = self.ap.load_wav(wavpath)
|
||||||
|
mel = np.load(feat_path)
|
||||||
|
|
||||||
|
if len(audio) < self.seq_len + self.pad_short:
|
||||||
|
audio = np.pad(audio, (0, self.seq_len + self.pad_short - len(audio)), \
|
||||||
|
mode='constant', constant_values=0.0)
|
||||||
|
|
||||||
|
# correct the audio length wrt padding applied in stft
|
||||||
|
audio = np.pad(audio, (0, self.hop_len), mode="edge")
|
||||||
|
audio = audio[:mel.shape[-1] * self.hop_len]
|
||||||
|
assert mel.shape[-1] * self.hop_len == audio.shape[-1], f' [!] {mel.shape[-1] * self.hop_len} vs {audio.shape[-1]}'
|
||||||
|
|
||||||
|
audio = torch.from_numpy(audio).float().unsqueeze(0)
|
||||||
|
mel = torch.from_numpy(mel).float().squeeze(0)
|
||||||
|
|
||||||
|
if self.return_segments:
|
||||||
|
max_mel_start = mel.shape[1] - self.feat_frame_len
|
||||||
|
mel_start = random.randint(0, max_mel_start)
|
||||||
|
mel_end = mel_start + self.feat_frame_len
|
||||||
|
mel = mel[:, mel_start:mel_end]
|
||||||
|
|
||||||
|
audio_start = mel_start * self.hop_len
|
||||||
|
audio = audio[:, audio_start:audio_start +
|
||||||
|
self.seq_len]
|
||||||
|
|
||||||
|
if self.use_noise_augment and self.is_training and self.return_segments:
|
||||||
|
audio = audio + (1 / 32768) * torch.randn_like(audio)
|
||||||
|
return (mel, audio)
|
|
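The index arithmetic in `load_item` keeps the mel segment and the audio segment aligned in time: the audio is padded by `hop_len`, trimmed to `mel.shape[-1] * hop_len`, and a segment start is converted between the two domains by multiplying with `hop_len`. A toy sketch of that arithmetic with assumed config values:

```python
# toy numbers only; the real values come from the vocoder config (seq_len, hop_length, conv_pad)
seq_len, hop_len, conv_pad = 16384, 256, 0

feat_frame_len = seq_len // hop_len + 2 * conv_pad   # 64 mel frames per training segment

mel_start = 10                                       # picked by random.randint in load_item
mel_end = mel_start + feat_frame_len                 # mel frames [10, 74)
audio_start = mel_start * hop_len                    # sample 2560
audio_end = audio_start + seq_len                    # sample 18944

# the audio window covers exactly the same time span as the selected mel frames
assert (audio_end - audio_start) == (mel_end - mel_start - 2 * conv_pad) * hop_len
```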
@ -0,0 +1,37 @@
|
||||||
|
import glob
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
def find_wav_files(data_path):
|
||||||
|
wav_paths = glob.glob(os.path.join(data_path, '**', '*.wav'), recursive=True)
|
||||||
|
return wav_paths
|
||||||
|
|
||||||
|
|
||||||
|
def find_feat_files(data_path):
|
||||||
|
feat_paths = glob.glob(os.path.join(data_path, '**', '*.npy'), recursive=True)
|
||||||
|
return feat_paths
|
||||||
|
|
||||||
|
|
||||||
|
def load_wav_data(data_path, eval_split_size):
|
||||||
|
wav_paths = find_wav_files(data_path)
|
||||||
|
np.random.seed(0)
|
||||||
|
np.random.shuffle(wav_paths)
|
||||||
|
return wav_paths[:eval_split_size], wav_paths[eval_split_size:]
|
||||||
|
|
||||||
|
|
||||||
|
def load_wav_feat_data(data_path, feat_path, eval_split_size):
|
||||||
|
wav_paths = sorted(find_wav_files(data_path))
|
||||||
|
feat_paths = sorted(find_feat_files(feat_path))
|
||||||
|
assert len(wav_paths) == len(feat_paths)
|
||||||
|
for wav, feat in zip(wav_paths, feat_paths):
|
||||||
|
wav_name = Path(wav).stem
|
||||||
|
feat_name = Path(feat).stem
|
||||||
|
assert wav_name == feat_name
|
||||||
|
|
||||||
|
items = list(zip(wav_paths, feat_paths))
|
||||||
|
np.random.seed(0)
|
||||||
|
np.random.shuffle(items)
|
||||||
|
return items[:eval_split_size], items[eval_split_size:]
|
|
@ -0,0 +1,309 @@
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from torch import nn
|
||||||
|
from torch.nn import functional as F
|
||||||
|
|
||||||
|
|
||||||
|
class TorchSTFT():
|
||||||
|
def __init__(self, n_fft, hop_length, win_length, window='hann_window'):
|
||||||
|
""" Torch based STFT operation """
|
||||||
|
self.n_fft = n_fft
|
||||||
|
self.hop_length = hop_length
|
||||||
|
self.win_length = win_length
|
||||||
|
self.window = getattr(torch, window)(win_length)
|
||||||
|
|
||||||
|
def __call__(self, x):
|
||||||
|
# B x D x T x 2
|
||||||
|
o = torch.stft(x,
|
||||||
|
self.n_fft,
|
||||||
|
self.hop_length,
|
||||||
|
self.win_length,
|
||||||
|
self.window,
|
||||||
|
center=True,
|
||||||
|
pad_mode="reflect", # compatible with audio.py
|
||||||
|
normalized=False,
|
||||||
|
onesided=True)
|
||||||
|
M = o[:, :, :, 0]
|
||||||
|
P = o[:, :, :, 1]
|
||||||
|
return torch.sqrt(torch.clamp(M ** 2 + P ** 2, min=1e-8))
|
||||||
|
|
||||||
|
|
||||||
|
#################################
|
||||||
|
# GENERATOR LOSSES
|
||||||
|
#################################
|
||||||
|
|
||||||
|
|
||||||
|
class STFTLoss(nn.Module):
|
||||||
|
""" Single scale STFT Loss """
|
||||||
|
def __init__(self, n_fft, hop_length, win_length):
|
||||||
|
super(STFTLoss, self).__init__()
|
||||||
|
self.n_fft = n_fft
|
||||||
|
self.hop_length = hop_length
|
||||||
|
self.win_length = win_length
|
||||||
|
self.stft = TorchSTFT(n_fft, hop_length, win_length)
|
||||||
|
|
||||||
|
def forward(self, y_hat, y):
|
||||||
|
y_hat_M = self.stft(y_hat)
|
||||||
|
y_M = self.stft(y)
|
||||||
|
# magnitude loss
|
||||||
|
loss_mag = F.l1_loss(torch.log(y_M), torch.log(y_hat_M))
|
||||||
|
# spectral convergence loss
|
||||||
|
loss_sc = torch.norm(y_M - y_hat_M, p="fro") / torch.norm(y_M, p="fro")
|
||||||
|
return loss_mag, loss_sc
|
||||||
|
|
||||||
|
class MultiScaleSTFTLoss(torch.nn.Module):
|
||||||
|
""" Multi scale STFT loss """
|
||||||
|
def __init__(self,
|
||||||
|
n_ffts=(1024, 2048, 512),
|
||||||
|
hop_lengths=(120, 240, 50),
|
||||||
|
win_lengths=(600, 1200, 240)):
|
||||||
|
super(MultiScaleSTFTLoss, self).__init__()
|
||||||
|
self.loss_funcs = torch.nn.ModuleList()
|
||||||
|
for n_fft, hop_length, win_length in zip(n_ffts, hop_lengths, win_lengths):
|
||||||
|
self.loss_funcs.append(STFTLoss(n_fft, hop_length, win_length))
|
||||||
|
|
||||||
|
def forward(self, y_hat, y):
|
||||||
|
N = len(self.loss_funcs)
|
||||||
|
loss_sc = 0
|
||||||
|
loss_mag = 0
|
||||||
|
for f in self.loss_funcs:
|
||||||
|
lm, lsc = f(y_hat, y)
|
||||||
|
loss_mag += lm
|
||||||
|
loss_sc += lsc
|
||||||
|
loss_sc /= N
|
||||||
|
loss_mag /= N
|
||||||
|
return loss_mag, loss_sc
|
||||||
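A hedged usage sketch of the `MultiScaleSTFTLoss` defined above; the real call site lives in the vocoder training script (not part of this hunk), and the `[B, T]` waveform shapes are assumed:

```python
import torch

criterion = MultiScaleSTFTLoss(n_ffts=(1024, 2048, 512),
                               hop_lengths=(120, 240, 50),
                               win_lengths=(600, 1200, 240))

y = torch.randn(2, 16384)        # reference waveforms, [B, T]
y_hat = torch.randn(2, 16384)    # generator output, [B, T]

loss_mag, loss_sc = criterion(y_hat, y)
loss = loss_mag + loss_sc        # weighted by stft_loss_weight in the training config
```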
|
|
||||||
|
|
||||||
|
class MultiScaleSubbandSTFTLoss(MultiScaleSTFTLoss):
|
||||||
|
""" Multiscale STFT loss for multi band model outputs """
|
||||||
|
# pylint: disable=no-self-use
|
||||||
|
def forward(self, y_hat, y):
|
||||||
|
y_hat = y_hat.view(-1, 1, y_hat.shape[2])
|
||||||
|
y = y.view(-1, 1, y.shape[2])
|
||||||
|
return super().forward(y_hat.squeeze(1), y.squeeze(1))
|
||||||
|
|
||||||
|
|
||||||
|
class MSEGLoss(nn.Module):
|
||||||
|
""" Mean Squared Generator Loss """
|
||||||
|
# pylint: disable=no-self-use
|
||||||
|
def forward(self, score_real):
|
||||||
|
loss_fake = F.mse_loss(score_real, score_real.new_ones(score_real.shape))
|
||||||
|
return loss_fake
|
||||||
|
|
||||||
|
|
||||||
|
class HingeGLoss(nn.Module):
|
||||||
|
""" Hinge Discriminator Loss """
|
||||||
|
# pylint: disable=no-self-use
|
||||||
|
def forward(self, score_real):
|
||||||
|
# TODO: this might be wrong
|
||||||
|
loss_fake = torch.mean(F.relu(1. - score_real))
|
||||||
|
return loss_fake
|
||||||
|
|
||||||
|
|
||||||
|
##################################
|
||||||
|
# DISCRIMINATOR LOSSES
|
||||||
|
##################################
|
||||||
|
|
||||||
|
|
||||||
|
class MSEDLoss(nn.Module):
|
||||||
|
""" Mean Squared Discriminator Loss """
|
||||||
|
def __init__(self,):
|
||||||
|
super(MSEDLoss, self).__init__()
|
||||||
|
self.loss_func = nn.MSELoss()
|
||||||
|
|
||||||
|
# pylint: disable=no-self-use
|
||||||
|
def forward(self, score_fake, score_real):
|
||||||
|
loss_real = self.loss_func(score_real, score_real.new_ones(score_real.shape))
|
||||||
|
loss_fake = self.loss_func(score_fake, score_fake.new_zeros(score_fake.shape))
|
||||||
|
loss_d = loss_real + loss_fake
|
||||||
|
return loss_d, loss_real, loss_fake
|
||||||
|
|
||||||
|
|
||||||
|
class HingeDLoss(nn.Module):
|
||||||
|
""" Hinge Discriminator Loss """
|
||||||
|
# pylint: disable=no-self-use
|
||||||
|
def forward(self, score_fake, score_real):
|
||||||
|
loss_real = torch.mean(F.relu(1. - score_real))
|
||||||
|
loss_fake = torch.mean(F.relu(1. + score_fake))
|
||||||
|
loss_d = loss_real + loss_fake
|
||||||
|
return loss_d, loss_real, loss_fake
|
||||||
|
|
||||||
|
|
||||||
|
class MelganFeatureLoss(nn.Module):
|
||||||
|
def __init__(self,):
|
||||||
|
super(MelganFeatureLoss, self).__init__()
|
||||||
|
self.loss_func = nn.L1Loss()
|
||||||
|
|
||||||
|
# pylint: disable=no-self-use
|
||||||
|
def forward(self, fake_feats, real_feats):
|
||||||
|
loss_feats = 0
|
||||||
|
for fake_feat, real_feat in zip(fake_feats, real_feats):
|
||||||
|
loss_feats += self.loss_func(fake_feat, real_feat)
|
||||||
|
loss_feats /= len(fake_feats) + len(real_feats)
|
||||||
|
return loss_feats
|
||||||
|
|
||||||
|
|
||||||
|
#####################################
|
||||||
|
# LOSS WRAPPERS
|
||||||
|
#####################################
|
||||||
|
|
||||||
|
|
||||||
|
def _apply_G_adv_loss(scores_fake, loss_func):
|
||||||
|
""" Compute G adversarial loss function
|
||||||
|
and normalize values """
|
||||||
|
adv_loss = 0
|
||||||
|
if isinstance(scores_fake, list):
|
||||||
|
for score_fake in scores_fake:
|
||||||
|
fake_loss = loss_func(score_fake)
|
||||||
|
adv_loss += fake_loss
|
||||||
|
adv_loss /= len(scores_fake)
|
||||||
|
else:
|
||||||
|
fake_loss = loss_func(scores_fake)
|
||||||
|
adv_loss = fake_loss
|
||||||
|
return adv_loss
|
||||||
|
|
||||||
|
|
||||||
|
def _apply_D_loss(scores_fake, scores_real, loss_func):
|
||||||
|
""" Compute D loss func and normalize loss values """
|
||||||
|
loss = 0
|
||||||
|
real_loss = 0
|
||||||
|
fake_loss = 0
|
||||||
|
if isinstance(scores_fake, list):
|
||||||
|
# multi-scale loss
|
||||||
|
for score_fake, score_real in zip(scores_fake, scores_real):
|
||||||
|
total_loss, real_loss, fake_loss = loss_func(score_fake=score_fake, score_real=score_real)
|
||||||
|
loss += total_loss
|
||||||
|
real_loss += real_loss
|
||||||
|
fake_loss += fake_loss
|
||||||
|
# normalize loss values with number of scales
|
||||||
|
loss /= len(scores_fake)
|
||||||
|
real_loss /= len(scores_real)
|
||||||
|
fake_loss /= len(scores_fake)
|
||||||
|
else:
|
||||||
|
# single scale loss
|
||||||
|
total_loss, real_loss, fake_loss = loss_func(scores_fake, scores_real)
|
||||||
|
loss = total_loss
|
||||||
|
return loss, real_loss, fake_loss
|
||||||
|
|
||||||
|
|
||||||
|
##################################
|
||||||
|
# MODEL LOSSES
|
||||||
|
##################################
|
||||||
|
|
||||||
|
|
||||||
|
class GeneratorLoss(nn.Module):
|
||||||
|
def __init__(self, C):
|
||||||
|
""" Compute Generator Loss values depending on training
|
||||||
|
configuration """
|
||||||
|
super(GeneratorLoss, self).__init__()
|
||||||
|
assert not(C.use_mse_gan_loss and C.use_hinge_gan_loss),\
|
||||||
|
" [!] Cannot use HingeGANLoss and MSEGANLoss together."
|
||||||
|
|
||||||
|
self.use_stft_loss = C.use_stft_loss
|
||||||
|
self.use_subband_stft_loss = C.use_subband_stft_loss
|
||||||
|
self.use_mse_gan_loss = C.use_mse_gan_loss
|
||||||
|
self.use_hinge_gan_loss = C.use_hinge_gan_loss
|
||||||
|
self.use_feat_match_loss = C.use_feat_match_loss
|
||||||
|
|
||||||
|
self.stft_loss_weight = C.stft_loss_weight
|
||||||
|
self.subband_stft_loss_weight = C.subband_stft_loss_weight
|
||||||
|
self.mse_gan_loss_weight = C.mse_G_loss_weight
|
||||||
|
self.hinge_gan_loss_weight = C.hinge_G_loss_weight
|
||||||
|
self.feat_match_loss_weight = C.feat_match_loss_weight
|
||||||
|
|
||||||
|
if C.use_stft_loss:
|
||||||
|
self.stft_loss = MultiScaleSTFTLoss(**C.stft_loss_params)
|
||||||
|
if C.use_subband_stft_loss:
|
||||||
|
self.subband_stft_loss = MultiScaleSubbandSTFTLoss(**C.subband_stft_loss_params)
|
||||||
|
if C.use_mse_gan_loss:
|
||||||
|
self.mse_loss = MSEGLoss()
|
||||||
|
if C.use_hinge_gan_loss:
|
||||||
|
self.hinge_loss = HingeGLoss()
|
||||||
|
if C.use_feat_match_loss:
|
||||||
|
self.feat_match_loss = MelganFeatureLoss()
|
||||||
|
|
||||||
|
def forward(self, y_hat=None, y=None, scores_fake=None, feats_fake=None, feats_real=None, y_hat_sub=None, y_sub=None):
|
||||||
|
gen_loss = 0
|
||||||
|
adv_loss = 0
|
||||||
|
return_dict = {}
|
||||||
|
|
||||||
|
# STFT Loss
|
||||||
|
if self.use_stft_loss:
|
||||||
|
stft_loss_mg, stft_loss_sc = self.stft_loss(y_hat.squeeze(1), y.squeeze(1))
|
||||||
|
return_dict['G_stft_loss_mg'] = stft_loss_mg
|
||||||
|
return_dict['G_stft_loss_sc'] = stft_loss_sc
|
||||||
|
gen_loss += self.stft_loss_weight * (stft_loss_mg + stft_loss_sc)
|
||||||
|
|
||||||
|
# subband STFT Loss
|
||||||
|
if self.use_subband_stft_loss:
|
||||||
|
subband_stft_loss_mg, subband_stft_loss_sc = self.subband_stft_loss(y_hat_sub, y_sub)
|
||||||
|
return_dict['G_subband_stft_loss_mg'] = subband_stft_loss_mg
|
||||||
|
return_dict['G_subband_stft_loss_sc'] = subband_stft_loss_sc
|
||||||
|
gen_loss += self.subband_stft_loss_weight * (subband_stft_loss_mg + subband_stft_loss_sc)
|
||||||
|
|
||||||
|
# multiscale MSE adversarial loss
|
||||||
|
if self.use_mse_gan_loss and scores_fake is not None:
|
||||||
|
mse_fake_loss = _apply_G_adv_loss(scores_fake, self.mse_loss)
|
||||||
|
return_dict['G_mse_fake_loss'] = mse_fake_loss
|
||||||
|
adv_loss += self.mse_gan_loss_weight * mse_fake_loss
|
||||||
|
|
||||||
|
# multiscale Hinge adversarial loss
|
||||||
|
if self.use_hinge_gan_loss and not scores_fake is not None:
|
||||||
|
hinge_fake_loss = _apply_G_adv_loss(scores_fake, self.hinge_loss)
|
||||||
|
return_dict['G_hinge_fake_loss'] = hinge_fake_loss
|
||||||
|
adv_loss += self.hinge_gan_loss_weight * hinge_fake_loss
|
||||||
|
|
||||||
|
# Feature Matching Loss
|
||||||
|
if self.use_feat_match_loss and not feats_fake:
|
||||||
|
feat_match_loss = self.feat_match_loss(feats_fake, feats_real)
|
||||||
|
return_dict['G_feat_match_loss'] = feat_match_loss
|
||||||
|
adv_loss += self.feat_match_loss_weight * feat_match_loss
|
||||||
|
return_dict['G_loss'] = gen_loss + adv_loss
|
||||||
|
return_dict['G_gen_loss'] = gen_loss
|
||||||
|
return_dict['G_adv_loss'] = adv_loss
|
||||||
|
return return_dict
|
||||||
|
|
||||||
|
|
||||||
|
class DiscriminatorLoss(nn.Module):
|
||||||
|
""" Compute Discriminator Loss values depending on training
|
||||||
|
configuration """
|
||||||
|
def __init__(self, C):
|
||||||
|
super(DiscriminatorLoss, self).__init__()
|
||||||
|
assert not(C.use_mse_gan_loss and C.use_hinge_gan_loss),\
|
||||||
|
" [!] Cannot use HingeGANLoss and MSEGANLoss together."
|
||||||
|
|
||||||
|
self.use_mse_gan_loss = C.use_mse_gan_loss
|
||||||
|
self.use_hinge_gan_loss = C.use_hinge_gan_loss
|
||||||
|
|
||||||
|
if C.use_mse_gan_loss:
|
||||||
|
self.mse_loss = MSEDLoss()
|
||||||
|
if C.use_hinge_gan_loss:
|
||||||
|
self.hinge_loss = HingeDLoss()
|
||||||
|
|
||||||
|
def forward(self, scores_fake, scores_real):
|
||||||
|
loss = 0
|
||||||
|
return_dict = {}
|
||||||
|
|
||||||
|
if self.use_mse_gan_loss:
|
||||||
|
mse_D_loss, mse_D_real_loss, mse_D_fake_loss = _apply_D_loss(
|
||||||
|
scores_fake=scores_fake,
|
||||||
|
scores_real=scores_real,
|
||||||
|
loss_func=self.mse_loss)
|
||||||
|
return_dict['D_mse_gan_loss'] = mse_D_loss
|
||||||
|
return_dict['D_mse_gan_real_loss'] = mse_D_real_loss
|
||||||
|
return_dict['D_mse_gan_fake_loss'] = mse_D_fake_loss
|
||||||
|
loss += mse_D_loss
|
||||||
|
|
||||||
|
if self.use_hinge_gan_loss:
|
||||||
|
hinge_D_loss, hinge_D_real_loss, hinge_D_fake_loss = _apply_D_loss(
|
||||||
|
scores_fake=scores_fake,
|
||||||
|
scores_real=scores_real,
|
||||||
|
loss_func=self.hinge_loss)
|
||||||
|
return_dict['D_hinge_gan_loss'] = hinge_D_loss
|
||||||
|
return_dict['D_hinge_gan_real_loss'] = hinge_D_real_loss
|
||||||
|
return_dict['D_hinge_gan_fake_loss'] = hinge_D_fake_loss
|
||||||
|
loss += hinge_D_loss
|
||||||
|
|
||||||
|
return_dict['D_loss'] = loss
|
||||||
|
return return_dict
|
|
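A minimal usage sketch for the GeneratorLoss / DiscriminatorLoss wrappers above. The SimpleNamespace stands in for the training config object `C`; the field names match what the constructors read, but the values and the random score tensors are purely illustrative, not taken from a real config or model.

from types import SimpleNamespace
import torch

# Illustrative config: STFT and feature-matching terms disabled, MSE GAN loss enabled.
C = SimpleNamespace(
    use_stft_loss=False, use_subband_stft_loss=False,
    use_mse_gan_loss=True, use_hinge_gan_loss=False,
    use_feat_match_loss=False,
    stft_loss_weight=0.5, subband_stft_loss_weight=0.0,
    mse_G_loss_weight=2.5, hinge_G_loss_weight=0.0,
    feat_match_loss_weight=0.0)

criterion_G = GeneratorLoss(C)
criterion_D = DiscriminatorLoss(C)

# Random stand-ins for multi-scale discriminator outputs.
scores_fake = [torch.randn(4, 1, 32) for _ in range(3)]
scores_real = [torch.randn(4, 1, 32) for _ in range(3)]

g_losses = criterion_G(scores_fake=scores_fake)    # returns a dict with 'G_mse_fake_loss', 'G_loss', ...
d_losses = criterion_D(scores_fake, scores_real)   # returns a dict with 'D_mse_gan_loss', 'D_loss', ...
print(g_losses['G_loss'], d_losses['D_loss'])
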
@ -0,0 +1,45 @@
from torch import nn
from torch.nn.utils import weight_norm


class ResidualStack(nn.Module):
    def __init__(self, channels, num_res_blocks, kernel_size):
        super(ResidualStack, self).__init__()

        assert (kernel_size - 1) % 2 == 0, " [!] kernel_size has to be odd."
        base_padding = (kernel_size - 1) // 2

        self.blocks = nn.ModuleList()
        for idx in range(num_res_blocks):
            layer_kernel_size = kernel_size
            layer_dilation = layer_kernel_size**idx
            layer_padding = base_padding * layer_dilation
            self.blocks += [nn.Sequential(
                nn.LeakyReLU(0.2),
                nn.ReflectionPad1d(layer_padding),
                weight_norm(
                    nn.Conv1d(channels,
                              channels,
                              kernel_size=kernel_size,
                              dilation=layer_dilation,
                              bias=True)),
                nn.LeakyReLU(0.2),
                weight_norm(
                    nn.Conv1d(channels, channels, kernel_size=1, bias=True)),
            )]

        self.shortcuts = nn.ModuleList([
            weight_norm(nn.Conv1d(channels, channels, kernel_size=1,
                                  bias=True)) for i in range(num_res_blocks)
        ])

    def forward(self, x):
        for block, shortcut in zip(self.blocks, self.shortcuts):
            x = shortcut(x) + block(x)
        return x

    def remove_weight_norm(self):
        for block, shortcut in zip(self.blocks, self.shortcuts):
            nn.utils.remove_weight_norm(block[2])
            nn.utils.remove_weight_norm(block[4])
            nn.utils.remove_weight_norm(shortcut)

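A small shape check for the residual stack above, assuming it is importable as TTS.vocoder.layers.melgan.ResidualStack (the path the generator further below uses): with kernel_size=3 the per-block dilations are 3**idx = 1, 3, 9 and the reflection padding grows to match, so the sequence length is preserved.

import torch

from TTS.vocoder.layers.melgan import ResidualStack

stack = ResidualStack(channels=8, num_res_blocks=3, kernel_size=3)
x = torch.randn(1, 8, 100)     # (batch, channels, time), random stand-in
y = stack(x)                   # dilated convs are reflection-padded, so the length stays 100
print(y.shape)                 # torch.Size([1, 8, 100])
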
@ -0,0 +1,56 @@
import numpy as np
import torch
import torch.nn.functional as F

from scipy import signal as sig


# adapted from
# https://github.com/kan-bayashi/ParallelWaveGAN/tree/master/parallel_wavegan
class PQMF(torch.nn.Module):
    def __init__(self, N=4, taps=62, cutoff=0.15, beta=9.0):
        super(PQMF, self).__init__()

        self.N = N
        self.taps = taps
        self.cutoff = cutoff
        self.beta = beta

        QMF = sig.firwin(taps + 1, cutoff, window=('kaiser', beta))
        H = np.zeros((N, len(QMF)))
        G = np.zeros((N, len(QMF)))
        for k in range(N):
            constant_factor = (2 * k + 1) * (np.pi /
                                             (2 * N)) * (np.arange(taps + 1) -
                                                         ((taps - 1) / 2))
            phase = (-1)**k * np.pi / 4
            H[k] = 2 * QMF * np.cos(constant_factor + phase)

            G[k] = 2 * QMF * np.cos(constant_factor - phase)

        H = torch.from_numpy(H[:, None, :]).float()
        G = torch.from_numpy(G[None, :, :]).float()

        self.register_buffer("H", H)
        self.register_buffer("G", G)

        updown_filter = torch.zeros((N, N, N)).float()
        for k in range(N):
            updown_filter[k, k, 0] = 1.0
        self.register_buffer("updown_filter", updown_filter)
        self.N = N

        self.pad_fn = torch.nn.ConstantPad1d(taps // 2, 0.0)

    def forward(self, x):
        return self.analysis(x)

    def analysis(self, x):
        return F.conv1d(x, self.H, padding=self.taps // 2, stride=self.N)

    def synthesis(self, x):
        x = F.conv_transpose1d(x,
                               self.updown_filter * self.N,
                               stride=self.N)
        x = F.conv1d(x, self.G, padding=self.taps // 2)
        return x

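A quick round-trip sketch for the PQMF filter bank above, importable as TTS.vocoder.layers.pqmf.PQMF as the tests further below do. Random noise stands in for a waveform: analysis splits a (B, 1, T) signal into N sub-bands at 1/N rate, and synthesis maps them back to the original rate (reconstruction is near- rather than perfectly exact for this filter length).

import torch

from TTS.vocoder.layers.pqmf import PQMF

pqmf = PQMF(N=4, taps=62, cutoff=0.15, beta=9.0)
wav = torch.randn(1, 1, 16384)      # (batch, 1, samples), random stand-in
subbands = pqmf.analysis(wav)       # -> (1, 4, 4096): 4 sub-bands at a quarter of the rate
recon = pqmf.synthesis(subbands)    # -> (1, 1, 16384): back at the original rate
print(subbands.shape, recon.shape)
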
@ -0,0 +1,640 @@
[640 filter-coefficient values, one per line: a symmetric FIR low-pass prototype rising from 0.0000000e+000 at the first tap to a peak of 8.5373856e-001 at the center and mirroring back down to -5.5252865e-004 at the last tap]
@ -0,0 +1,78 @@
import numpy as np
from torch import nn
from torch.nn.utils import weight_norm


class MelganDiscriminator(nn.Module):
    def __init__(self,
                 in_channels=1,
                 out_channels=1,
                 kernel_sizes=(5, 3),
                 base_channels=16,
                 max_channels=1024,
                 downsample_factors=(4, 4, 4, 4)):
        super(MelganDiscriminator, self).__init__()
        self.layers = nn.ModuleList()

        layer_kernel_size = np.prod(kernel_sizes)
        layer_padding = (layer_kernel_size - 1) // 2

        # initial layer
        self.layers += [
            nn.Sequential(
                nn.ReflectionPad1d(layer_padding),
                weight_norm(
                    nn.Conv1d(in_channels,
                              base_channels,
                              layer_kernel_size,
                              stride=1)), nn.LeakyReLU(0.2, inplace=True))
        ]

        # downsampling layers
        layer_in_channels = base_channels
        for downsample_factor in downsample_factors:
            layer_out_channels = min(layer_in_channels * downsample_factor,
                                     max_channels)
            layer_kernel_size = downsample_factor * 10 + 1
            layer_padding = (layer_kernel_size - 1) // 2
            layer_groups = layer_in_channels // 4
            self.layers += [
                nn.Sequential(
                    weight_norm(
                        nn.Conv1d(layer_in_channels,
                                  layer_out_channels,
                                  kernel_size=layer_kernel_size,
                                  stride=downsample_factor,
                                  padding=layer_padding,
                                  groups=layer_groups)),
                    nn.LeakyReLU(0.2, inplace=True))
            ]
            layer_in_channels = layer_out_channels

        # last 2 layers
        layer_padding1 = (kernel_sizes[0] - 1) // 2
        layer_padding2 = (kernel_sizes[1] - 1) // 2
        self.layers += [
            nn.Sequential(
                weight_norm(
                    nn.Conv1d(layer_out_channels,
                              layer_out_channels,
                              kernel_size=kernel_sizes[0],
                              stride=1,
                              padding=layer_padding1)),
                nn.LeakyReLU(0.2, inplace=True),
            ),
            weight_norm(
                nn.Conv1d(layer_out_channels,
                          out_channels,
                          kernel_size=kernel_sizes[1],
                          stride=1,
                          padding=layer_padding2)),
        ]

    def forward(self, x):
        feats = []
        for layer in self.layers:
            x = layer(x)
            feats.append(x)
        return x, feats

@ -0,0 +1,98 @@
import torch
from torch import nn
from torch.nn.utils import weight_norm

from TTS.vocoder.layers.melgan import ResidualStack


class MelganGenerator(nn.Module):
    def __init__(self,
                 in_channels=80,
                 out_channels=1,
                 proj_kernel=7,
                 base_channels=512,
                 upsample_factors=(8, 8, 2, 2),
                 res_kernel=3,
                 num_res_blocks=3):
        super(MelganGenerator, self).__init__()

        # assert model parameters
        assert (proj_kernel -
                1) % 2 == 0, " [!] proj_kernel should be an odd number."

        # setup additional model parameters
        base_padding = (proj_kernel - 1) // 2
        act_slope = 0.2
        self.inference_padding = 2

        # initial layer
        layers = []
        layers += [
            nn.ReflectionPad1d(base_padding),
            weight_norm(
                nn.Conv1d(in_channels,
                          base_channels,
                          kernel_size=proj_kernel,
                          stride=1,
                          bias=True))
        ]

        # upsampling layers and residual stacks
        for idx, upsample_factor in enumerate(upsample_factors):
            layer_in_channels = base_channels // (2**idx)
            layer_out_channels = base_channels // (2**(idx + 1))
            layer_filter_size = upsample_factor * 2
            layer_stride = upsample_factor
            layer_output_padding = upsample_factor % 2
            layer_padding = upsample_factor // 2 + layer_output_padding
            layers += [
                nn.LeakyReLU(act_slope),
                weight_norm(
                    nn.ConvTranspose1d(layer_in_channels,
                                       layer_out_channels,
                                       layer_filter_size,
                                       stride=layer_stride,
                                       padding=layer_padding,
                                       output_padding=layer_output_padding,
                                       bias=True)),
                ResidualStack(
                    channels=layer_out_channels,
                    num_res_blocks=num_res_blocks,
                    kernel_size=res_kernel
                )
            ]

        layers += [nn.LeakyReLU(act_slope)]

        # final layer
        layers += [
            nn.ReflectionPad1d(base_padding),
            weight_norm(
                nn.Conv1d(layer_out_channels,
                          out_channels,
                          proj_kernel,
                          stride=1,
                          bias=True)),
            nn.Tanh()
        ]
        self.layers = nn.Sequential(*layers)

    def forward(self, c):
        return self.layers(c)

    def inference(self, c):
        c = c.to(self.layers[1].weight.device)
        c = torch.nn.functional.pad(
            c,
            (self.inference_padding, self.inference_padding),
            'replicate')
        return self.layers(c)

    def remove_weight_norm(self):
        for _, layer in enumerate(self.layers):
            if len(layer.state_dict()) != 0:
                try:
                    nn.utils.remove_weight_norm(layer)
                except ValueError:
                    layer.remove_weight_norm()

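A shape sketch for the generator above, mirroring what the unit test further below asserts: the default upsample_factors (8, 8, 2, 2) multiply to 256 output samples per input mel frame, and inference() first replicate-pads 2 frames on each side. The input here is random, standing in for a real mel spectrogram.

import torch

from TTS.vocoder.models.melgan_generator import MelganGenerator

model = MelganGenerator()            # defaults: 80 mel channels in, 1 waveform channel out
mel = torch.randn(2, 80, 50)         # (batch, num_mels, frames), random stand-in
wav = model(mel)                     # -> (2, 1, 50 * 256)
wav_inf = model.inference(mel)       # -> (2, 1, (50 + 4) * 256) after padding
print(wav.shape, wav_inf.shape)
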
@ -0,0 +1,41 @@
from torch import nn

from TTS.vocoder.models.melgan_discriminator import MelganDiscriminator


class MelganMultiscaleDiscriminator(nn.Module):
    def __init__(self,
                 in_channels=1,
                 out_channels=1,
                 num_scales=3,
                 kernel_sizes=(5, 3),
                 base_channels=16,
                 max_channels=1024,
                 downsample_factors=(4, 4, 4),
                 pooling_kernel_size=4,
                 pooling_stride=2,
                 pooling_padding=1):
        super(MelganMultiscaleDiscriminator, self).__init__()

        self.discriminators = nn.ModuleList([
            MelganDiscriminator(in_channels=in_channels,
                                out_channels=out_channels,
                                kernel_sizes=kernel_sizes,
                                base_channels=base_channels,
                                max_channels=max_channels,
                                downsample_factors=downsample_factors)
            for _ in range(num_scales)
        ])

        self.pooling = nn.AvgPool1d(kernel_size=pooling_kernel_size, stride=pooling_stride, padding=pooling_padding, count_include_pad=False)

    def forward(self, x):
        scores = list()
        feats = list()
        for disc in self.discriminators:
            score, feat = disc(x)
            scores.append(score)
            feats.append(feat)
            x = self.pooling(x)
        return scores, feats

@ -0,0 +1,39 @@
import torch

from TTS.vocoder.models.melgan_generator import MelganGenerator
from TTS.vocoder.layers.pqmf import PQMF


class MultibandMelganGenerator(MelganGenerator):
    def __init__(self,
                 in_channels=80,
                 out_channels=4,
                 proj_kernel=7,
                 base_channels=384,
                 upsample_factors=(2, 8, 2, 2),
                 res_kernel=3,
                 num_res_blocks=3):
        super(MultibandMelganGenerator,
              self).__init__(in_channels=in_channels,
                             out_channels=out_channels,
                             proj_kernel=proj_kernel,
                             base_channels=base_channels,
                             upsample_factors=upsample_factors,
                             res_kernel=res_kernel,
                             num_res_blocks=num_res_blocks)
        self.pqmf_layer = PQMF(N=4, taps=62, cutoff=0.15, beta=9.0)

    def pqmf_analysis(self, x):
        return self.pqmf_layer.analysis(x)

    def pqmf_synthesis(self, x):
        return self.pqmf_layer.synthesis(x)

    @torch.no_grad()
    def inference(self, cond_features):
        cond_features = cond_features.to(self.layers[1].weight.device)
        cond_features = torch.nn.functional.pad(
            cond_features,
            (self.inference_padding, self.inference_padding),
            'replicate')
        return self.pqmf_synthesis(self.layers(cond_features))

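The multi-band variant follows the same pattern, except the network emits out_channels=4 sub-band signals (upsample factors 2*8*2*2 = 64 per frame) and inference() finishes with PQMF synthesis, so the waveform comes back at 4 * 64 = 256 samples per frame. A hedged sketch using the class defined directly above; the tensors are random placeholders.

import torch

model = MultibandMelganGenerator()
mel = torch.randn(2, 80, 50)                  # random stand-in for a mel spectrogram
subbands = model(mel)                         # -> (2, 4, 50 * 64), one channel per sub-band
full_band = model.pqmf_synthesis(subbands)    # -> (2, 1, 50 * 256)
wav = model.inference(mel)                    # padding + PQMF synthesis -> (2, 1, (50 + 4) * 256)
print(subbands.shape, full_band.shape, wav.shape)
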
@ -0,0 +1,225 @@
import numpy as np
from torch import nn


class GBlock(nn.Module):
    def __init__(self, in_channels, cond_channels, downsample_factor):
        super(GBlock, self).__init__()

        self.in_channels = in_channels
        self.cond_channels = cond_channels
        self.downsample_factor = downsample_factor

        self.start = nn.Sequential(
            nn.AvgPool1d(downsample_factor, stride=downsample_factor),
            nn.ReLU(),
            nn.Conv1d(in_channels, in_channels * 2, kernel_size=3, padding=1))
        self.lc_conv1d = nn.Conv1d(cond_channels,
                                   in_channels * 2,
                                   kernel_size=1)
        self.end = nn.Sequential(
            nn.ReLU(),
            nn.Conv1d(in_channels * 2,
                      in_channels * 2,
                      kernel_size=3,
                      dilation=2,
                      padding=2))
        self.residual = nn.Sequential(
            nn.Conv1d(in_channels, in_channels * 2, kernel_size=1),
            nn.AvgPool1d(downsample_factor, stride=downsample_factor))

    def forward(self, inputs, conditions):
        outputs = self.start(inputs) + self.lc_conv1d(conditions)
        outputs = self.end(outputs)
        residual_outputs = self.residual(inputs)
        outputs = outputs + residual_outputs

        return outputs


class DBlock(nn.Module):
    def __init__(self, in_channels, out_channels, downsample_factor):
        super(DBlock, self).__init__()

        self.in_channels = in_channels
        self.downsample_factor = downsample_factor
        self.out_channels = out_channels

        self.downsample_layer = nn.AvgPool1d(downsample_factor,
                                             stride=downsample_factor)
        self.layers = nn.Sequential(
            nn.ReLU(),
            nn.Conv1d(in_channels, out_channels, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv1d(out_channels,
                      out_channels,
                      kernel_size=3,
                      dilation=2,
                      padding=2))
        self.residual = nn.Sequential(
            nn.Conv1d(in_channels, out_channels, kernel_size=1), )

    def forward(self, inputs):
        if self.downsample_factor > 1:
            outputs = self.layers(self.downsample_layer(inputs))\
                + self.downsample_layer(self.residual(inputs))
        else:
            outputs = self.layers(inputs) + self.residual(inputs)
        return outputs


class ConditionalDiscriminator(nn.Module):
    def __init__(self,
                 in_channels,
                 cond_channels,
                 downsample_factors=(2, 2, 2),
                 out_channels=(128, 256)):
        super(ConditionalDiscriminator, self).__init__()

        assert len(downsample_factors) == len(out_channels) + 1

        self.in_channels = in_channels
        self.cond_channels = cond_channels
        self.downsample_factors = downsample_factors
        self.out_channels = out_channels

        self.pre_cond_layers = nn.ModuleList()
        self.post_cond_layers = nn.ModuleList()

        # layers before condition features
        self.pre_cond_layers += [DBlock(in_channels, 64, 1)]
        in_channels = 64
        for (i, channel) in enumerate(out_channels):
            self.pre_cond_layers.append(
                DBlock(in_channels, channel, downsample_factors[i]))
            in_channels = channel

        # condition block
        self.cond_block = GBlock(in_channels, cond_channels,
                                 downsample_factors[-1])

        # layers after condition block
        self.post_cond_layers += [
            DBlock(in_channels * 2, in_channels * 2, 1),
            DBlock(in_channels * 2, in_channels * 2, 1),
            nn.AdaptiveAvgPool1d(1),
            nn.Conv1d(in_channels * 2, 1, kernel_size=1),
        ]

    def forward(self, inputs, conditions):
        batch_size = inputs.size()[0]
        outputs = inputs.view(batch_size, self.in_channels, -1)
        for layer in self.pre_cond_layers:
            outputs = layer(outputs)
        outputs = self.cond_block(outputs, conditions)
        for layer in self.post_cond_layers:
            outputs = layer(outputs)

        return outputs


class UnconditionalDiscriminator(nn.Module):
    def __init__(self,
                 in_channels,
                 base_channels=64,
                 downsample_factors=(8, 4),
                 out_channels=(128, 256)):
        super(UnconditionalDiscriminator, self).__init__()

        self.downsample_factors = downsample_factors
        self.in_channels = in_channels
        self.downsample_factors = downsample_factors
        self.out_channels = out_channels

        self.layers = nn.ModuleList()
        self.layers += [DBlock(self.in_channels, base_channels, 1)]
        in_channels = base_channels
        for (i, factor) in enumerate(downsample_factors):
            self.layers.append(DBlock(in_channels, out_channels[i], factor))
            in_channels *= 2
        self.layers += [
            DBlock(in_channels, in_channels, 1),
            DBlock(in_channels, in_channels, 1),
            nn.AdaptiveAvgPool1d(1),
            nn.Conv1d(in_channels, 1, kernel_size=1),
        ]

    def forward(self, inputs):
        batch_size = inputs.size()[0]
        outputs = inputs.view(batch_size, self.in_channels, -1)
        for layer in self.layers:
            outputs = layer(outputs)
        return outputs


class RandomWindowDiscriminator(nn.Module):
    """Random Window Discriminator as described in
    http://arxiv.org/abs/1909.11646"""
    def __init__(self,
                 cond_channels,
                 hop_length,
                 uncond_disc_donwsample_factors=(8, 4),
                 cond_disc_downsample_factors=((8, 4, 2, 2, 2), (8, 4, 2, 2),
                                               (8, 4, 2), (8, 4), (4, 2, 2)),
                 cond_disc_out_channels=((128, 128, 256, 256), (128, 256, 256),
                                         (128, 256), (256, ), (128, 256)),
                 window_sizes=(512, 1024, 2048, 4096, 8192)):

        super(RandomWindowDiscriminator, self).__init__()
        self.cond_channels = cond_channels
        self.window_sizes = window_sizes
        self.hop_length = hop_length
        self.base_window_size = self.hop_length * 2
        self.ks = [ws // self.base_window_size for ws in window_sizes]

        # check arguments
        assert len(cond_disc_downsample_factors) == len(
            cond_disc_out_channels) == len(window_sizes)
        for ws in window_sizes:
            assert ws % hop_length == 0

        for idx, cf in enumerate(cond_disc_downsample_factors):
            assert np.prod(cf) == hop_length // self.ks[idx]

        # define layers
        self.unconditional_discriminators = nn.ModuleList([])
        for k in self.ks:
            layer = UnconditionalDiscriminator(
                in_channels=k,
                base_channels=64,
                downsample_factors=uncond_disc_donwsample_factors)
            self.unconditional_discriminators.append(layer)

        self.conditional_discriminators = nn.ModuleList([])
        for idx, k in enumerate(self.ks):
            layer = ConditionalDiscriminator(
                in_channels=k,
                cond_channels=cond_channels,
                downsample_factors=cond_disc_downsample_factors[idx],
                out_channels=cond_disc_out_channels[idx])
            self.conditional_discriminators.append(layer)

    def forward(self, x, c):
        scores = []
        feats = []
        # unconditional pass
        for (window_size, layer) in zip(self.window_sizes,
                                        self.unconditional_discriminators):
            index = np.random.randint(x.shape[-1] - window_size)

            score = layer(x[:, :, index:index + window_size])
            scores.append(score)

        # conditional pass
        for (window_size, layer) in zip(self.window_sizes,
                                        self.conditional_discriminators):
            frame_size = window_size // self.hop_length
            lc_index = np.random.randint(c.shape[-1] - frame_size)
            sample_index = lc_index * self.hop_length
            x_sub = x[:, :,
                      sample_index:(lc_index + frame_size) * self.hop_length]
            c_sub = c[:, :, lc_index:lc_index + frame_size]

            score = layer(x_sub, c_sub)
            scores.append(score)
        return scores, feats

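A small worked example of the window bookkeeping above, matching the unit test further below: with hop_length=256 the base window is 512 samples, so the default window_sizes map to 1- through 16-channel reshaped inputs, and each window size contributes one unconditional and one conditional score.

hop_length = 256
window_sizes = (512, 1024, 2048, 4096, 8192)
base_window_size = hop_length * 2                       # 512 samples
ks = [ws // base_window_size for ws in window_sizes]    # [1, 2, 4, 8, 16] -> in_channels per scale
num_scores = 2 * len(window_sizes)                      # 5 unconditional + 5 conditional = 10
print(ks, num_scores)
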
File diff suppressed because one or more lines are too long
@ -0,0 +1,6 @@
{
 "cells": [],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 4
}

Binary file not shown.
@ -0,0 +1 @@

@ -0,0 +1,24 @@
{
    "audio":{
        "num_mels": 80,          // size of the mel spec frame.
        "num_freq": 513,         // number of stft frequency levels. Size of the linear spectrogram frame.
        "sample_rate": 22050,    // wav sample-rate. If different from the original data, it is resampled.
        "frame_length_ms": null, // stft window length in ms.
        "frame_shift_ms": null,  // stft window hop-length in ms.
        "hop_length": 256,
        "win_length": 1024,
        "preemphasis": 0.97,     // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.
        "min_level_db": -100,    // normalization range
        "ref_level_db": 20,      // reference level db, theoretically 20db is the sound of air.
        "power": 1.5,            // value to sharpen wav signals after GL algorithm.
        "griffin_lim_iters": 30, // #griffin-lim iterations. 30-60 is a good range. The larger the value, the slower the generation.
        "signal_norm": true,     // normalize the spec values in range [0, 1]
        "symmetric_norm": true,  // move normalization to range [-1, 1]
        "clip_norm": true,       // clip normalized values into the range.
        "max_norm": 4,           // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
        "mel_fmin": 0,           // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
        "mel_fmax": 8000,        // maximum freq level for mel-spec. Tune for dataset!!
        "do_trim_silence": false
    }
}

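These config files are JSON with // comments, which the repo's load_config helper is expected to strip before parsing; the tests below consume them roughly as sketched here (the printed attributes are ones those tests themselves rely on).

from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_config

C = load_config('test_config.json')     # comment-tolerant JSON config loader used across the repo
ap = AudioProcessor(**C.audio)          # same pattern as the loader and loss tests below
print(ap.num_mels, ap.hop_length, ap.win_length)
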
@ -0,0 +1,95 @@
import os
import numpy as np
from torch.utils.data import DataLoader

from TTS.vocoder.datasets.gan_dataset import GANDataset
from TTS.vocoder.datasets.preprocess import load_wav_data
from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_config


file_path = os.path.dirname(os.path.realpath(__file__))
OUTPATH = os.path.join(file_path, "../../tests/outputs/loader_tests/")
os.makedirs(OUTPATH, exist_ok=True)

C = load_config(os.path.join(file_path, 'test_config.json'))

test_data_path = os.path.join(file_path, "../../tests/data/ljspeech/")
ok_ljspeech = os.path.exists(test_data_path)


def gan_dataset_case(batch_size, seq_len, hop_len, conv_pad, return_segments, use_noise_augment, use_cache, num_workers):
    ''' run dataloader with given parameters and check conditions '''
    ap = AudioProcessor(**C.audio)
    _, train_items = load_wav_data(test_data_path, 10)
    dataset = GANDataset(ap,
                         train_items,
                         seq_len=seq_len,
                         hop_len=hop_len,
                         pad_short=2000,
                         conv_pad=conv_pad,
                         return_segments=return_segments,
                         use_noise_augment=use_noise_augment,
                         use_cache=use_cache)
    loader = DataLoader(dataset=dataset,
                        batch_size=batch_size,
                        shuffle=True,
                        num_workers=num_workers,
                        pin_memory=True,
                        drop_last=True)

    max_iter = 10
    count_iter = 0

    # return random segments or return the whole audio
    if return_segments:
        for item1, _ in loader:
            feat1, wav1 = item1
            # feat2, wav2 = item2
            expected_feat_shape = (batch_size, ap.num_mels, seq_len // hop_len + conv_pad * 2)

            # check shapes
            assert np.all(feat1.shape == expected_feat_shape), f" [!] {feat1.shape} vs {expected_feat_shape}"
            assert (feat1.shape[2] - conv_pad * 2) * hop_len == wav1.shape[2]

            # check feature vs audio match
            if not use_noise_augment:
                for idx in range(batch_size):
                    audio = wav1[idx].squeeze()
                    feat = feat1[idx]
                    mel = ap.melspectrogram(audio)
                    # the first 2 and the last frames are skipped due to the padding
                    # applied in spec. computation.
                    assert (feat - mel[:, :feat1.shape[-1]])[:, 2:-1].sum() == 0, f' [!] {(feat - mel[:, :feat1.shape[-1]])[:, 2:-1].sum()}'

            count_iter += 1
            # if count_iter == max_iter:
            #     break
    else:
        for item in loader:
            feat, wav = item
            expected_feat_shape = (batch_size, ap.num_mels, (wav.shape[-1] // hop_len) + (conv_pad * 2))
            assert np.all(feat.shape == expected_feat_shape), f" [!] {feat.shape} vs {expected_feat_shape}"
            assert (feat.shape[2] - conv_pad * 2) * hop_len == wav.shape[2]
            count_iter += 1
            if count_iter == max_iter:
                break


def test_parametrized_gan_dataset():
    ''' test dataloader with different parameters '''
    params = [
        [32, C.audio['hop_length'] * 10, C.audio['hop_length'], 0, True, False, True, 0],
        [32, C.audio['hop_length'] * 10, C.audio['hop_length'], 0, True, False, True, 4],
        [1, C.audio['hop_length'] * 10, C.audio['hop_length'], 0, True, True, True, 0],
        [1, C.audio['hop_length'], C.audio['hop_length'], 0, True, True, True, 0],
        [1, C.audio['hop_length'] * 10, C.audio['hop_length'], 2, True, True, True, 0],
        [1, C.audio['hop_length'] * 10, C.audio['hop_length'], 0, False, True, True, 0],
        [1, C.audio['hop_length'] * 10, C.audio['hop_length'], 0, True, False, True, 0],
        [1, C.audio['hop_length'] * 10, C.audio['hop_length'], 0, True, True, False, 0],
        [1, C.audio['hop_length'] * 10, C.audio['hop_length'], 0, False, False, False, 0],
    ]
    for param in params:
        print(param)
        gan_dataset_case(*param)

@ -0,0 +1,61 @@
import os
import torch

from TTS.vocoder.layers.losses import TorchSTFT, STFTLoss, MultiScaleSTFTLoss

from TTS.tests import get_tests_path, get_tests_input_path, get_tests_output_path
from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_config

TESTS_PATH = get_tests_path()

OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests")
os.makedirs(OUT_PATH, exist_ok=True)

WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")

file_path = os.path.dirname(os.path.realpath(__file__))
C = load_config(os.path.join(file_path, 'test_config.json'))
ap = AudioProcessor(**C.audio)


def test_torch_stft():
    torch_stft = TorchSTFT(ap.n_fft, ap.hop_length, ap.win_length)
    # librosa stft
    wav = ap.load_wav(WAV_FILE)
    M_librosa = abs(ap._stft(wav))  # pylint: disable=protected-access
    # torch stft
    wav = torch.from_numpy(wav[None, :]).float()
    M_torch = torch_stft(wav)
    # check the difference b/w librosa and torch outputs
    assert (M_librosa - M_torch[0].data.numpy()).max() < 1e-5


def test_stft_loss():
    stft_loss = STFTLoss(ap.n_fft, ap.hop_length, ap.win_length)
    wav = ap.load_wav(WAV_FILE)
    wav = torch.from_numpy(wav[None, :]).float()
    loss_m, loss_sc = stft_loss(wav, wav)
    assert loss_m + loss_sc == 0
    loss_m, loss_sc = stft_loss(wav, torch.rand_like(wav))
    assert loss_sc < 1.0
    assert loss_m + loss_sc > 0


def test_multiscale_stft_loss():
    stft_loss = MultiScaleSTFTLoss([ap.n_fft//2, ap.n_fft, ap.n_fft*2],
                                   [ap.hop_length // 2, ap.hop_length, ap.hop_length * 2],
                                   [ap.win_length // 2, ap.win_length, ap.win_length * 2])
    wav = ap.load_wav(WAV_FILE)
    wav = torch.from_numpy(wav[None, :]).float()
    loss_m, loss_sc = stft_loss(wav, wav)
    assert loss_m + loss_sc == 0
    loss_m, loss_sc = stft_loss(wav, torch.rand_like(wav))
    assert loss_sc < 1.0
    assert loss_m + loss_sc > 0

@ -0,0 +1,26 @@
import numpy as np
import torch

from TTS.vocoder.models.melgan_discriminator import MelganDiscriminator
from TTS.vocoder.models.melgan_multiscale_discriminator import MelganMultiscaleDiscriminator


def test_melgan_discriminator():
    model = MelganDiscriminator()
    print(model)
    dummy_input = torch.rand((4, 1, 256 * 10))
    output, _ = model(dummy_input)
    assert np.all(output.shape == (4, 1, 10))


def test_melgan_multi_scale_discriminator():
    model = MelganMultiscaleDiscriminator()
    print(model)
    dummy_input = torch.rand((4, 1, 256 * 16))
    scores, feats = model(dummy_input)
    assert len(scores) == 3
    assert len(scores) == len(feats)
    assert np.all(scores[0].shape == (4, 1, 64))
    assert np.all(feats[0][0].shape == (4, 16, 4096))
    assert np.all(feats[0][1].shape == (4, 64, 1024))
    assert np.all(feats[0][2].shape == (4, 256, 256))

@ -0,0 +1,14 @@
import numpy as np
import torch

from TTS.vocoder.models.melgan_generator import MelganGenerator


def test_melgan_generator():
    model = MelganGenerator()
    print(model)
    dummy_input = torch.rand((4, 80, 64))
    output = model(dummy_input)
    assert np.all(output.shape == (4, 1, 64 * 256))
    output = model.inference(dummy_input)
    assert np.all(output.shape == (4, 1, (64 + 4) * 256))

@ -0,0 +1,28 @@
import os
import torch

import soundfile as sf
from librosa.core import load

from TTS.tests import get_tests_path, get_tests_input_path
from TTS.vocoder.layers.pqmf import PQMF


TESTS_PATH = get_tests_path()
WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")


def test_pqmf():
    w, sr = load(WAV_FILE)

    layer = PQMF(N=4, taps=62, cutoff=0.15, beta=9.0)
    w2 = torch.from_numpy(w[None, None, :])
    b2 = layer.analysis(w2)
    w2_ = layer.synthesis(b2)

    print(w2_.max())
    print(w2_.min())
    print(w2_.mean())
    sf.write('pqmf_output.wav', w2_.flatten().detach(), sr)

@ -0,0 +1,21 @@
import torch
import numpy as np

from TTS.vocoder.models.random_window_discriminator import RandomWindowDiscriminator


def test_rwd():
    layer = RandomWindowDiscriminator(cond_channels=80,
                                      window_sizes=(512, 1024, 2048, 4096,
                                                    8192),
                                      cond_disc_downsample_factors=[
                                          (8, 4, 2, 2, 2), (8, 4, 2, 2),
                                          (8, 4, 2), (8, 4), (4, 2, 2)
                                      ],
                                      hop_length=256)
    x = torch.rand([4, 1, 22050])
    c = torch.rand([4, 80, 22050 // 256])

    scores, _ = layer(x, c)
    assert len(scores) == 10
    assert np.all(scores[0].shape == (4, 1, 1))

@@ -0,0 +1,112 @@
import argparse
import os

import numpy as np
import tensorflow as tf
import torch
from fuzzywuzzy import fuzz

from TTS.utils.io import load_config
from TTS.vocoder.tf.utils.convert_torch_to_tf_utils import (
    compare_torch_tf, convert_tf_name, transfer_weights_torch_to_tf)
from TTS.vocoder.tf.utils.generic_utils import \
    setup_generator as setup_tf_generator
from TTS.vocoder.tf.utils.io import save_checkpoint
from TTS.vocoder.utils.generic_utils import setup_generator

# prevent GPU use
os.environ['CUDA_VISIBLE_DEVICES'] = ''

# define args
parser = argparse.ArgumentParser()
parser.add_argument('--torch_model_path',
                    type=str,
                    help='Path to target torch model to be converted to TF.')
parser.add_argument('--config_path',
                    type=str,
                    help='Path to config file of torch model.')
parser.add_argument(
    '--output_path',
    type=str,
    help='path to output file including file name to save TF model.')
args = parser.parse_args()

# load model config
config_path = args.config_path
c = load_config(config_path)
num_speakers = 0

# init torch model
model = setup_generator(c)
checkpoint = torch.load(args.torch_model_path,
                        map_location=torch.device('cpu'))
state_dict = checkpoint['model']
model.load_state_dict(state_dict)
model.remove_weight_norm()
state_dict = model.state_dict()

# init tf model
model_tf = setup_tf_generator(c)

common_sufix = '/.ATTRIBUTES/VARIABLE_VALUE'
# get tf_model graph by passing an input
# B x D x T
dummy_input = tf.random.uniform((7, 80, 64), dtype=tf.float32)
mel_pred = model_tf(dummy_input, training=False)

# get tf variables
tf_vars = model_tf.weights

# match variable names with fuzzy logic
torch_var_names = list(state_dict.keys())
tf_var_names = [we.name for we in model_tf.weights]
var_map = []
for tf_name in tf_var_names:
    # skip re-mapped layer names
    if tf_name in [name[0] for name in var_map]:
        continue
    tf_name_edited = convert_tf_name(tf_name)
    ratios = [
        fuzz.ratio(torch_name, tf_name_edited)
        for torch_name in torch_var_names
    ]
    max_idx = np.argmax(ratios)
    matching_name = torch_var_names[max_idx]
    del torch_var_names[max_idx]
    var_map.append((tf_name, matching_name))

# pass weights
tf_vars = transfer_weights_torch_to_tf(tf_vars, dict(var_map), state_dict)

# Compare TF and TORCH models
# check embedding outputs
model.eval()
dummy_input_torch = torch.ones((1, 80, 10))
dummy_input_tf = tf.convert_to_tensor(dummy_input_torch.numpy())
dummy_input_tf = tf.transpose(dummy_input_tf, perm=[0, 2, 1])
dummy_input_tf = tf.expand_dims(dummy_input_tf, 2)

out_torch = model.layers[0](dummy_input_torch)
out_tf = model_tf.model_layers[0](dummy_input_tf)
out_tf_ = tf.transpose(out_tf, perm=[0, 3, 2, 1])[:, :, 0, :]

assert compare_torch_tf(out_torch, out_tf_) < 1e-5

for i in range(1, len(model.layers)):
    print(f"{i} -> {model.layers[i]} vs {model_tf.model_layers[i]}")
    out_torch = model.layers[i](out_torch)
    out_tf = model_tf.model_layers[i](out_tf)
    out_tf_ = tf.transpose(out_tf, perm=[0, 3, 2, 1])[:, :, 0, :]
    diff = compare_torch_tf(out_torch, out_tf_)
    assert diff < 1e-5, diff

dummy_input_torch = torch.ones((1, 80, 10))
dummy_input_tf = tf.convert_to_tensor(dummy_input_torch.numpy())
output_torch = model.inference(dummy_input_torch)
output_tf = model_tf(dummy_input_tf, training=False)
assert compare_torch_tf(output_torch, output_tf) < 1e-5, compare_torch_tf(
    output_torch, output_tf)

# save tf model
save_checkpoint(model_tf, checkpoint['step'], checkpoint['epoch'],
                args.output_path)
print(' > Model conversion is successfully completed :).')
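For reference, a minimal sketch of how this conversion script could be invoked from the command line. The script name and the file paths below are placeholders for illustration only, not part of this commit:

python convert_melgan_torch_to_tf.py \
    --torch_model_path /path/to/checkpoint_500000.pth.tar \
    --config_path /path/to/config.json \
    --output_path /path/to/tf_vocoder_model.pkl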
@@ -0,0 +1,58 @@
import tensorflow as tf


class ReflectionPad1d(tf.keras.layers.Layer):
    def __init__(self, padding):
        super(ReflectionPad1d, self).__init__()
        self.padding = padding

    def call(self, x):
        print(x.shape)
        return tf.pad(x, [[0, 0], [self.padding, self.padding], [0, 0], [0, 0]], "REFLECT")


class ResidualStack(tf.keras.layers.Layer):
    def __init__(self, channels, num_res_blocks, kernel_size, name):
        super(ResidualStack, self).__init__(name=name)

        assert (kernel_size - 1) % 2 == 0, " [!] kernel_size has to be odd."
        base_padding = (kernel_size - 1) // 2

        self.blocks = []
        num_layers = 2
        for idx in range(num_res_blocks):
            layer_kernel_size = kernel_size
            layer_dilation = layer_kernel_size**idx
            layer_padding = base_padding * layer_dilation
            block = [
                tf.keras.layers.LeakyReLU(0.2),
                ReflectionPad1d(layer_padding),
                tf.keras.layers.Conv2D(filters=channels,
                                       kernel_size=(kernel_size, 1),
                                       dilation_rate=(layer_dilation, 1),
                                       use_bias=True,
                                       padding='valid',
                                       name=f'blocks.{idx}.{num_layers}'),
                tf.keras.layers.LeakyReLU(0.2),
                tf.keras.layers.Conv2D(filters=channels,
                                       kernel_size=(1, 1),
                                       use_bias=True,
                                       name=f'blocks.{idx}.{num_layers + 2}')
            ]
            self.blocks.append(block)
        self.shortcuts = [
            tf.keras.layers.Conv2D(channels,
                                   kernel_size=1,
                                   use_bias=True,
                                   name=f'shortcuts.{i}')
            for i in range(num_res_blocks)
        ]

    def call(self, x):
        # breakpoint()
        for block, shortcut in zip(self.blocks, self.shortcuts):
            res = shortcut(x)
            for layer in block:
                x = layer(x)
            x += res
        return x
@@ -0,0 +1,66 @@
import numpy as np
import tensorflow as tf

from scipy import signal as sig


class PQMF(tf.keras.layers.Layer):
    def __init__(self, N=4, taps=62, cutoff=0.15, beta=9.0):
        super(PQMF, self).__init__()
        # define filter coefficient
        self.N = N
        self.taps = taps
        self.cutoff = cutoff
        self.beta = beta

        QMF = sig.firwin(taps + 1, cutoff, window=('kaiser', beta))
        H = np.zeros((N, len(QMF)))
        G = np.zeros((N, len(QMF)))
        for k in range(N):
            constant_factor = (2 * k + 1) * (np.pi /
                                             (2 * N)) * (np.arange(taps + 1) -
                                                         ((taps - 1) / 2))
            phase = (-1)**k * np.pi / 4
            H[k] = 2 * QMF * np.cos(constant_factor + phase)
            G[k] = 2 * QMF * np.cos(constant_factor - phase)

        # [N, 1, taps + 1] == [filter_width, in_channels, out_channels]
        self.H = np.transpose(H[:, None, :], (2, 1, 0)).astype('float32')
        self.G = np.transpose(G[None, :, :], (2, 1, 0)).astype('float32')

        # filter for downsampling & upsampling
        updown_filter = np.zeros((N, N, N), dtype=np.float32)
        for k in range(N):
            updown_filter[0, k, k] = 1.0
        self.updown_filter = updown_filter.astype(np.float32)

    def analysis(self, x):
        """
        x : B x 1 x T
        """
        x = tf.transpose(x, perm=[0, 2, 1])
        x = tf.pad(x, [[0, 0], [self.taps // 2, self.taps // 2], [0, 0]], constant_values=0.0)
        x = tf.nn.conv1d(x, self.H, stride=1, padding='VALID')
        x = tf.nn.conv1d(x,
                         self.updown_filter,
                         stride=self.N,
                         padding='VALID')
        x = tf.transpose(x, perm=[0, 2, 1])
        return x

    def synthesis(self, x):
        """
        x : B x 1 x T
        """
        x = tf.transpose(x, perm=[0, 2, 1])
        x = tf.nn.conv1d_transpose(
            x,
            self.updown_filter * self.N,
            strides=self.N,
            output_shape=(tf.shape(x)[0], tf.shape(x)[1] * self.N,
                          self.N))
        x = tf.pad(x, [[0, 0], [self.taps // 2, self.taps // 2], [0, 0]], constant_values=0.0)
        x = tf.nn.conv1d(x, self.G, stride=1, padding="VALID")
        x = tf.transpose(x, perm=[0, 2, 1])
        return x
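A minimal round-trip sketch for this TF PQMF layer, mirroring the Torch PQMF test earlier in the commit. The random tensor only stands in for real audio and the shapes are illustrative:

import tensorflow as tf
from TTS.vocoder.tf.layers.pqmf import PQMF

layer = PQMF(N=4, taps=62, cutoff=0.15, beta=9.0)
w = tf.random.uniform((1, 1, 4 * 4096))   # B x 1 x T dummy waveform
subbands = layer.analysis(w)              # B x 4 x T/4 sub-band signals
w_hat = layer.synthesis(subbands)         # B x 1 x T reconstructed waveform
print(w.shape, subbands.shape, w_hat.shape)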
@@ -0,0 +1,103 @@
import logging
import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # FATAL
logging.getLogger('tensorflow').setLevel(logging.FATAL)

import tensorflow as tf
from TTS.vocoder.tf.layers.melgan import ResidualStack, ReflectionPad1d


class MelganGenerator(tf.keras.models.Model):  # pylint: disable=too-many-ancestors
    """ Melgan Generator TF implementation dedicated for inference with no
    weight norm """
    def __init__(self,
                 in_channels=80,
                 out_channels=1,
                 proj_kernel=7,
                 base_channels=512,
                 upsample_factors=(8, 8, 2, 2),
                 res_kernel=3,
                 num_res_blocks=3):
        super(MelganGenerator, self).__init__()

        self.in_channels = in_channels

        # assert model parameters
        assert (proj_kernel -
                1) % 2 == 0, " [!] proj_kernel should be an odd number."

        # setup additional model parameters
        base_padding = (proj_kernel - 1) // 2
        act_slope = 0.2
        self.inference_padding = 2

        # initial layer
        self.initial_layer = [
            ReflectionPad1d(base_padding),
            tf.keras.layers.Conv2D(filters=base_channels,
                                   kernel_size=(proj_kernel, 1),
                                   strides=1,
                                   padding='valid',
                                   use_bias=True,
                                   name="1")
        ]
        num_layers = 3  # count number of layers for layer naming

        # upsampling layers and residual stacks
        self.upsample_layers = []
        for idx, upsample_factor in enumerate(upsample_factors):
            layer_out_channels = base_channels // (2**(idx + 1))
            layer_filter_size = upsample_factor * 2
            layer_stride = upsample_factor
            # layer_output_padding = upsample_factor % 2
            self.upsample_layers += [
                tf.keras.layers.LeakyReLU(act_slope),
                tf.keras.layers.Conv2DTranspose(
                    filters=layer_out_channels,
                    kernel_size=(layer_filter_size, 1),
                    strides=(layer_stride, 1),
                    padding='same',
                    # output_padding=layer_output_padding,
                    use_bias=True,
                    name=f'{num_layers}'),
                ResidualStack(channels=layer_out_channels,
                              num_res_blocks=num_res_blocks,
                              kernel_size=res_kernel,
                              name=f'layers.{num_layers + 1}')
            ]
            num_layers += num_res_blocks - 1

        self.upsample_layers += [tf.keras.layers.LeakyReLU(act_slope)]

        # final layer
        self.final_layers = [
            ReflectionPad1d(base_padding),
            tf.keras.layers.Conv2D(filters=out_channels,
                                   kernel_size=(proj_kernel, 1),
                                   use_bias=True,
                                   name=f'layers.{num_layers + 1}'),
            tf.keras.layers.Activation("tanh")
        ]

        # self.model_layers = tf.keras.models.Sequential(self.initial_layer + self.upsample_layers + self.final_layers, name="layers")
        self.model_layers = self.initial_layer + self.upsample_layers + self.final_layers

    def call(self, c, training=False):
        """
        c : B x C x T
        """
        if training:
            raise NotImplementedError()
        return self.inference(c)

    def inference(self, c):
        c = tf.transpose(c, perm=[0, 2, 1])
        c = tf.expand_dims(c, 2)
        c = tf.pad(c, [[0, 0], [self.inference_padding, self.inference_padding], [0, 0], [0, 0]], "REFLECT")
        o = c
        for layer in self.model_layers:
            o = layer(o)
        # o = self.model_layers(c)
        o = tf.transpose(o, perm=[0, 3, 2, 1])
        return o[:, :, 0, :]
@@ -0,0 +1,46 @@
import tensorflow as tf

from TTS.vocoder.tf.models.melgan_generator import MelganGenerator
from TTS.vocoder.tf.layers.pqmf import PQMF


class MultibandMelganGenerator(MelganGenerator):  # pylint: disable=too-many-ancestors
    def __init__(self,
                 in_channels=80,
                 out_channels=4,
                 proj_kernel=7,
                 base_channels=384,
                 upsample_factors=(2, 8, 2, 2),
                 res_kernel=3,
                 num_res_blocks=3):
        super(MultibandMelganGenerator,
              self).__init__(in_channels=in_channels,
                             out_channels=out_channels,
                             proj_kernel=proj_kernel,
                             base_channels=base_channels,
                             upsample_factors=upsample_factors,
                             res_kernel=res_kernel,
                             num_res_blocks=num_res_blocks)
        self.pqmf_layer = PQMF(N=4, taps=62, cutoff=0.15, beta=9.0)

    def pqmf_analysis(self, x):
        return self.pqmf_layer.analysis(x)

    def pqmf_synthesis(self, x):
        return self.pqmf_layer.synthesis(x)

    # def call(self, c, training=False):
    #     if training:
    #         raise NotImplementedError()
    #     return self.inference(c)

    def inference(self, c):
        c = tf.transpose(c, perm=[0, 2, 1])
        c = tf.expand_dims(c, 2)
        c = tf.pad(c, [[0, 0], [self.inference_padding, self.inference_padding], [0, 0], [0, 0]], "REFLECT")
        o = c
        for layer in self.model_layers:
            o = layer(o)
        o = tf.transpose(o, perm=[0, 3, 2, 1])
        o = self.pqmf_layer.synthesis(o[:, :, 0, :])
        return o
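A minimal inference sketch for the TF multiband generator defined above. The batch size and frame count are arbitrary, and the model here is randomly initialized rather than loaded from a converted checkpoint:

import tensorflow as tf
from TTS.vocoder.tf.models.multiband_melgan_generator import MultibandMelganGenerator

model = MultibandMelganGenerator()      # defaults: 80 mel channels in, 4 PQMF sub-bands out
mel = tf.random.uniform((1, 80, 64))    # B x C x T conditioning spectrogram
wav = model.inference(mel)              # B x 1 x T_audio after PQMF synthesis
print(wav.shape)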
@@ -0,0 +1,45 @@
import numpy as np
import tensorflow as tf


def compare_torch_tf(torch_tensor, tf_tensor):
    """ Compute the average absolute difference b/w torch and tf tensors """
    return abs(torch_tensor.detach().numpy() - tf_tensor.numpy()).mean()


def convert_tf_name(tf_name):
    """ Convert certain patterns in TF layer names to Torch patterns """
    tf_name_tmp = tf_name
    tf_name_tmp = tf_name_tmp.replace(':0', '')
    tf_name_tmp = tf_name_tmp.replace('/forward_lstm/lstm_cell_1/recurrent_kernel', '/weight_hh_l0')
    tf_name_tmp = tf_name_tmp.replace('/forward_lstm/lstm_cell_2/kernel', '/weight_ih_l1')
    tf_name_tmp = tf_name_tmp.replace('/recurrent_kernel', '/weight_hh')
    tf_name_tmp = tf_name_tmp.replace('/kernel', '/weight')
    tf_name_tmp = tf_name_tmp.replace('/gamma', '/weight')
    tf_name_tmp = tf_name_tmp.replace('/beta', '/bias')
    tf_name_tmp = tf_name_tmp.replace('/', '.')
    return tf_name_tmp


def transfer_weights_torch_to_tf(tf_vars, var_map_dict, state_dict):
    """ Transfer weights from torch state_dict to TF variables """
    print(" > Passing weights from Torch to TF ...")
    for tf_var in tf_vars:
        torch_var_name = var_map_dict[tf_var.name]
        print(f' | > {tf_var.name} <-- {torch_var_name}')
        # conv kernels are permuted from torch (out, in, width) to TF (width, 1, in, out) layout
        if 'kernel' in tf_var.name:
            torch_weight = state_dict[torch_var_name]
            numpy_weight = torch_weight.permute([2, 1, 0]).numpy()[:, None, :, :]
        # bias vectors are copied as-is
        if 'bias' in tf_var.name:
            torch_weight = state_dict[torch_var_name]
            numpy_weight = torch_weight
        assert np.all(tf_var.shape == numpy_weight.shape), f" [!] weight shapes do not match: {tf_var.name} vs {torch_var_name} --> {tf_var.shape} vs {numpy_weight.shape}"
        tf.keras.backend.set_value(tf_var, numpy_weight)
    return tf_vars


def load_tf_vars(model_tf, tf_vars):
    for tf_var in tf_vars:
        model_tf.get_layer(tf_var.name).set_weights(tf_var)
    return model_tf
@@ -0,0 +1,35 @@
import re
import importlib


def to_camel(text):
    text = text.capitalize()
    return re.sub(r'(?!^)_([a-zA-Z])', lambda m: m.group(1).upper(), text)


def setup_generator(c):
    print(" > Generator Model: {}".format(c.generator_model))
    MyModel = importlib.import_module('TTS.vocoder.tf.models.' +
                                      c.generator_model.lower())
    MyModel = getattr(MyModel, to_camel(c.generator_model))
    if c.generator_model in ['melgan_generator']:
        model = MyModel(
            in_channels=c.audio['num_mels'],
            out_channels=1,
            proj_kernel=7,
            base_channels=512,
            upsample_factors=c.generator_model_params['upsample_factors'],
            res_kernel=3,
            num_res_blocks=c.generator_model_params['num_res_blocks'])
    if c.generator_model in ['melgan_fb_generator']:
        pass
    if c.generator_model in ['multiband_melgan_generator']:
        model = MyModel(
            in_channels=c.audio['num_mels'],
            out_channels=4,
            proj_kernel=7,
            base_channels=384,
            upsample_factors=c.generator_model_params['upsample_factors'],
            res_kernel=3,
            num_res_blocks=c.generator_model_params['num_res_blocks'])
    return model
@@ -0,0 +1,27 @@
import datetime
import pickle
import tensorflow as tf


def save_checkpoint(model, current_step, epoch, output_path, **kwargs):
    """ Save TF Vocoder model """
    state = {
        'model': model.weights,
        'step': current_step,
        'epoch': epoch,
        'date': datetime.date.today().strftime("%B %d, %Y"),
    }
    state.update(kwargs)
    pickle.dump(state, open(output_path, 'wb'))


def load_checkpoint(model, checkpoint_path):
    """ Load TF Vocoder model """
    checkpoint = pickle.load(open(checkpoint_path, 'rb'))
    chkp_var_dict = {var.name: var.numpy() for var in checkpoint['model']}
    tf_vars = model.weights
    for tf_var in tf_vars:
        layer_name = tf_var.name
        chkp_var_value = chkp_var_dict[layer_name]
        tf.keras.backend.set_value(tf_var, chkp_var_value)
    return model
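A small usage sketch for these checkpoint helpers, assuming model_tf is an already-built TF generator such as the one constructed in the conversion script above; the step, epoch, and file name are placeholders:

from TTS.vocoder.tf.utils.io import save_checkpoint, load_checkpoint

save_checkpoint(model_tf, 10000, 5, 'tf_vocoder_checkpoint.pkl')   # pickle the weights plus step/epoch metadata
model_tf = load_checkpoint(model_tf, 'tf_vocoder_checkpoint.pkl')  # restore the weights into the same architecture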
@@ -0,0 +1,658 @@
import argparse
import glob
import os
import sys
import time
import traceback

import torch
from torch.utils.data import DataLoader

from inspect import signature

from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import (KeepAverage, count_parameters,
                                     create_experiment_folder, get_git_branch,
                                     remove_experiment_folder, set_init_dict)
from TTS.utils.io import copy_config_file, load_config
from TTS.utils.radam import RAdam
from TTS.utils.tensorboard_logger import TensorboardLogger
from TTS.utils.training import setup_torch_training_env
from TTS.vocoder.datasets.gan_dataset import GANDataset
from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
# from distribute import (DistributedSampler, apply_gradient_allreduce,
#                         init_distributed, reduce_tensor)
from TTS.vocoder.layers.losses import DiscriminatorLoss, GeneratorLoss
from TTS.vocoder.utils.io import save_checkpoint, save_best_model
from TTS.vocoder.utils.console_logger import ConsoleLogger
from TTS.vocoder.utils.generic_utils import (check_config, plot_results,
                                             setup_discriminator,
                                             setup_generator)


use_cuda, num_gpus = setup_torch_training_env(True, True)


def setup_loader(ap, is_val=False, verbose=False):
    if is_val and not c.run_eval:
        loader = None
    else:
        dataset = GANDataset(ap=ap,
                             items=eval_data if is_val else train_data,
                             seq_len=c.seq_len,
                             hop_len=ap.hop_length,
                             pad_short=c.pad_short,
                             conv_pad=c.conv_pad,
                             is_training=not is_val,
                             return_segments=not is_val,
                             use_noise_augment=c.use_noise_augment,
                             use_cache=c.use_cache,
                             verbose=verbose)
        dataset.shuffle_mapping()
        # sampler = DistributedSampler(dataset) if num_gpus > 1 else None
        loader = DataLoader(dataset,
                            batch_size=1 if is_val else c.batch_size,
                            shuffle=True,
                            drop_last=False,
                            sampler=None,
                            num_workers=c.num_val_loader_workers
                            if is_val else c.num_loader_workers,
                            pin_memory=False)
    return loader


def format_data(data):
    if isinstance(data[0], list):
        # setup input data
        c_G, x_G = data[0]
        c_D, x_D = data[1]

        # dispatch data to GPU
        if use_cuda:
            c_G = c_G.cuda(non_blocking=True)
            x_G = x_G.cuda(non_blocking=True)
            c_D = c_D.cuda(non_blocking=True)
            x_D = x_D.cuda(non_blocking=True)

        return c_G, x_G, c_D, x_D

    # return a whole audio segment
    co, x = data
    if use_cuda:
        co = co.cuda(non_blocking=True)
        x = x.cuda(non_blocking=True)
    return co, x, None, None


def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D,
          scheduler_G, scheduler_D, ap, global_step, epoch):
    data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0))
    model_G.train()
    model_D.train()
    epoch_time = 0
    keep_avg = KeepAverage()
    if use_cuda:
        batch_n_iter = int(
            len(data_loader.dataset) / (c.batch_size * num_gpus))
    else:
        batch_n_iter = int(len(data_loader.dataset) / c.batch_size)
    end_time = time.time()
    c_logger.print_train_start()
    for num_iter, data in enumerate(data_loader):
        start_time = time.time()

        # format data
        c_G, y_G, c_D, y_D = format_data(data)
        loader_time = time.time() - end_time

        global_step += 1

        ##############################
        # GENERATOR
        ##############################

        # generator pass
        y_hat = model_G(c_G)
        y_hat_sub = None
        y_G_sub = None
        y_hat_vis = y_hat  # for visualization

        # PQMF formatting
        if y_hat.shape[1] > 1:
            y_hat_sub = y_hat
            y_hat = model_G.pqmf_synthesis(y_hat)
            y_hat_vis = y_hat
            y_G_sub = model_G.pqmf_analysis(y_G)

        if global_step > c.steps_to_start_discriminator:

            # run D with or without cond. features
            if len(signature(model_D.forward).parameters) == 2:
                D_out_fake = model_D(y_hat, c_G)
            else:
                D_out_fake = model_D(y_hat)
            D_out_real = None

            if c.use_feat_match_loss:
                with torch.no_grad():
                    D_out_real = model_D(y_G)

            # format D outputs
            if isinstance(D_out_fake, tuple):
                scores_fake, feats_fake = D_out_fake
                if D_out_real is None:
                    feats_real = None
                else:
                    _, feats_real = D_out_real
            else:
                scores_fake = D_out_fake
        else:
            scores_fake, feats_fake, feats_real = None, None, None

        # compute losses
        loss_G_dict = criterion_G(y_hat, y_G, scores_fake, feats_fake,
                                  feats_real, y_hat_sub, y_G_sub)
        loss_G = loss_G_dict['G_loss']

        # optimizer generator
        optimizer_G.zero_grad()
        loss_G.backward()
        if c.gen_clip_grad > 0:
            torch.nn.utils.clip_grad_norm_(model_G.parameters(),
                                           c.gen_clip_grad)
        optimizer_G.step()
        if scheduler_G is not None:
            scheduler_G.step()

        loss_dict = dict()
        for key, value in loss_G_dict.items():
            if isinstance(value, int):
                loss_dict[key] = value
            else:
                loss_dict[key] = value.item()

        ##############################
        # DISCRIMINATOR
        ##############################
        if global_step >= c.steps_to_start_discriminator:
            # discriminator pass
            with torch.no_grad():
                y_hat = model_G(c_D)

            # PQMF formatting
            if y_hat.shape[1] > 1:
                y_hat = model_G.pqmf_synthesis(y_hat)

            # run D with or without cond. features
            if len(signature(model_D.forward).parameters) == 2:
                D_out_fake = model_D(y_hat.detach(), c_D)
                D_out_real = model_D(y_D, c_D)
            else:
                D_out_fake = model_D(y_hat.detach())
                D_out_real = model_D(y_D)

            # format D outputs
            if isinstance(D_out_fake, tuple):
                scores_fake, feats_fake = D_out_fake
                if D_out_real is None:
                    scores_real, feats_real = None, None
                else:
                    scores_real, feats_real = D_out_real
            else:
                scores_fake = D_out_fake
                scores_real = D_out_real

            # compute losses
            loss_D_dict = criterion_D(scores_fake, scores_real)
            loss_D = loss_D_dict['D_loss']

            # optimizer discriminator
            optimizer_D.zero_grad()
            loss_D.backward()
            if c.disc_clip_grad > 0:
                torch.nn.utils.clip_grad_norm_(model_D.parameters(),
                                               c.disc_clip_grad)
            optimizer_D.step()
            if scheduler_D is not None:
                scheduler_D.step()

            for key, value in loss_D_dict.items():
                if isinstance(value, (int, float)):
                    loss_dict[key] = value
                else:
                    loss_dict[key] = value.item()

        step_time = time.time() - start_time
        epoch_time += step_time

        # get current learning rates
        current_lr_G = list(optimizer_G.param_groups)[0]['lr']
        current_lr_D = list(optimizer_D.param_groups)[0]['lr']

        # update avg stats
        update_train_values = dict()
        for key, value in loss_dict.items():
            update_train_values['avg_' + key] = value
        update_train_values['avg_loader_time'] = loader_time
        update_train_values['avg_step_time'] = step_time
        keep_avg.update_values(update_train_values)

        # print training stats
        if global_step % c.print_step == 0:
            c_logger.print_train_step(batch_n_iter, num_iter, global_step,
                                      step_time, loader_time, current_lr_G,
                                      current_lr_D, loss_dict,
                                      keep_avg.avg_values)

        # plot step stats
        if global_step % 10 == 0:
            iter_stats = {
                "lr_G": current_lr_G,
                "lr_D": current_lr_D,
                "step_time": step_time
            }
            iter_stats.update(loss_dict)
            tb_logger.tb_train_iter_stats(global_step, iter_stats)

        # save checkpoint
        if global_step % c.save_step == 0:
            if c.checkpoint:
                # save model
                save_checkpoint(model_G,
                                optimizer_G,
                                scheduler_G,
                                model_D,
                                optimizer_D,
                                scheduler_D,
                                global_step,
                                epoch,
                                OUT_PATH,
                                model_losses=loss_dict)

            # compute spectrograms
            figures = plot_results(y_hat_vis, y_G, ap, global_step,
                                   'train')
            tb_logger.tb_train_figures(global_step, figures)

            # Sample audio
            sample_voice = y_hat_vis[0].squeeze(0).detach().cpu().numpy()
            tb_logger.tb_train_audios(global_step,
                                      {'train/audio': sample_voice},
                                      c.audio["sample_rate"])
        end_time = time.time()

    # print epoch stats
    c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg)

    # Plot Training Epoch Stats
    epoch_stats = {"epoch_time": epoch_time}
    epoch_stats.update(keep_avg.avg_values)
    tb_logger.tb_train_epoch_stats(global_step, epoch_stats)
    # TODO: plot model stats
    # if c.tb_model_param_stats:
    #     tb_logger.tb_model_weights(model, global_step)
    return keep_avg.avg_values, global_step


@torch.no_grad()
def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch):
    data_loader = setup_loader(ap, is_val=True, verbose=(epoch == 0))
    model_G.eval()
    model_D.eval()
    epoch_time = 0
    keep_avg = KeepAverage()
    end_time = time.time()
    c_logger.print_eval_start()
    for num_iter, data in enumerate(data_loader):
        start_time = time.time()

        # format data
        c_G, y_G, _, _ = format_data(data)
        loader_time = time.time() - end_time

        global_step += 1

        ##############################
        # GENERATOR
        ##############################

        # generator pass
        y_hat = model_G(c_G)
        y_hat_sub = None
        y_G_sub = None

        # PQMF formatting
        if y_hat.shape[1] > 1:
            y_hat_sub = y_hat
            y_hat = model_G.pqmf_synthesis(y_hat)
            y_G_sub = model_G.pqmf_analysis(y_G)

        if global_step > c.steps_to_start_discriminator:

            # run D with or without cond. features
            if len(signature(model_D.forward).parameters) == 2:
                D_out_fake = model_D(y_hat, c_G)
            else:
                D_out_fake = model_D(y_hat)
            D_out_real = None

            if c.use_feat_match_loss:
                with torch.no_grad():
                    D_out_real = model_D(y_G)

            # format D outputs
            if isinstance(D_out_fake, tuple):
                scores_fake, feats_fake = D_out_fake
                if D_out_real is None:
                    feats_real = None
                else:
                    _, feats_real = D_out_real
            else:
                scores_fake = D_out_fake
        else:
            scores_fake, feats_fake, feats_real = None, None, None

        # compute losses
        loss_G_dict = criterion_G(y_hat, y_G, scores_fake, feats_fake,
                                  feats_real, y_hat_sub, y_G_sub)

        loss_dict = dict()
        for key, value in loss_G_dict.items():
            if isinstance(value, (int, float)):
                loss_dict[key] = value
            else:
                loss_dict[key] = value.item()

        ##############################
        # DISCRIMINATOR
        ##############################

        if global_step >= c.steps_to_start_discriminator:
            # discriminator pass
            with torch.no_grad():
                y_hat = model_G(c_G)

            # PQMF formatting
            if y_hat.shape[1] > 1:
                y_hat = model_G.pqmf_synthesis(y_hat)

            # run D with or without cond. features
            if len(signature(model_D.forward).parameters) == 2:
                D_out_fake = model_D(y_hat.detach(), c_G)
                D_out_real = model_D(y_G, c_G)
            else:
                D_out_fake = model_D(y_hat.detach())
                D_out_real = model_D(y_G)

            # format D outputs
            if isinstance(D_out_fake, tuple):
                scores_fake, feats_fake = D_out_fake
                if D_out_real is None:
                    scores_real, feats_real = None, None
                else:
                    scores_real, feats_real = D_out_real
            else:
                scores_fake = D_out_fake
                scores_real = D_out_real

            # compute losses
            loss_D_dict = criterion_D(scores_fake, scores_real)

            for key, value in loss_D_dict.items():
                if isinstance(value, (int, float)):
                    loss_dict[key] = value
                else:
                    loss_dict[key] = value.item()

        step_time = time.time() - start_time
        epoch_time += step_time

        # update avg stats
        update_eval_values = dict()
        for key, value in loss_dict.items():
            update_eval_values['avg_' + key] = value
        update_eval_values['avg_loader_time'] = loader_time
        update_eval_values['avg_step_time'] = step_time
        keep_avg.update_values(update_eval_values)

        # print eval stats
        if c.print_eval:
            c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values)

    # compute spectrograms
    figures = plot_results(y_hat, y_G, ap, global_step, 'eval')
    tb_logger.tb_eval_figures(global_step, figures)

    # Sample audio
    sample_voice = y_hat[0].squeeze(0).detach().cpu().numpy()
    tb_logger.tb_eval_audios(global_step, {'eval/audio': sample_voice},
                             c.audio["sample_rate"])

    # synthesize a full voice
    data_loader.return_segments = False

    tb_logger.tb_eval_stats(global_step, keep_avg.avg_values)

    return keep_avg.avg_values


# FIXME: move args definition/parsing inside of main?
def main(args):  # pylint: disable=redefined-outer-name
    # pylint: disable=global-variable-undefined
    global train_data, eval_data
    print(f" > Loading wavs from: {c.data_path}")
    if c.feature_path is not None:
        print(f" > Loading features from: {c.feature_path}")
        eval_data, train_data = load_wav_feat_data(c.data_path, c.feature_path, c.eval_split_size)
    else:
        eval_data, train_data = load_wav_data(c.data_path, c.eval_split_size)

    # setup audio processor
    ap = AudioProcessor(**c.audio)

    # DISTRIBUTED
    # if num_gpus > 1:
    #     init_distributed(args.rank, num_gpus, args.group_id,
    #                      c.distributed["backend"], c.distributed["url"])

    # setup models
    model_gen = setup_generator(c)
    model_disc = setup_discriminator(c)

    # setup optimizers
    optimizer_gen = RAdam(model_gen.parameters(), lr=c.lr_gen, weight_decay=0)
    optimizer_disc = RAdam(model_disc.parameters(),
                           lr=c.lr_disc,
                           weight_decay=0)

    # schedulers
    scheduler_gen = None
    scheduler_disc = None
    if 'lr_scheduler_gen' in c:
        scheduler_gen = getattr(torch.optim.lr_scheduler, c.lr_scheduler_gen)
        scheduler_gen = scheduler_gen(optimizer_gen, **c.lr_scheduler_gen_params)
    if 'lr_scheduler_disc' in c:
        scheduler_disc = getattr(torch.optim.lr_scheduler, c.lr_scheduler_disc)
        scheduler_disc = scheduler_disc(optimizer_disc, **c.lr_scheduler_disc_params)

    # setup criterion
    criterion_gen = GeneratorLoss(c)
    criterion_disc = DiscriminatorLoss(c)

    if args.restore_path:
        checkpoint = torch.load(args.restore_path, map_location='cpu')
        try:
            print(" > Restoring Generator Model...")
            model_gen.load_state_dict(checkpoint['model'])
            print(" > Restoring Generator Optimizer...")
            optimizer_gen.load_state_dict(checkpoint['optimizer'])
            print(" > Restoring Discriminator Model...")
            model_disc.load_state_dict(checkpoint['model_disc'])
            print(" > Restoring Discriminator Optimizer...")
            optimizer_disc.load_state_dict(checkpoint['optimizer_disc'])
            if 'scheduler' in checkpoint:
                print(" > Restoring Generator LR Scheduler...")
                scheduler_gen.load_state_dict(checkpoint['scheduler'])
                # NOTE: Not sure if necessary
                scheduler_gen.optimizer = optimizer_gen
            if 'scheduler_disc' in checkpoint:
                print(" > Restoring Discriminator LR Scheduler...")
                scheduler_disc.load_state_dict(checkpoint['scheduler_disc'])
                scheduler_disc.optimizer = optimizer_disc
        except RuntimeError:
            # restore only matching layers.
            print(" > Partial model initialization...")
            model_dict = model_gen.state_dict()
            model_dict = set_init_dict(model_dict, checkpoint['model'], c)
            model_gen.load_state_dict(model_dict)

            model_dict = model_disc.state_dict()
            model_dict = set_init_dict(model_dict, checkpoint['model_disc'], c)
            model_disc.load_state_dict(model_dict)
            del model_dict

        # reset lr if not continuing training.
        for group in optimizer_gen.param_groups:
            group['lr'] = c.lr_gen

        for group in optimizer_disc.param_groups:
            group['lr'] = c.lr_disc

        print(" > Model restored from step %d" % checkpoint['step'],
              flush=True)
        args.restore_step = checkpoint['step']
    else:
        args.restore_step = 0

    if use_cuda:
        model_gen.cuda()
        criterion_gen.cuda()
        model_disc.cuda()
        criterion_disc.cuda()

    # DISTRIBUTED
    # if num_gpus > 1:
    #     model = apply_gradient_allreduce(model)

    num_params = count_parameters(model_gen)
    print(" > Generator has {} parameters".format(num_params), flush=True)
    num_params = count_parameters(model_disc)
    print(" > Discriminator has {} parameters".format(num_params), flush=True)

    if 'best_loss' not in locals():
        best_loss = float('inf')

    global_step = args.restore_step
    for epoch in range(0, c.epochs):
        c_logger.print_epoch_start(epoch, c.epochs)
        _, global_step = train(model_gen, criterion_gen, optimizer_gen,
                               model_disc, criterion_disc, optimizer_disc,
                               scheduler_gen, scheduler_disc, ap, global_step,
                               epoch)
        eval_avg_loss_dict = evaluate(model_gen, criterion_gen, model_disc, criterion_disc, ap,
                                      global_step, epoch)
        c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
        target_loss = eval_avg_loss_dict[c.target_loss]
        best_loss = save_best_model(target_loss,
                                    best_loss,
                                    model_gen,
                                    optimizer_gen,
                                    scheduler_gen,
                                    model_disc,
                                    optimizer_disc,
                                    scheduler_disc,
                                    global_step,
                                    epoch,
                                    OUT_PATH,
                                    model_losses=eval_avg_loss_dict)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--continue_path',
        type=str,
        help=
        'Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.',
        default='',
        required='--config_path' not in sys.argv)
    parser.add_argument(
        '--restore_path',
        type=str,
        help='Model file to be restored. Use to finetune a model.',
        default='')
    parser.add_argument('--config_path',
                        type=str,
                        help='Path to config file for training.',
                        required='--continue_path' not in sys.argv)
    parser.add_argument('--debug',
                        type=bool,
                        default=False,
                        help='Do not verify commit integrity to run training.')

    # DISTRIBUTED
    parser.add_argument(
        '--rank',
        type=int,
        default=0,
        help='DISTRIBUTED: process rank for distributed training.')
    parser.add_argument('--group_id',
                        type=str,
                        default="",
                        help='DISTRIBUTED: process group id.')
    args = parser.parse_args()

    if args.continue_path != '':
        args.output_path = args.continue_path
        args.config_path = os.path.join(args.continue_path, 'config.json')
        list_of_files = glob.glob(
            args.continue_path +
            "/*.pth.tar")  # * means all if need specific format then *.csv
        latest_model_file = max(list_of_files, key=os.path.getctime)
        args.restore_path = latest_model_file
        print(f" > Training continues for {args.restore_path}")

    # setup output paths and read configs
    c = load_config(args.config_path)
    check_config(c)
    _ = os.path.dirname(os.path.realpath(__file__))

    OUT_PATH = args.continue_path
    if args.continue_path == '':
        OUT_PATH = create_experiment_folder(c.output_path, c.run_name,
                                            args.debug)

    AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios')

    c_logger = ConsoleLogger()

    if args.rank == 0:
        os.makedirs(AUDIO_PATH, exist_ok=True)
        new_fields = {}
        if args.restore_path:
            new_fields["restore_path"] = args.restore_path
        new_fields["github_branch"] = get_git_branch()
        copy_config_file(args.config_path,
                         os.path.join(OUT_PATH, 'config.json'), new_fields)
        os.chmod(AUDIO_PATH, 0o775)
        os.chmod(OUT_PATH, 0o775)

    LOG_DIR = OUT_PATH
    tb_logger = TensorboardLogger(LOG_DIR, model_name='VOCODER')

    # write model desc to tensorboard
    tb_logger.tb_add_text('model-description', c['run_description'], 0)

    try:
        main(args)
    except KeyboardInterrupt:
        remove_experiment_folder(OUT_PATH)
        try:
            sys.exit(0)
        except SystemExit:
            os._exit(0)  # pylint: disable=protected-access
    except Exception:  # pylint: disable=broad-except
        remove_experiment_folder(OUT_PATH)
        traceback.print_exc()
        sys.exit(1)
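For reference, a hedged sketch of how this training entry point might be launched; the script name here is a placeholder (the file name is not visible in this diff) and the config path is illustrative. Only the --config_path and --continue_path flags come from the argument parser above:

python train_vocoder.py --config_path config.json
python train_vocoder.py --continue_path /path/to/previous/run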
@@ -0,0 +1,97 @@
import datetime
from TTS.utils.io import AttrDict


tcolors = AttrDict({
    'OKBLUE': '\033[94m',
    'HEADER': '\033[95m',
    'OKGREEN': '\033[92m',
    'WARNING': '\033[93m',
    'FAIL': '\033[91m',
    'ENDC': '\033[0m',
    'BOLD': '\033[1m',
    'UNDERLINE': '\033[4m'
})


class ConsoleLogger():
    # TODO: merge this with TTS ConsoleLogger
    def __init__(self):
        # use these to compare values between iterations
        self.old_train_loss_dict = None
        self.old_epoch_loss_dict = None
        self.old_eval_loss_dict = None

    # pylint: disable=no-self-use
    def get_time(self):
        now = datetime.datetime.now()
        return now.strftime("%Y-%m-%d %H:%M:%S")

    def print_epoch_start(self, epoch, max_epoch):
        print("\n{}{} > EPOCH: {}/{}{}".format(tcolors.UNDERLINE, tcolors.BOLD,
                                               epoch, max_epoch, tcolors.ENDC),
              flush=True)

    def print_train_start(self):
        print(f"\n{tcolors.BOLD} > TRAINING ({self.get_time()}) {tcolors.ENDC}")

    def print_train_step(self, batch_steps, step, global_step,
                         step_time, loader_time, lrG, lrD,
                         loss_dict, avg_loss_dict):
        indent = " | > "
        print()
        log_text = "{} --> STEP: {}/{} -- GLOBAL_STEP: {}{}\n".format(
            tcolors.BOLD, step, batch_steps, global_step, tcolors.ENDC)
        for key, value in loss_dict.items():
            # print the avg value if given
            if f'avg_{key}' in avg_loss_dict.keys():
                log_text += "{}{}: {:.5f} ({:.5f})\n".format(indent, key, value, avg_loss_dict[f'avg_{key}'])
            else:
                log_text += "{}{}: {:.5f} \n".format(indent, key, value)
        log_text += f"{indent}step_time: {step_time:.2f}\n{indent}loader_time: {loader_time:.2f}\n{indent}lrG: {lrG}\n{indent}lrD: {lrD}"
        print(log_text, flush=True)

    # pylint: disable=unused-argument
    def print_train_epoch_end(self, global_step, epoch, epoch_time,
                              print_dict):
        indent = " | > "
        log_text = f"\n{tcolors.BOLD} --> TRAIN PERFORMANCE -- EPOCH TIME: {epoch_time:.2f} sec -- GLOBAL_STEP: {global_step}{tcolors.ENDC}\n"
        for key, value in print_dict.items():
            log_text += "{}{}: {:.5f}\n".format(indent, key, value)
        print(log_text, flush=True)

    def print_eval_start(self):
        print(f"{tcolors.BOLD} > EVALUATION {tcolors.ENDC}\n")

    def print_eval_step(self, step, loss_dict, avg_loss_dict):
        indent = " | > "
        print()
        log_text = f"{tcolors.BOLD} --> STEP: {step}{tcolors.ENDC}\n"
        for key, value in loss_dict.items():
            # print the avg value if given
            if f'avg_{key}' in avg_loss_dict.keys():
                log_text += "{}{}: {:.5f} ({:.5f})\n".format(indent, key, value, avg_loss_dict[f'avg_{key}'])
            else:
                log_text += "{}{}: {:.5f} \n".format(indent, key, value)
        print(log_text, flush=True)

    def print_epoch_end(self, epoch, avg_loss_dict):
        indent = " | > "
        log_text = " {}--> EVAL PERFORMANCE{}\n".format(
            tcolors.BOLD, tcolors.ENDC)
        for key, value in avg_loss_dict.items():
            # color the value by its change since the previous epoch
            color = ''
            sign = '+'
            diff = 0
            if self.old_eval_loss_dict is not None and key in self.old_eval_loss_dict:
                diff = value - self.old_eval_loss_dict[key]
                if diff < 0:
                    color = tcolors.OKGREEN
                    sign = ''
                elif diff > 0:
                    color = tcolors.FAIL
                    sign = '+'
            log_text += "{}{}:{} {:.5f} {}({}{:.5f})\n".format(indent, key, color, value, tcolors.ENDC, sign, diff)
        self.old_eval_loss_dict = avg_loss_dict
        print(log_text, flush=True)
@@ -0,0 +1,102 @@
import re
import importlib
import numpy as np
from matplotlib import pyplot as plt

from TTS.utils.visual import plot_spectrogram


def plot_results(y_hat, y, ap, global_step, name_prefix):
    """ Plot vocoder model results """

    # select an instance from batch
    y_hat = y_hat[0].squeeze(0).detach().cpu().numpy()
    y = y[0].squeeze(0).detach().cpu().numpy()

    spec_fake = ap.melspectrogram(y_hat).T
    spec_real = ap.melspectrogram(y).T
    spec_diff = np.abs(spec_fake - spec_real)

    # plot figure and save it
    fig_wave = plt.figure()
    plt.subplot(2, 1, 1)
    plt.plot(y)
    plt.title("groundtruth speech")
    plt.subplot(2, 1, 2)
    plt.plot(y_hat)
    plt.title(f"generated speech @ {global_step} steps")
    plt.tight_layout()
    plt.close()

    figures = {
        name_prefix + "spectrogram/fake": plot_spectrogram(spec_fake),
        name_prefix + "spectrogram/real": plot_spectrogram(spec_real),
        name_prefix + "spectrogram/diff": plot_spectrogram(spec_diff),
        name_prefix + "speech_comparison": fig_wave,
    }
    return figures


def to_camel(text):
    text = text.capitalize()
    return re.sub(r'(?!^)_([a-zA-Z])', lambda m: m.group(1).upper(), text)


def setup_generator(c):
    print(" > Generator Model: {}".format(c.generator_model))
    MyModel = importlib.import_module('TTS.vocoder.models.' +
                                      c.generator_model.lower())
    MyModel = getattr(MyModel, to_camel(c.generator_model))
    if c.generator_model in ['melgan_generator']:
        model = MyModel(
            in_channels=c.audio['num_mels'],
            out_channels=1,
            proj_kernel=7,
            base_channels=512,
            upsample_factors=c.generator_model_params['upsample_factors'],
            res_kernel=3,
            num_res_blocks=c.generator_model_params['num_res_blocks'])
    if c.generator_model in ['melgan_fb_generator']:
        pass
    if c.generator_model in ['multiband_melgan_generator']:
        model = MyModel(
            in_channels=c.audio['num_mels'],
            out_channels=4,
            proj_kernel=7,
            base_channels=384,
            upsample_factors=c.generator_model_params['upsample_factors'],
            res_kernel=3,
            num_res_blocks=c.generator_model_params['num_res_blocks'])
    return model


def setup_discriminator(c):
    print(" > Discriminator Model: {}".format(c.discriminator_model))
    MyModel = importlib.import_module('TTS.vocoder.models.' +
                                      c.discriminator_model.lower())
    MyModel = getattr(MyModel, to_camel(c.discriminator_model))
    if c.discriminator_model in ['random_window_discriminator']:
        model = MyModel(
            cond_channels=c.audio['num_mels'],
            hop_length=c.audio['hop_length'],
            uncond_disc_donwsample_factors=c.
            discriminator_model_params['uncond_disc_donwsample_factors'],
            cond_disc_downsample_factors=c.
            discriminator_model_params['cond_disc_downsample_factors'],
            cond_disc_out_channels=c.
            discriminator_model_params['cond_disc_out_channels'],
            window_sizes=c.discriminator_model_params['window_sizes'])
    if c.discriminator_model in ['melgan_multiscale_discriminator']:
        model = MyModel(
            in_channels=1,
            out_channels=1,
            kernel_sizes=(5, 3),
            base_channels=c.discriminator_model_params['base_channels'],
            max_channels=c.discriminator_model_params['max_channels'],
            downsample_factors=c.
            discriminator_model_params['downsample_factors'])
    return model


def check_config(c):
    pass
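A hedged sketch of the config fields these factory functions read. Only the key names come from the code above; the values are illustrative placeholders, not taken from any shipped config:

from TTS.utils.io import load_config
from TTS.vocoder.utils.generic_utils import setup_generator, setup_discriminator

c = load_config('config.json')  # placeholder path
# keys consumed by the factories above (example values only):
#   c.generator_model = "multiband_melgan_generator"
#   c.generator_model_params = {"upsample_factors": [2, 8, 2, 2], "num_res_blocks": 4}
#   c.discriminator_model = "melgan_multiscale_discriminator"
#   c.discriminator_model_params = {"base_channels": 16, "max_channels": 512,
#                                   "downsample_factors": [4, 4, 4]}
model_G = setup_generator(c)
model_D = setup_discriminator(c)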
@@ -0,0 +1,63 @@
import os
import torch
import datetime


def save_model(model, optimizer, scheduler, model_disc, optimizer_disc,
               scheduler_disc, current_step, epoch, output_path, **kwargs):
    model_state = model.state_dict()
    model_disc_state = model_disc.state_dict()\
        if model_disc is not None else None
    optimizer_state = optimizer.state_dict()\
        if optimizer is not None else None
    optimizer_disc_state = optimizer_disc.state_dict()\
        if optimizer_disc is not None else None
    scheduler_state = scheduler.state_dict()\
        if scheduler is not None else None
    scheduler_disc_state = scheduler_disc.state_dict()\
        if scheduler_disc is not None else None
    state = {
        'model': model_state,
        'optimizer': optimizer_state,
        'scheduler': scheduler_state,
        'model_disc': model_disc_state,
        'optimizer_disc': optimizer_disc_state,
        'scheduler_disc': scheduler_disc_state,
        'step': current_step,
        'epoch': epoch,
        'date': datetime.date.today().strftime("%B %d, %Y"),
    }
    state.update(kwargs)
    torch.save(state, output_path)


def save_checkpoint(model, optimizer, scheduler, model_disc, optimizer_disc,
                    scheduler_disc, current_step, epoch, output_folder,
                    **kwargs):
    file_name = 'checkpoint_{}.pth.tar'.format(current_step)
    checkpoint_path = os.path.join(output_folder, file_name)
    print(" > CHECKPOINT : {}".format(checkpoint_path))
    save_model(model, optimizer, scheduler, model_disc, optimizer_disc,
               scheduler_disc, current_step, epoch, checkpoint_path, **kwargs)


def save_best_model(target_loss, best_loss, model, optimizer, scheduler,
                    model_disc, optimizer_disc, scheduler_disc, current_step,
                    epoch, output_folder, **kwargs):
    if target_loss < best_loss:
        file_name = 'best_model.pth.tar'
        checkpoint_path = os.path.join(output_folder, file_name)
        print(" > BEST MODEL : {}".format(checkpoint_path))
        save_model(model,
                   optimizer,
                   scheduler,
                   model_disc,
                   optimizer_disc,
                   scheduler_disc,
                   current_step,
                   epoch,
                   checkpoint_path,
                   model_loss=target_loss,
                   **kwargs)
        best_loss = target_loss
    return best_loss