Merge pull request #258 from mozilla/dev

Dev
Commit 83f73861bd, authored by Eren Gölge on 2019-09-10 13:28:08 +03:00, committed by GitHub.
51 changed files with 790 additions and 555 deletions

View File

@@ -4,13 +4,13 @@ yes | apt-get install ffmpeg
 yes | apt-get install espeak
 yes | apt-get install tmux
 yes | apt-get install zsh
-pip3 install https://download.pytorch.org/whl/cu100/torch-1.1.0-cp37-cp37m-linux_x86_64.whl
+# pip3 install https://download.pytorch.org/whl/cu100/torch-1.1.0-cp37-cp37m-linux_x86_64.whl
 # wget https://www.dropbox.com/s/m8waow6b3ydpf6h/MozillaDataset.tar.gz?dl=0 -O /data/rw/home/mozilla.tar
 wget https://www.dropbox.com/s/wqn5v3wkktw9lmo/install.sh?dl=0 -O install.sh
 sudo sh install.sh
 python3 setup.py develop
 # cp -R ${USER_DIR}/GermanData ../tmp/
-python3 distribute.py --config_path config_libritts.json --data_path /data/rw/home/LibriTTS/train-clean-360/
+# python3 distribute.py --config_path config.json --data_path /data/ro/shared/data/keithito/LJSpeech-1.1/
 # cp -R ${USER_DIR}/Mozilla_22050 ../tmp/
 # python3 distribute.py --config_path config_tacotron_gst.json --data_path ../tmp/Mozilla_22050/
 while true; do sleep 1000000; done

View File

@@ -11,5 +11,7 @@ fi
 if [[ "$TEST_SUITE" == "unittest" ]]; then
     # Run tests on all pushes
+    pushd tts_namespace
     python -m unittest
+    popd
 fi
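The `pushd tts_namespace` wrapper above pairs with the import changes later in this commit (`from utils.*` becoming `from TTS.utils.*`) and with the new root-level `__init__.py` added below: tests now run from a directory where the code is importable as a top-level `TTS` package. The repository layout is not shown in this diff, so the sketch below is only a guess at what `tts_namespace` provides.

```python
# Hypothetical sketch (not taken from this PR): a wrapper directory containing a
# symlink named TTS that points back at the repository root, so that running
# `python -m unittest` from inside it resolves `from TTS.utils... import ...`.
import os

repo_root = os.path.abspath(".")                      # assumption: current dir is the repo root
namespace_dir = os.path.join(repo_root, "tts_namespace")
os.makedirs(namespace_dir, exist_ok=True)

link = os.path.join(namespace_dir, "TTS")
if not os.path.lexists(link):
    os.symlink(repo_root, link)                       # tts_namespace/TTS -> <repo root>
```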

View File

@@ -10,9 +10,9 @@ TTS includes two different model implementations which are based on [Tacotron](h
 If you are new, you can also find [here](http://www.erogol.com/text-speech-deep-learning-architectures/) a brief post about TTS architectures and their comparisons.
 ## TTS Performance
-<p align="center"><img src="https://user-images.githubusercontent.com/1402048/56998082-36d43500-6baa-11e9-8ca3-6c91d3a747bf.png"/></p>
+<p align="center"><img src="https://camo.githubusercontent.com/9fa79f977015e55eb9ec7aa32045555f60d093d3/68747470733a2f2f646973636f757273652d706161732d70726f64756374696f6e2d636f6e74656e742e73332e6475616c737461636b2e75732d656173742d312e616d617a6f6e6177732e636f6d2f6f7074696d697a65642f33582f362f342f363432386639383065396563373531633234386535393134363038393566373838316165633063365f325f363930783339342e706e67"/></p>
-[Details...](https://github.com/mozilla/TTS/issues/186)
+[Details...](https://github.com/mozilla/TTS/wiki/Mean-Opinion-Score-Results)
 ## Requirements and Installation
 Highly recommended to use [miniconda](https://conda.io/miniconda.html) for easier installation.

__init__.py (new, empty file)

View File

@@ -1,6 +1,6 @@
 {
-    "run_name": "mozilla-no-loc-fattn-stopnet-sigmoid-loss_masking",
-    "run_description": "using forward attention, with original prenet, loss masking,separate stopnet, sigmoid. Compare this with 4817. Pytorch DPP",
+    "run_name": "ljspeech",
+    "run_description": "gradual training with prenet frame size 1 + no maxout for cbhg + symmetric norm.",
     "audio":{
         // Audio processing parameters
@@ -16,8 +16,8 @@
         "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
         // Normalization parameters
         "signal_norm": true, // normalize the spec values in range [0, 1]
-        "symmetric_norm": false, // move normalization to range [-1, 1]
-        "max_norm": 1, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
+        "symmetric_norm": true, // move normalization to range [-1, 1]
+        "max_norm": 4, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
         "clip_norm": true, // clip normalized values into the range.
         "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
         "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
@@ -31,44 +31,45 @@
     "reinit_layers": [],
-    "model": "Tacotron2", // one of the model in models/
+    "model": "Tacotron", // one of the model in models/
     "grad_clip": 1, // upper limit for gradients for clipping.
     "epochs": 1000, // total number of epochs to train.
     "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
     "lr_decay": false, // if true, Noam learning rate decaying is applied through training.
     "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
-    "windowing": false, // Enables attention windowing. Used only in eval mode.
-    "memory_size": 5, // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5.
+    "memory_size": -1, // ONLY TACOTRON - size of the memory queue used fro storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame.
     "attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
-    "prenet_type": "original", // ONLY TACOTRON2 - "original" or "bn".
-    "prenet_dropout": true, // ONLY TACOTRON2 - enable/disable dropout at prenet.
-    "use_forward_attn": true, // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster.
+    "prenet_type": "original", // "original" or "bn".
+    "prenet_dropout": true, // enable/disable dropout at prenet.
+    "windowing": false, // Enables attention windowing. Used only in eval mode.
+    "use_forward_attn": false, // if it uses forward attention. In general, it aligns faster.
     "forward_attn_mask": false,
-    "transition_agent": false, // ONLY TACOTRON2 - enable/disable transition agent of forward attention.
-    "location_attn": false, // ONLY TACOTRON2 - enable_disable location sensitive attention. It is enabled for TACOTRON by default.
+    "transition_agent": false, // enable/disable transition agent of forward attention.
+    "location_attn": true, // enable_disable location sensitive attention. It is enabled for TACOTRON by default.
     "loss_masking": true, // enable / disable loss masking against the sequence padding.
     "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
     "stopnet": true, // Train stopnet predicting the end of synthesis.
     "separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER.
     "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
-    "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention.
+    "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
     "eval_batch_size":16,
-    "r": 1, // Number of frames to predict for step.
+    "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
+    "gradual_training": [[0, 7, 32], [10000, 5, 32], [50000, 3, 32], [130000, 2, 16], [290000, 1, 8]], // ONLY TACOTRON - set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled.
     "wd": 0.000001, // Weight decay weight.
     "checkpoint": true, // If true, it saves checkpoints per "save_step"
-    "save_step": 1000, // Number of training steps expected to save traning stats and checkpoints.
-    "print_step": 10, // Number of steps to log traning on console.
+    "save_step": 10000, // Number of training steps expected to save traning stats and checkpoints.
+    "print_step": 25, // Number of steps to log traning on console.
     "batch_group_size": 0, //Number of batches to shuffle after bucketing.
     "run_eval": true,
     "test_delay_epochs": 5, //Until attention is aligned, testing only wastes computation time.
     "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
-    "data_path": "/media/erogol/data_ssd/Data/Mozilla/", // DATASET-RELATED: can overwritten from command argument
-    "meta_file_train": "metadata_train.txt", // DATASET-RELATED: metafile for training dataloader.
-    "meta_file_val": "metadata_val.txt", // DATASET-RELATED: metafile for evaluation dataloader.
-    "dataset": "mozilla", // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py
-    "min_seq_len": 0, // DATASET-RELATED: minimum text length to use in training
+    "data_path": "/home/erogol/Data/LJSpeech-1.1/", // DATASET-RELATED: can overwritten from command argument
+    "meta_file_train": "metadata_train.csv", // DATASET-RELATED: metafile for training dataloader.
+    "meta_file_val": "metadata_val.csv", // DATASET-RELATED: metafile for evaluation dataloader.
+    "dataset": "ljspeech", // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py
+    "min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training
     "max_seq_len": 150, // DATASET-RELATED: maximum text length
     "output_path": "../keep/", // DATASET-RELATED: output path for all training outputs.
     "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values.
@@ -77,6 +78,7 @@
     "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation.
     "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
     "text_cleaner": "phoneme_cleaners",
-    "use_speaker_embedding": false // whether to use additional embeddings for separate speakers
+    "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning.
+    "style_wav_for_test": null // path to style wav file to be used in TacotronGST inference.
 }
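A hedged sketch of how a `gradual_training` schedule like the one above can be applied: pick the last entry whose `first_step` is at or below the current global step, then update the reduction factor `r` (the commit adds a `Decoder.set_r()` for this further down) and the batch size. The helper name and training-loop wiring here are illustrative, not the project's trainer code.

```python
# Illustrative schedule lookup for gradual training: [first_step, r, batch_size].
def gradual_training_params(global_step, schedule):
    """Return (r, batch_size) for the current global step."""
    new_r, new_batch_size = schedule[0][1], schedule[0][2]
    for first_step, r, batch_size in schedule:
        if global_step >= first_step:
            new_r, new_batch_size = r, batch_size
    return new_r, new_batch_size

schedule = [[0, 7, 32], [10000, 5, 32], [50000, 3, 32], [130000, 2, 16], [290000, 1, 8]]
r, batch_size = gradual_training_params(120000, schedule)  # -> (3, 32)
# model.decoder.set_r(r)  # set_r() is introduced in layers/tacotron.py below
```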

View File

@@ -1,41 +0,0 @@ (entire file deleted)
{
"model_name": "TTS-larger-kusal",
"audio_processor": "audio",
"num_mels": 80,
"num_freq": 1025,
"sample_rate": 22000,
"frame_length_ms": 50,
"frame_shift_ms": 12.5,
"preemphasis": 0.97,
"min_mel_freq": 125,
"max_mel_freq": 7600,
"min_level_db": -100,
"ref_level_db": 20,
"embedding_size": 256,
"text_cleaner": "english_cleaners",
"epochs": 1000,
"lr": 0.002,
"lr_decay": 0.5,
"decay_step": 100000,
"warmup_steps": 4000,
"batch_size": 32,
"eval_batch_size":-1,
"r": 5,
"griffin_lim_iters": 60,
"power": 1.5,
"num_loader_workers": 8,
"checkpoint": true,
"save_step": 25000,
"print_step": 10,
"run_eval": false,
"data_path": "/snakepit/shared/data/mycroft/kusal/",
"meta_file_train": "prompts.txt",
"meta_file_val": null,
"dataset": "Kusal",
"min_seq_len": 0,
"output_path": "../keep/"
}

View File

@@ -1,82 +0,0 @@ (entire file deleted)
{
"run_name": "libritts-360",
"run_description": "LibriTTS 360 clean with multi speaker embedding.",
"audio":{
// Audio processing parameters
"num_mels": 80, // size of the mel spec frame.
"num_freq": 1025, // number of stft frequency levels. Size of the linear spectogram frame.
"sample_rate": 16000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
"frame_length_ms": 50, // stft window length in ms.
"frame_shift_ms": 12.5, // stft window hop-lengh in ms.
"preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
"min_level_db": -100, // normalization range
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
"power": 1.5, // value to sharpen wav signals after GL algorithm.
"griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
// Normalization parameters
"signal_norm": true, // normalize the spec values in range [0, 1]
"symmetric_norm": false, // move normalization to range [-1, 1]
"max_norm": 1, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
"clip_norm": true, // clip normalized values into the range.
"mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
"mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
"do_trim_silence": true // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
},
"distributed":{
"backend": "nccl",
"url": "tcp:\/\/localhost:54321"
},
"reinit_layers": [],
"model": "Tacotron2", // one of the model in models/
"grad_clip": 1, // upper limit for gradients for clipping.
"epochs": 1000, // total number of epochs to train.
"lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
"lr_decay": false, // if true, Noam learning rate decaying is applied through training.
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
"memory_size": 5, // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5.
"attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
"prenet_type": "original", // ONLY TACOTRON2 - "original" or "bn".
"prenet_dropout": true, // ONLY TACOTRON2 - enable/disable dropout at prenet.
"windowing": false, // Enables attention windowing. Used only in eval mode.
"use_forward_attn": false, // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster.
"forward_attn_mask": false,
"transition_agent": false, // ONLY TACOTRON2 - enable/disable transition agent of forward attention.
"location_attn": true, // ONLY TACOTRON2 - enable_disable location sensitive attention. It is enabled for TACOTRON by default.
"loss_masking": true, // enable / disable loss masking against the sequence padding.
"enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
"stopnet": true, // Train stopnet predicting the end of synthesis.
"separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER.
"tb_model_param_stats": true, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
"batch_size": 24, // Batch size for training. Lower values than 32 might cause hard to learn attention.
"eval_batch_size":16,
"r": 1, // Number of frames to predict for step.
"wd": 0.000001, // Weight decay weight.
"checkpoint": true, // If true, it saves checkpoints per "save_step"
"save_step": 1000, // Number of training steps expected to save traning stats and checkpoints.
"print_step": 10, // Number of steps to log traning on console.
"batch_group_size": 0, //Number of batches to shuffle after bucketing.
"run_eval": true,
"test_delay_epochs": 5, //Until attention is aligned, testing only wastes computation time.
"test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
"data_path": "/home/erogol/Data/Libri-TTS/train-clean-360/", // DATASET-RELATED: can overwritten from command argument
"meta_file_train": null, // DATASET-RELATED: metafile for training dataloader.
"meta_file_val": null, // DATASET-RELATED: metafile for evaluation dataloader.
"dataset": "libri_tts", // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py
"min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training
"max_seq_len": 150, // DATASET-RELATED: maximum text length
"output_path": "/media/erogol/data_ssd/Models/libri_tts/", // DATASET-RELATED: output path for all training outputs.
"num_loader_workers": 12, // number of training data loader processes. Don't set it too big. 4-8 are good values.
"num_val_loader_workers": 4, // number of evaluation data loader processes.
"phoneme_cache_path": "mozilla_us_phonemes", // phoneme computation is slow, therefore, it caches results in the given folder.
"use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation.
"phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
"text_cleaner": "phoneme_cleaners",
"use_speaker_embedding": true
}

View File

@@ -42,10 +42,10 @@
     "attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
     "prenet_type": "original", // "original" or "bn".
     "prenet_dropout": true, // enable/disable dropout at prenet.
-    "use_forward_attn": true, // if it uses forward attention. In general, it aligns faster.
+    "use_forward_attn": true, // enable/disable forward attention. In general, it aligns faster.
     "forward_attn_mask": false, // Apply forward attention mask af inference to prevent bad modes. Try it if your model does not align well.
     "transition_agent": true, // enable/disable transition agent of forward attention.
-    "location_attn": false, // enable_disable location sensitive attention. It is enabled for TACOTRON by default.
+    "location_attn": false, // enable_disable location sensitive attention.
     "loss_masking": true, // enable / disable loss masking against the sequence padding.
     "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
     "stopnet": true, // Train stopnet predicting the end of synthesis.

View File

@@ -39,12 +39,12 @@
     "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
     "memory_size": 5, // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5.
     "attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
-    "prenet_type": "original", // ONLY TACOTRON2 - "original" or "bn".
-    "prenet_dropout": true, // ONLY TACOTRON2 - enable/disable dropout at prenet.
-    "use_forward_attn": true, // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster.
+    "prenet_type": "original", // "original" or "bn".
+    "prenet_dropout": true, // enable/disable dropout at prenet.
+    "use_forward_attn": true, // enable/disable forward attention. In general, it aligns faster.
     "forward_attn_mask": false, // Apply forward attention mask af inference to prevent bad modes. Try it if your model does not align well.
-    "transition_agent": false, // ONLY TACOTRON2 - enable/disable transition agent of forward attention.
-    "location_attn": false, // ONLY TACOTRON2 - enable_disable location sensitive attention. It is enabled for TACOTRON by default.
+    "transition_agent": false, // enable/disable transition agent of forward attention.
+    "location_attn": false, // enable_disable location sensitive attention. It is enabled for TACOTRON by default.
     "loss_masking": true, // enable / disable loss masking against the sequence padding.
     "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
     "stopnet": true, // Train stopnet predicting the end of synthesis.

View File

@@ -40,12 +40,12 @@
     "windowing": false, // Enables attention windowing. Used only in eval mode.
     "memory_size": 5, // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5.
     "attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
-    "prenet_type": "original", // ONLY TACOTRON2 - "original" or "bn".
-    "prenet_dropout": true, // ONLY TACOTRON2 - enable/disable dropout at prenet.
-    "use_forward_attn": false, // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster.
-    "transition_agent": false, // ONLY TACOTRON2 - enable/disable transition agent of forward attention.
-    "forward_attn_mask": false,
-    "location_attn": true, // ONLY TACOTRON2 - enable_disable location sensitive attention. It is enabled for TACOTRON by default.
+    "prenet_type": "original", // "original" or "bn".
+    "prenet_dropout": true, // enable/disable dropout at prenet.
+    "use_forward_attn": false, // enable/disable forward attention. In general, it aligns faster.
+    "transition_agent": false, // enable/disable transition agent of forward attention.
+    "forward_attn_mask": false, // Apply forward attention mask at inference to prevent bad modes. Try it if your model does not align well.
+    "location_attn": true, // enable_disable location sensitive attention. It is enabled for TACOTRON by default.
     "loss_masking": true, // enable / disable loss masking against the sequence padding.
     "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
     "stopnet": true, // Train stopnet predicting the end of synthesis.

View File

@@ -42,8 +42,8 @@
     "attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
     "prenet_type": "original", // "original" or "bn".
     "prenet_dropout": true, // enable/disable dropout at prenet.
-    "use_forward_attn": true, // if it uses forward attention. In general, it aligns faster.
-    "forward_attn_mask": false, // Apply forward attention mask af inference to prevent bad modes. Try it if your model does not align well.
+    "use_forward_attn": true, // enable/disable forward attention. In general, it aligns faster.
+    "forward_attn_mask": false, // Apply forward attention mask at inference to prevent bad modes. Try it if your model does not align well.
     "transition_agent": false, // enable/disable transition agent of forward attention.
     "location_attn": false, // enable_disable location sensitive attention. It is enabled for TACOTRON by default.
     "loss_masking": true, // enable / disable loss masking against the sequence padding.
@@ -77,6 +77,7 @@
     "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation.
     "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
     "text_cleaner": "phoneme_cleaners",
-    "use_speaker_embedding": false // whether to use additional embeddings for separate speakers
+    "use_speaker_embedding": false, // whether to use additional embeddings for separate speakers
+    "style_wav_for_test": null // path to wav for styling the inference tests when using GST
 }

View File

@@ -5,8 +5,8 @@ import torch
 import random
 from torch.utils.data import Dataset
-from utils.text import text_to_sequence, phoneme_to_sequence, pad_with_eos_bos
-from utils.data import prepare_data, prepare_tensor, prepare_stop_target
+from TTS.utils.text import text_to_sequence, phoneme_to_sequence, pad_with_eos_bos
+from TTS.utils.data import prepare_data, prepare_tensor, prepare_stop_target
 class MyDataset(Dataset):
@@ -102,7 +102,7 @@ class MyDataset(Dataset):
                 cache_path)
             if self.enable_eos_bos:
                 phonemes = pad_with_eos_bos(phonemes)
+            phonemes = np.asarray(phonemes, dtype=np.int32)
         return phonemes
     def load_data(self, idx):

View File

@@ -75,21 +75,19 @@ def mailabs(root_path, meta_files=None):
     speaker_regex = re.compile("by_book/(male|female)/(?P<speaker_name>[^/]+)/")
     if meta_files is None:
         csv_files = glob(root_path+"/**/metadata.csv", recursive=True)
-        folders = [os.path.dirname(f) for f in csv_files]
     else:
         csv_files = meta_files
-        folders = [f.strip().split("by_book")[1][1:] for f in csv_files]
         # meta_files = [f.strip() for f in meta_files.split(",")]
     items = []
-    for idx, csv_file in enumerate(csv_files):
+    for csv_file in csv_files:
+        txt_file = os.path.join(root_path, csv_file)
+        folder = os.path.dirname(txt_file)
         # determine speaker based on folder structure...
-        speaker_name_match = speaker_regex.search(csv_file)
+        speaker_name_match = speaker_regex.search(txt_file)
         if speaker_name_match is None:
             continue
         speaker_name = speaker_name_match.group("speaker_name")
         print(" | > {}".format(csv_file))
-        folder = folders[idx]
-        txt_file = os.path.join(root_path, csv_file)
         with open(txt_file, 'r') as ttf:
             for line in ttf:
                 cols = line.split('|')
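A quick illustration of the `speaker_regex` used above, which derives the speaker name from the M-AILABS `by_book/<gender>/<speaker>/...` folder layout; the example path is made up.

```python
# Demonstration of the speaker-name extraction used by the mailabs preprocessor.
import re

speaker_regex = re.compile("by_book/(male|female)/(?P<speaker_name>[^/]+)/")
txt_file = "/data/mailabs/en_US/by_book/female/mary_ann/northandsouth/metadata.csv"
m = speaker_regex.search(txt_file)
print(m.group("speaker_name"))  # -> "mary_ann"
```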

View File

@@ -9,7 +9,7 @@ import torch.distributed as dist
 from torch.utils.data.sampler import Sampler
 from torch.autograd import Variable
 from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
-from utils.generic_utils import load_config, create_experiment_folder
+from TTS.utils.generic_utils import load_config, create_experiment_folder
 class DistributedSampler(Sampler):

View File

@@ -108,19 +108,19 @@ class LocationLayer(nn.Module):
 class Attention(nn.Module):
     # Pylint gets confused by PyTorch conventions here
     #pylint: disable=attribute-defined-outside-init
-    def __init__(self, attention_rnn_dim, embedding_dim, attention_dim,
+    def __init__(self, query_dim, embedding_dim, attention_dim,
                  location_attention, attention_location_n_filters,
                  attention_location_kernel_size, windowing, norm, forward_attn,
                  trans_agent, forward_attn_mask):
         super(Attention, self).__init__()
         self.query_layer = Linear(
-            attention_rnn_dim, attention_dim, bias=False, init_gain='tanh')
+            query_dim, attention_dim, bias=False, init_gain='tanh')
         self.inputs_layer = Linear(
             embedding_dim, attention_dim, bias=False, init_gain='tanh')
         self.v = Linear(attention_dim, 1, bias=True)
         if trans_agent:
             self.ta = nn.Linear(
-                attention_rnn_dim + embedding_dim, 1, bias=True)
+                query_dim + embedding_dim, 1, bias=True)
         if location_attention:
             self.location_layer = LocationLayer(
                 attention_dim,
@@ -201,16 +201,17 @@ class Attention(nn.Module):
         self.win_idx = torch.argmax(attention, 1).long()[0].item()
         return attention
-    def apply_forward_attention(self, inputs, alignment, query):
+    def apply_forward_attention(self, alignment):
         # forward attention
-        prev_alpha = F.pad(self.alpha[:, :-1].clone(),
-                           (1, 0, 0, 0)).to(inputs.device)
+        fwd_shifted_alpha = F.pad(self.alpha[:, :-1].clone().to(alignment.device),
+                                  (1, 0, 0, 0))
         # compute transition potentials
-        alpha = (((1 - self.u) * self.alpha.clone().to(inputs.device) +
-                  self.u * prev_alpha) + 1e-8) * alignment
+        alpha = ((1 - self.u) * self.alpha
+                 + self.u * fwd_shifted_alpha
+                 + 1e-8) * alignment
         # force incremental alignment
         if not self.training and self.forward_attn_mask:
-            _, n = prev_alpha.max(1)
+            _, n = fwd_shifted_alpha.max(1)
             val, n2 = alpha.max(1)
             for b in range(alignment.shape[0]):
                 alpha[b, n[b] + 3:] = 0
@@ -220,30 +221,24 @@ class Attention(nn.Module):
                 alpha[b,
                       (n[b] - 2
                        )] = 0.01 * val[b]  # smoothing factor for the prev step
-        # compute attention weights
-        self.alpha = alpha / alpha.sum(dim=1).unsqueeze(1)
-        # compute context
-        context = torch.bmm(self.alpha.unsqueeze(1), inputs)
-        context = context.squeeze(1)
-        # compute transition agent
-        if self.trans_agent:
-            ta_input = torch.cat([context, query.squeeze(1)], dim=-1)
-            self.u = torch.sigmoid(self.ta(ta_input))
-        return context, self.alpha
+        # renormalize attention weights
+        alpha = alpha / alpha.sum(dim=1, keepdim=True)
+        return alpha
-    def forward(self, attention_hidden_state, inputs, processed_inputs, mask):
+    def forward(self, query, inputs, processed_inputs, mask):
         if self.location_attention:
-            attention, processed_query = self.get_location_attention(
-                attention_hidden_state, processed_inputs)
+            attention, _ = self.get_location_attention(
+                query, processed_inputs)
         else:
-            attention, processed_query = self.get_attention(
-                attention_hidden_state, processed_inputs)
+            attention, _ = self.get_attention(
+                query, processed_inputs)
         # apply masking
         if mask is not None:
-            attention.data.masked_fill_(1 - mask, self._mask_value)
+            attention.data.masked_fill_(~mask, self._mask_value)
         # apply windowing - only in eval mode
         if not self.training and self.windowing:
             attention = self.apply_windowing(attention, inputs)
         # normalize attention values
         if self.norm == "softmax":
             alignment = torch.softmax(attention, dim=-1)
@@ -252,15 +247,22 @@ class Attention(nn.Module):
                                       attention).sum(
                                           dim=1, keepdim=True)
         else:
-            raise RuntimeError("Unknown value for attention norm type")
+            raise ValueError("Unknown value for attention norm type")
         if self.location_attention:
             self.update_location_attention(alignment)
         # apply forward attention if enabled
         if self.forward_attn:
-            context, self.attention_weights = self.apply_forward_attention(
-                inputs, alignment, attention_hidden_state)
-        else:
-            context = torch.bmm(alignment.unsqueeze(1), inputs)
-            context = context.squeeze(1)
-            self.attention_weights = alignment
+            alignment = self.apply_forward_attention(alignment)
+            self.alpha = alignment
+        context = torch.bmm(alignment.unsqueeze(1), inputs)
+        context = context.squeeze(1)
+        self.attention_weights = alignment
+        # compute transition agent
+        if self.forward_attn and self.trans_agent:
+            ta_input = torch.cat([context, query.squeeze(1)], dim=-1)
+            self.u = torch.sigmoid(self.ta(ta_input))
         return context
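For clarity, here is the forward-attention update that the refactored `apply_forward_attention()` now performs, written in isolation (it returns only the renormalized alpha; the context vector and the transition-agent update move into `forward()`). This is a standalone sketch following the tensors in the diff above, not the module itself.

```python
# Standalone sketch of one forward-attention step as implemented in the new code.
import torch
import torch.nn.functional as F

def forward_attention_step(alpha_prev, alignment, u):
    """alpha_prev, alignment: [B, T]; u: transition probability, shape [B, 1] or scalar."""
    # alpha shifted one position to the right (mass coming from the previous encoder step)
    fwd_shifted_alpha = F.pad(alpha_prev[:, :-1], (1, 0, 0, 0))
    # transition potentials: stay with prob (1 - u), move forward with prob u
    alpha = ((1 - u) * alpha_prev + u * fwd_shifted_alpha + 1e-8) * alignment
    # renormalize so attention weights sum to one
    return alpha / alpha.sum(dim=1, keepdim=True)
```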

View File

@@ -1,6 +1,6 @@
 from torch import nn
 from torch.nn import functional
-from utils.generic_utils import sequence_mask
+from TTS.utils.generic_utils import sequence_mask
 class L1LossMasked(nn.Module):

View File

@@ -135,9 +135,6 @@ class CBHG(nn.Module):
         ])
         # max pooling of conv bank, with padding
         # TODO: try average pooling OR larger kernel size
-        self.max_pool1d = nn.Sequential(
-            nn.ConstantPad1d([0, 1], value=0),
-            nn.MaxPool1d(kernel_size=2, stride=1, padding=0))
         out_features = [K * conv_bank_features] + conv_projections[:-1]
         activations = [self.relu] * (len(conv_projections) - 1)
         activations += [None]
@@ -186,7 +183,6 @@ class CBHG(nn.Module):
         outs.append(out)
         x = torch.cat(outs, dim=1)
         assert x.size(1) == self.conv_bank_features * len(self.conv1d_banks)
-        x = self.max_pool1d(x)
         for conv1d in self.conv1d_projections:
             x = conv1d(x)
         # (B, T_in, hid_feature)
@@ -270,59 +266,57 @@ class Decoder(nn.Module):
         memory_size (int): size of the past window. if <= 0 memory_size = r
         TODO: arguments
     """
     # Pylint gets confused by PyTorch conventions here
     #pylint: disable=attribute-defined-outside-init
     def __init__(self, in_features, memory_dim, r, memory_size, attn_windowing,
                  attn_norm, prenet_type, prenet_dropout, forward_attn,
-                 trans_agent, forward_attn_mask, location_attn, separate_stopnet):
+                 trans_agent, forward_attn_mask, location_attn,
+                 separate_stopnet):
         super(Decoder, self).__init__()
+        self.r_init = r
         self.r = r
         self.in_features = in_features
         self.max_decoder_steps = 500
+        self.use_memory_queue = memory_size > 0
         self.memory_size = memory_size if memory_size > 0 else r
         self.memory_dim = memory_dim
         self.separate_stopnet = separate_stopnet
+        self.query_dim = 256
         # memory -> |Prenet| -> processed_memory
         self.prenet = Prenet(
-            memory_dim * self.memory_size,
+            memory_dim * self.memory_size if self.use_memory_queue else memory_dim,
            prenet_type,
            prenet_dropout,
            out_features=[256, 128])
        # processed_inputs, processed_memory -> |Attention| -> Attention, attention, RNN_State
-        self.attention_rnn = nn.GRUCell(in_features + 128, 256)
-        self.attention_layer = Attention(attention_rnn_dim=256,
-                                         embedding_dim=in_features,
-                                         attention_dim=128,
-                                         location_attention=location_attn,
-                                         attention_location_n_filters=32,
-                                         attention_location_kernel_size=31,
-                                         windowing=attn_windowing,
-                                         norm=attn_norm,
-                                         forward_attn=forward_attn,
-                                         trans_agent=trans_agent,
-                                         forward_attn_mask=forward_attn_mask)
+        # attention_rnn generates queries for the attention mechanism
+        self.attention_rnn = nn.GRUCell(in_features + 128, self.query_dim)
+        self.attention = Attention(query_dim=self.query_dim,
+                                   embedding_dim=in_features,
+                                   attention_dim=128,
+                                   location_attention=location_attn,
+                                   attention_location_n_filters=32,
+                                   attention_location_kernel_size=31,
+                                   windowing=attn_windowing,
+                                   norm=attn_norm,
+                                   forward_attn=forward_attn,
+                                   trans_agent=trans_agent,
+                                   forward_attn_mask=forward_attn_mask)
         # (processed_memory | attention context) -> |Linear| -> decoder_RNN_input
         self.project_to_decoder_in = nn.Linear(256 + in_features, 256)
         # decoder_RNN_input -> |RNN| -> RNN_state
         self.decoder_rnns = nn.ModuleList(
             [nn.GRUCell(256, 256) for _ in range(2)])
         # RNN_state -> |Linear| -> mel_spec
-        self.proj_to_mel = nn.Linear(256, memory_dim * r)
-        # learn init values instead of zero init.
-        self.attention_rnn_init = nn.Embedding(1, 256)
-        self.memory_init = nn.Embedding(1, self.memory_size * memory_dim)
-        self.decoder_rnn_inits = nn.Embedding(2, 256)
-        self.stopnet = StopNet(256 + memory_dim * r)
-        # self.init_layers()
+        self.proj_to_mel = nn.Linear(256, memory_dim * self.r_init)
+        self.stopnet = StopNet(256 + memory_dim * self.r_init)
-    def init_layers(self):
-        torch.nn.init.xavier_uniform_(
-            self.project_to_decoder_in.weight,
-            gain=torch.nn.init.calculate_gain('linear'))
-        torch.nn.init.xavier_uniform_(
-            self.proj_to_mel.weight,
-            gain=torch.nn.init.calculate_gain('linear'))
+    def set_r(self, new_r):
+        self.r = new_r
     def _reshape_memory(self, memory):
         """
@@ -344,21 +338,19 @@ class Decoder(nn.Module):
         B = inputs.size(0)
         T = inputs.size(1)
         # go frame as zeros matrix
-        self.memory_input = self.memory_init(inputs.data.new_zeros(B).long())
+        if self.use_memory_queue:
+            self.memory_input = torch.zeros(B, self.memory_dim * self.memory_size, device=inputs.device)
+        else:
+            self.memory_input = torch.zeros(B, self.memory_dim, device=inputs.device)
         # decoder states
-        self.attention_rnn_hidden = self.attention_rnn_init(
-            inputs.data.new_zeros(B).long())
+        self.attention_rnn_hidden = torch.zeros(B, 256, device=inputs.device)
         self.decoder_rnn_hiddens = [
-            self.decoder_rnn_inits(inputs.data.new_tensor([idx] * B).long())
+            torch.zeros(B, 256, device=inputs.device)
             for idx in range(len(self.decoder_rnns))
         ]
-        self.current_context_vec = inputs.data.new(B, self.in_features).zero_()
-        # attention states
-        self.attention = inputs.data.new(B, T).zero_()
-        self.attention_cum = inputs.data.new(B, T).zero_()
+        self.context_vec = inputs.data.new(B, self.in_features).zero_()
         # cache attention inputs
-        self.processed_inputs = self.attention_layer.inputs_layer(inputs)
+        self.processed_inputs = self.attention.inputs_layer(inputs)
     def _parse_outputs(self, outputs, attentions, stop_tokens):
         # Back to batch first
@@ -371,12 +363,15 @@ class Decoder(nn.Module):
         # Prenet
         processed_memory = self.prenet(self.memory_input)
         # Attention RNN
-        self.attention_rnn_hidden = self.attention_rnn(torch.cat((processed_memory, self.current_context_vec), -1), self.attention_rnn_hidden)
-        self.current_context_vec = self.attention_layer(self.attention_rnn_hidden, inputs, self.processed_inputs, mask)
+        self.attention_rnn_hidden = self.attention_rnn(
+            torch.cat((processed_memory, self.context_vec), -1),
+            self.attention_rnn_hidden)
+        self.context_vec = self.attention(
+            self.attention_rnn_hidden, inputs, self.processed_inputs, mask)
         # Concat RNN output and attention context vector
         decoder_input = self.project_to_decoder_in(
-            torch.cat((self.attention_rnn_hidden, self.current_context_vec),
-                      -1))
+            torch.cat((self.attention_rnn_hidden, self.context_vec), -1))
         # Pass through the decoder RNNs
         for idx in range(len(self.decoder_rnns)):
             self.decoder_rnn_hiddens[idx] = self.decoder_rnns[idx](
@@ -384,28 +379,33 @@ class Decoder(nn.Module):
             # Residual connection
             decoder_input = self.decoder_rnn_hiddens[idx] + decoder_input
         decoder_output = decoder_input
+        del decoder_input
         # predict mel vectors from decoder vectors
         output = self.proj_to_mel(decoder_output)
-        output = torch.sigmoid(output)
+        # output = torch.sigmoid(output)
         # predict stop token
         stopnet_input = torch.cat([decoder_output, output], -1)
+        del decoder_output
         if self.separate_stopnet:
             stop_token = self.stopnet(stopnet_input.detach())
         else:
             stop_token = self.stopnet(stopnet_input)
-        return output, stop_token, self.attention_layer.attention_weights
+        output = output[:, : self.r * self.memory_dim]
+        return output, stop_token, self.attention.attention_weights
-    def _update_memory_queue(self, new_memory):
-        if self.memory_size > 0 and new_memory.shape[-1] < self.memory_size:
-            self.memory_input = torch.cat([
-                self.memory_input[:, self.r * self.memory_dim:].clone(),
-                new_memory
-            ],
-                                          dim=-1)
+    def _update_memory_input(self, new_memory):
+        if self.use_memory_queue:
+            if self.memory_size > self.r:
+                # memory queue size is larger than number of frames per decoder iter
+                self.memory_input = torch.cat([
+                    new_memory, self.memory_input[:, :(
+                        self.memory_size - self.r) * self.memory_dim].clone()
+                ], dim=-1)
+            else:
+                # memory queue size smaller than number of frames per decoder iter
+                self.memory_input = new_memory[:, :self.memory_size * self.memory_dim]
         else:
-            self.memory_input = new_memory
+            # use only the last frame prediction
+            self.memory_input = new_memory[:, :self.memory_dim]
     def forward(self, inputs, memory, mask):
         """
@@ -427,11 +427,11 @@ class Decoder(nn.Module):
         stop_tokens = []
         t = 0
         self._init_states(inputs)
-        self.attention_layer.init_states(inputs)
+        self.attention.init_states(inputs)
         while len(outputs) < memory.size(0):
             if t > 0:
                 new_memory = memory[t - 1]
-                self._update_memory_queue(new_memory)
+                self._update_memory_input(new_memory)
             output, stop_token, attention = self.decode(inputs, mask)
             outputs += [output]
             attentions += [attention]
@@ -453,12 +453,12 @@ class Decoder(nn.Module):
         stop_tokens = []
         t = 0
         self._init_states(inputs)
-        self.attention_layer.init_win_idx()
-        self.attention_layer.init_states(inputs)
+        self.attention.init_win_idx()
+        self.attention.init_states(inputs)
         while True:
             if t > 0:
                 new_memory = outputs[-1]
-                self._update_memory_queue(new_memory)
+                self._update_memory_input(new_memory)
             output, stop_token, attention = self.decode(inputs, None)
             stop_token = torch.sigmoid(stop_token.data)
             outputs += [output]
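A hedged summary of the new memory handling in this decoder, written as a standalone function: with `use_memory_queue` enabled, a queue of the last `memory_size` predicted frames feeds the prenet; otherwise only the last predicted frame is used (which is what `"memory_size": -1` in the updated config selects). Names are simplified relative to the class methods above.

```python
# Sketch of the _update_memory_input() logic as a pure function.
import torch

def update_memory_input(memory_input, new_memory, memory_dim, memory_size, r,
                        use_memory_queue):
    if use_memory_queue:
        if memory_size > r:
            # keep the newest frames, drop the oldest ones from the queue
            keep = (memory_size - r) * memory_dim
            return torch.cat([new_memory, memory_input[:, :keep]], dim=-1)
        # queue not larger than r: the queue is just the newest frames
        return new_memory[:, :memory_size * memory_dim]
    # no queue: autoregress on the last predicted frame only
    return new_memory[:, :memory_dim]
```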

View File

@@ -104,7 +104,7 @@ class Decoder(nn.Module):
         self.r = r
         self.encoder_embedding_dim = in_features
         self.separate_stopnet = separate_stopnet
-        self.attention_rnn_dim = 1024
+        self.query_dim = 1024
         self.decoder_rnn_dim = 1024
         self.prenet_dim = 256
         self.max_decoder_steps = 1000
@@ -117,21 +117,21 @@ class Decoder(nn.Module):
                              [self.prenet_dim, self.prenet_dim], bias=False)
         self.attention_rnn = nn.LSTMCell(self.prenet_dim + in_features,
-                                         self.attention_rnn_dim)
-        self.attention_layer = Attention(attention_rnn_dim=self.attention_rnn_dim,
+                                         self.query_dim)
+        self.attention = Attention(query_dim=self.query_dim,
                                    embedding_dim=in_features,
                                    attention_dim=128,
                                    location_attention=location_attn,
                                    attention_location_n_filters=32,
                                    attention_location_kernel_size=31,
                                    windowing=attn_win,
                                    norm=attn_norm,
                                    forward_attn=forward_attn,
                                    trans_agent=trans_agent,
                                    forward_attn_mask=forward_attn_mask)
-        self.decoder_rnn = nn.LSTMCell(self.attention_rnn_dim + in_features,
+        self.decoder_rnn = nn.LSTMCell(self.query_dim + in_features,
                                        self.decoder_rnn_dim, 1)
         self.linear_projection = Linear(self.decoder_rnn_dim + in_features,
@@ -145,7 +145,7 @@ class Decoder(nn.Module):
                                    bias=True,
                                    init_gain='sigmoid'))
-        self.attention_rnn_init = nn.Embedding(1, self.attention_rnn_dim)
+        self.attention_rnn_init = nn.Embedding(1, self.query_dim)
         self.go_frame_init = nn.Embedding(1, self.mel_channels * r)
         self.decoder_rnn_inits = nn.Embedding(1, self.decoder_rnn_dim)
         self.memory_truncated = None
@@ -160,10 +160,10 @@ class Decoder(nn.Module):
         # T = inputs.size(1)
         if not keep_states:
-            self.attention_hidden = self.attention_rnn_init(
+            self.query = self.attention_rnn_init(
                 inputs.data.new_zeros(B).long())
-            self.attention_cell = Variable(
-                inputs.data.new(B, self.attention_rnn_dim).zero_())
+            self.attention_rnn_cell_state = Variable(
+                inputs.data.new(B, self.query_dim).zero_())
             self.decoder_hidden = self.decoder_rnn_inits(
                 inputs.data.new_zeros(B).long())
@@ -174,7 +174,7 @@ class Decoder(nn.Module):
             inputs.data.new(B, self.encoder_embedding_dim).zero_())
         self.inputs = inputs
-        self.processed_inputs = self.attention_layer.inputs_layer(inputs)
+        self.processed_inputs = self.attention.inputs_layer(inputs)
         self.mask = mask
     def _reshape_memory(self, memories):
@@ -193,18 +193,18 @@ class Decoder(nn.Module):
         return outputs, stop_tokens, alignments
     def decode(self, memory):
-        cell_input = torch.cat((memory, self.context), -1)
-        self.attention_hidden, self.attention_cell = self.attention_rnn(
-            cell_input, (self.attention_hidden, self.attention_cell))
-        self.attention_hidden = F.dropout(
-            self.attention_hidden, self.p_attention_dropout, self.training)
-        self.attention_cell = F.dropout(
-            self.attention_cell, self.p_attention_dropout, self.training)
-        self.context = self.attention_layer(self.attention_hidden, self.inputs,
+        query_input = torch.cat((memory, self.context), -1)
+        self.query, self.attention_rnn_cell_state = self.attention_rnn(
+            query_input, (self.query, self.attention_rnn_cell_state))
+        self.query = F.dropout(
+            self.query, self.p_attention_dropout, self.training)
+        self.attention_rnn_cell_state = F.dropout(
+            self.attention_rnn_cell_state, self.p_attention_dropout, self.training)
+        self.context = self.attention(self.query, self.inputs,
                                       self.processed_inputs, self.mask)
-        memory = torch.cat((self.attention_hidden, self.context), -1)
+        memory = torch.cat((self.query, self.context), -1)
         self.decoder_hidden, self.decoder_cell = self.decoder_rnn(
             memory, (self.decoder_hidden, self.decoder_cell))
         self.decoder_hidden = F.dropout(self.decoder_hidden,
@@ -223,7 +223,7 @@ class Decoder(nn.Module):
             stop_token = self.stopnet(stopnet_input.detach())
         else:
             stop_token = self.stopnet(stopnet_input)
-        return decoder_output, stop_token, self.attention_layer.attention_weights
+        return decoder_output, stop_token, self.attention.attention_weights
     def forward(self, inputs, memories, mask):
         memory = self.get_go_frame(inputs).unsqueeze(0)
@@ -232,7 +232,7 @@ class Decoder(nn.Module):
         memories = self.prenet(memories)
         self._init_states(inputs, mask=mask)
-        self.attention_layer.init_states(inputs)
+        self.attention.init_states(inputs)
         outputs, stop_tokens, alignments = [], [], []
         while len(outputs) < memories.size(0) - 1:
@@ -251,8 +251,8 @@ class Decoder(nn.Module):
         memory = self.get_go_frame(inputs)
         self._init_states(inputs, mask=None)
-        self.attention_layer.init_win_idx()
-        self.attention_layer.init_states(inputs)
+        self.attention.init_win_idx()
+        self.attention.init_states(inputs)
         outputs, stop_tokens, alignments, t = [], [], [], 0
         stop_flags = [True, False, False]
@@ -295,8 +295,8 @@ class Decoder(nn.Module):
         else:
             self._init_states(inputs, mask=None, keep_states=True)
-        self.attention_layer.init_win_idx()
-        self.attention_layer.init_states(inputs)
+        self.attention.init_win_idx()
+        self.attention.init_states(inputs)
         outputs, stop_tokens, alignments, t = [], [], [], 0
         stop_flags = [True, False, False]
         stop_count = 0

View File

@@ -1,7 +1,7 @@
 # coding: utf-8
 from torch import nn
-from layers.tacotron import Encoder, Decoder, PostCBHG
-from utils.generic_utils import sequence_mask
+from TTS.layers.tacotron import Encoder, Decoder, PostCBHG
+from TTS.utils.generic_utils import sequence_mask
 class Tacotron(nn.Module):
@@ -36,10 +36,8 @@ class Tacotron(nn.Module):
                                forward_attn, trans_agent, forward_attn_mask,
                                location_attn, separate_stopnet)
         self.postnet = PostCBHG(mel_dim)
-        self.last_linear = nn.Sequential(
-            nn.Linear(self.postnet.cbhg.gru_features * 2, linear_dim),
-            nn.Sigmoid())
+        self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2, linear_dim)
     def forward(self, characters, text_lengths, mel_specs, speaker_ids=None):
         B = characters.size(0)
         mask = sequence_mask(text_lengths).to(characters.device)
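Dropping the `nn.Sigmoid()` from `last_linear` here (and in `TacotronGST` below), like the commented-out `torch.sigmoid(output)` in the decoder above, is plausibly tied to the switch to symmetric spectrogram normalization in `config.json`: once targets live in roughly [-4, 4], a head bounded to (0, 1) cannot reach them. A minimal before/after sketch with made-up dimensions:

```python
# Illustrative comparison only; gru_features and linear_dim are arbitrary here.
import torch
from torch import nn

gru_features, linear_dim = 128, 1025

old_last_linear = nn.Sequential(                 # previous head: outputs squashed into (0, 1)
    nn.Linear(gru_features * 2, linear_dim),
    nn.Sigmoid())
new_last_linear = nn.Linear(gru_features * 2, linear_dim)  # new head: unbounded outputs

x = torch.randn(4, gru_features * 2)
print(old_last_linear(x).min() >= 0)   # True: cannot produce negative (symmetric) targets
print(new_last_linear(x).min())        # can be negative
```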

View File

@@ -1,7 +1,7 @@
 from math import sqrt
 from torch import nn
-from layers.tacotron2 import Encoder, Decoder, Postnet
-from utils.generic_utils import sequence_mask
+from TTS.layers.tacotron2 import Encoder, Decoder, Postnet
+from TTS.utils.generic_utils import sequence_mask
 # TODO: match function arguments with tacotron

View File

@ -1,8 +1,8 @@
# coding: utf-8 # coding: utf-8
from torch import nn from torch import nn
from layers.tacotron import Encoder, Decoder, PostCBHG from TTS.layers.tacotron import Encoder, Decoder, PostCBHG
from layers.gst_layers import GST from TTS.layers.gst_layers import GST
from utils.generic_utils import sequence_mask from TTS.utils.generic_utils import sequence_mask
class TacotronGST(nn.Module): class TacotronGST(nn.Module):
@ -38,9 +38,8 @@ class TacotronGST(nn.Module):
forward_attn, trans_agent, forward_attn_mask, forward_attn, trans_agent, forward_attn_mask,
location_attn, separate_stopnet) location_attn, separate_stopnet)
self.postnet = PostCBHG(mel_dim) self.postnet = PostCBHG(mel_dim)
self.last_linear = nn.Sequential( self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2, linear_dim)
nn.Linear(self.postnet.cbhg.gru_features * 2, linear_dim),
nn.Sigmoid())
def forward(self, characters, text_lengths, mel_specs, speaker_ids=None): def forward(self, characters, text_lengths, mel_specs, speaker_ids=None):
B = characters.size(0) B = characters.size(0)

View File

@ -19,10 +19,8 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 1,
"metadata": { "metadata": {},
"collapsed": true
},
"outputs": [], "outputs": [],
"source": [ "source": [
"TTS_PATH = \"/home/erogol/projects/\"\n", "TTS_PATH = \"/home/erogol/projects/\"\n",
@ -31,12 +29,28 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 2,
"metadata": { "metadata": {
"collapsed": true,
"scrolled": true "scrolled": true
}, },
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Populating the interactive namespace from numpy and matplotlib\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/erogol/miniconda3/lib/python3.7/site-packages/IPython/core/magics/pylab.py:160: UserWarning: pylab import has clobbered these variables: ['plt']\n",
"`%matplotlib` prevents importing * from pylab and numpy\n",
" \"\\n`%matplotlib` prevents importing * from pylab and numpy\"\n"
]
}
],
"source": [ "source": [
"%load_ext autoreload\n", "%load_ext autoreload\n",
"%autoreload 2\n", "%autoreload 2\n",
@ -78,10 +92,8 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 3,
"metadata": { "metadata": {},
"collapsed": true
},
"outputs": [], "outputs": [],
"source": [ "source": [
"def tts(model, text, CONFIG, use_cuda, ap, use_gl, speaker_id=None, figures=True):\n", "def tts(model, text, CONFIG, use_cuda, ap, use_gl, speaker_id=None, figures=True):\n",
@ -105,14 +117,25 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 9,
"metadata": { "metadata": {},
"collapsed": true "outputs": [
}, {
"outputs": [], "ename": "FileNotFoundError",
"evalue": "[Errno 2] No such file or directory: '/media/erogol/data_ssd/Data/models/wavernn/mozilla/mozilla-May24-4763/config.json'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-9-3306702a6bbc>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mVOCODER_MODEL_PATH\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"/media/erogol/data_ssd/Data/models/wavernn/mozilla/mozilla-May24-4763/model_checkpoints/best_model.pth.tar\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0mVOCODER_CONFIG_PATH\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"/media/erogol/data_ssd/Data/models/wavernn/mozilla/mozilla-May24-4763/config.json\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0mVOCODER_CONFIG\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload_config\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mVOCODER_CONFIG_PATH\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 10\u001b[0m \u001b[0muse_cuda\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/projects/TTS/tts_namespace/TTS/utils/generic_utils.py\u001b[0m in \u001b[0;36mload_config\u001b[0;34m(config_path)\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mload_config\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconfig_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[0mconfig\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mAttrDict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 22\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconfig_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"r\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 23\u001b[0m \u001b[0minput_str\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[0minput_str\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msub\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mr'\\\\\\n'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m''\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput_str\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/media/erogol/data_ssd/Data/models/wavernn/mozilla/mozilla-May24-4763/config.json'"
]
}
],
"source": [ "source": [
"# Set constants\n", "# Set constants\n",
"ROOT_PATH = '/media/erogol/data_ssd/Data/models/mozilla_models/4845/'\n", "ROOT_PATH = '/media/erogol/data_ssd/Models/libri_tts/5049/'\n",
"MODEL_PATH = ROOT_PATH + 'best_model.pth.tar'\n", "MODEL_PATH = ROOT_PATH + 'best_model.pth.tar'\n",
"CONFIG_PATH = ROOT_PATH + '/config.json'\n", "CONFIG_PATH = ROOT_PATH + '/config.json'\n",
"OUT_FOLDER = \"/home/erogol/Dropbox/AudioSamples/benchmark_samples/\"\n", "OUT_FOLDER = \"/home/erogol/Dropbox/AudioSamples/benchmark_samples/\"\n",
@ -136,9 +159,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {},
"collapsed": true
},
"outputs": [], "outputs": [],
"source": [ "source": [
"# LOAD TTS MODEL\n", "# LOAD TTS MODEL\n",
@ -169,9 +190,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {},
"collapsed": true
},
"outputs": [], "outputs": [],
"source": [ "source": [
"# LOAD WAVERNN\n", "# LOAD WAVERNN\n",
@ -211,12 +230,21 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 5,
"metadata": { "metadata": {},
"collapsed": true, "outputs": [
"scrolled": false {
}, "ename": "NameError",
"outputs": [], "evalue": "name 'model' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-5-e285d5bde9fb>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0meval\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdecoder\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax_decoder_steps\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m2000\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mspeaker_id\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0msentence\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"Bill got in the habit of asking himself “Is that thought true?” And if he wasnt absolutely certain it was, he just let it go.\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0malign\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstop_tokens\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwav\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msentence\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mCONFIG\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_cuda\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0map\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspeaker_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mspeaker_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_gl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0muse_gl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfigures\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mNameError\u001b[0m: name 'model' is not defined"
]
}
],
"source": [ "source": [
"model.eval()\n", "model.eval()\n",
"model.decoder.max_decoder_steps = 2000\n", "model.decoder.max_decoder_steps = 2000\n",
@ -227,12 +255,23 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 6,
"metadata": { "metadata": {
"collapsed": true,
"scrolled": true "scrolled": true
}, },
"outputs": [], "outputs": [
{
"ename": "NameError",
"evalue": "name 'model' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-6-621056ffa667>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0msentence\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"Be a voice, not an echo.\"\u001b[0m \u001b[0;31m# 'echo' is not in training set.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0malign\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstop_tokens\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwav\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msentence\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mCONFIG\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_cuda\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0map\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspeaker_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mspeaker_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_gl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0muse_gl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfigures\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mNameError\u001b[0m: name 'model' is not defined"
]
}
],
"source": [ "source": [
"sentence = \"Be a voice, not an echo.\" # 'echo' is not in training set. \n", "sentence = \"Be a voice, not an echo.\" # 'echo' is not in training set. \n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
@ -240,11 +279,21 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 7,
"metadata": { "metadata": {},
"collapsed": true "outputs": [
}, {
"outputs": [], "ename": "NameError",
"evalue": "name 'model' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-7-26967668a1a1>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0msentence\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"The human voice is the most perfect instrument of all.\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0malign\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstop_tokens\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwav\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msentence\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mCONFIG\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_cuda\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0map\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspeaker_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mspeaker_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_gl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0muse_gl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfigures\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mNameError\u001b[0m: name 'model' is not defined"
]
}
],
"source": [ "source": [
"sentence = \"The human voice is the most perfect instrument of all.\"\n", "sentence = \"The human voice is the most perfect instrument of all.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
@ -252,11 +301,21 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 8,
"metadata": { "metadata": {},
"collapsed": true "outputs": [
}, {
"outputs": [], "ename": "NameError",
"evalue": "name 'model' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-8-28cb5023e353>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0msentence\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"I'm sorry Dave. I'm afraid I can't do that.\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0malign\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstop_tokens\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwav\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msentence\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mCONFIG\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_cuda\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0map\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspeaker_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mspeaker_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_gl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0muse_gl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfigures\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mNameError\u001b[0m: name 'model' is not defined"
]
}
],
"source": [ "source": [
"sentence = \"I'm sorry Dave. I'm afraid I can't do that.\"\n", "sentence = \"I'm sorry Dave. I'm afraid I can't do that.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)"
@ -267,6 +326,9 @@
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {
"collapsed": true, "collapsed": true,
"jupyter": {
"outputs_hidden": true
},
"scrolled": true "scrolled": true
}, },
"outputs": [], "outputs": [],
@ -286,7 +348,10 @@
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {
"collapsed": true "collapsed": true,
"jupyter": {
"outputs_hidden": true
}
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -298,7 +363,10 @@
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {
"collapsed": true "collapsed": true,
"jupyter": {
"outputs_hidden": true
}
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -310,7 +378,10 @@
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {
"collapsed": true "collapsed": true,
"jupyter": {
"outputs_hidden": true
}
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -322,7 +393,10 @@
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {
"collapsed": true "collapsed": true,
"jupyter": {
"outputs_hidden": true
}
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -334,7 +408,10 @@
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {
"collapsed": true "collapsed": true,
"jupyter": {
"outputs_hidden": true
}
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -353,7 +430,10 @@
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {
"collapsed": true "collapsed": true,
"jupyter": {
"outputs_hidden": true
}
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -365,7 +445,10 @@
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {
"collapsed": true "collapsed": true,
"jupyter": {
"outputs_hidden": true
}
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -377,7 +460,10 @@
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {
"collapsed": true "collapsed": true,
"jupyter": {
"outputs_hidden": true
}
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -389,7 +475,10 @@
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {
"collapsed": true "collapsed": true,
"jupyter": {
"outputs_hidden": true
}
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -402,7 +491,9 @@
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {
"collapsed": true, "collapsed": true,
"scrolled": false "jupyter": {
"outputs_hidden": true
}
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -415,7 +506,9 @@
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {
"collapsed": true, "collapsed": true,
"scrolled": false "jupyter": {
"outputs_hidden": true
}
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -427,7 +520,10 @@
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {
"collapsed": true "collapsed": true,
"jupyter": {
"outputs_hidden": true
}
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -439,7 +535,10 @@
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {
"collapsed": true "collapsed": true,
"jupyter": {
"outputs_hidden": true
}
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -451,7 +550,10 @@
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {
"collapsed": true "collapsed": true,
"jupyter": {
"outputs_hidden": true
}
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -462,9 +564,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {},
"collapsed": true
},
"outputs": [], "outputs": [],
"source": [ "source": [
"sentence = \"Eren, how are you?\"\n", "sentence = \"Eren, how are you?\"\n",
@ -482,7 +582,10 @@
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {
"collapsed": true "collapsed": true,
"jupyter": {
"outputs_hidden": true
}
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -494,7 +597,10 @@
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {
"collapsed": true "collapsed": true,
"jupyter": {
"outputs_hidden": true
}
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -506,7 +612,10 @@
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {
"collapsed": true "collapsed": true,
"jupyter": {
"outputs_hidden": true
}
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -518,7 +627,10 @@
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {
"collapsed": true "collapsed": true,
"jupyter": {
"outputs_hidden": true
}
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -531,6 +643,9 @@
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {
"collapsed": true, "collapsed": true,
"jupyter": {
"outputs_hidden": true
},
"scrolled": true "scrolled": true
}, },
"outputs": [], "outputs": [],
@ -543,7 +658,10 @@
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {
"collapsed": true "collapsed": true,
"jupyter": {
"outputs_hidden": true
}
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -556,7 +674,10 @@
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {
"collapsed": true "collapsed": true,
"jupyter": {
"outputs_hidden": true
}
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -566,9 +687,9 @@
], ],
"metadata": { "metadata": {
"kernelspec": { "kernelspec": {
"display_name": "Python 3(mztts)", "display_name": "Python 3",
"language": "python", "language": "python",
"name": "mztts" "name": "python3"
}, },
"language_info": { "language_info": {
"codemirror_mode": { "codemirror_mode": {
@ -580,9 +701,9 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.6.8" "version": "3.7.3"
} }
}, },
"nbformat": 4, "nbformat": 4,
"nbformat_minor": 2 "nbformat_minor": 4
} }

View File

@ -105,10 +105,10 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"from utils.text.symbols import symbols, phonemes\n", "from TTS.utils.text.symbols import symbols, phonemes\n",
"from utils.generic_utils import sequence_mask\n", "from TTS.utils.generic_utils import sequence_mask\n",
"from layers.losses import L1LossMasked\n", "from TTS.layers.losses import L1LossMasked\n",
"from utils.text.symbols import symbols, phonemes\n", "from TTS.utils.text.symbols import symbols, phonemes\n",
"\n", "\n",
"# load the model\n", "# load the model\n",
"num_chars = len(phonemes) if C.use_phonemes else len(symbols)\n", "num_chars = len(phonemes) if C.use_phonemes else len(symbols)\n",

0
server/__init__.py Normal file
View File

View File

@ -1,12 +1,12 @@
{ {
"tts_path":"/media/erogol/data_ssd/Models/libri_tts/ljspeech-July-22-2019_10+45AM-ee706b5/", // tts model root folder "tts_path":"/media/erogol/data_ssd/Models/libri_tts/5049/", // tts model root folder
"tts_file":"best_model.pth.tar", // tts checkpoint file "tts_file":"best_model.pth.tar", // tts checkpoint file
"tts_config":"config.json", // tts config.json file "tts_config":"config.json", // tts config.json file
"tts_speakers": null, // json file listing speaker ids. null if no speaker embedding. "tts_speakers": null, // json file listing speaker ids. null if no speaker embedding.
"wavernn_lib_path": "/home/erogol/projects/", // Rootpath to wavernn project folder to be imported. If this is null, model uses GL for speech synthesis. "wavernn_lib_path": null, // Rootpath to wavernn project folder to be imported. If this is null, model uses GL for speech synthesis.
"wavernn_path":"/media/erogol/data_ssd/Models/wavernn/universal/4910/", // wavernn model root path "wavernn_path":null, // wavernn model root path
"wavernn_file":"best_model_16K.pth.tar", // wavernn checkpoint file name "wavernn_file":null, // wavernn checkpoint file name
"wavernn_config":"config_16K.json", // wavernn config file "wavernn_config": null, // wavernn config file
"is_wavernn_batched":true, "is_wavernn_batched":true,
"port": 5002, "port": 5002,
"use_cuda": true, "use_cuda": true,

View File

@ -1,7 +1,7 @@
#!flask/bin/python #!flask/bin/python
import argparse import argparse
from synthesizer import Synthesizer from synthesizer import Synthesizer
from utils.generic_utils import load_config from TTS.utils.generic_utils import load_config
from flask import Flask, request, render_template, send_file from flask import Flask, request, render_template, send_file
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()

View File

@ -5,10 +5,11 @@ import numpy as np
import torch import torch
import sys import sys
from utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor
from utils.generic_utils import load_config, setup_model from TTS.utils.generic_utils import load_config, setup_model
from utils.text import phoneme_to_sequence, phonemes, symbols, text_to_sequence, sequence_to_phoneme from TTS.utils.text import phonemes, symbols
from utils.speakers import load_speaker_mapping from TTS.utils.speakers import load_speaker_mapping
from TTS.utils.synthesis import *
import re import re
alphabets = r"([A-Za-z])" alphabets = r"([A-Za-z])"
@ -41,28 +42,25 @@ class Synthesizer(object):
self.ap = AudioProcessor(**self.tts_config.audio) self.ap = AudioProcessor(**self.tts_config.audio)
if self.use_phonemes: if self.use_phonemes:
self.input_size = len(phonemes) self.input_size = len(phonemes)
self.input_adapter = lambda sen: phoneme_to_sequence(sen, [self.tts_config.text_cleaner], self.tts_config.phoneme_language, self.tts_config.enable_eos_bos_chars)
else: else:
self.input_size = len(symbols) self.input_size = len(symbols)
self.input_adapter = lambda sen: text_to_sequence(sen, [self.tts_config.text_cleaner])
# load speakers # load speakers
if self.config.tts_speakers is not None: if self.config.tts_speakers is not None:
self.tts_speakers = load_speaker_mapping(os.path.join(model_path, self.config.tts_speakers)) self.tts_speakers = load_speaker_mapping(os.path.join(model_path, self.config.tts_speakers))
num_speakers = len(self.tts_speakers) num_speakers = len(self.tts_speakers)
else: else:
num_speakers = 0 num_speakers = 0
self.tts_model = setup_model(self.input_size, num_speakers=num_speakers , c=self.tts_config) self.tts_model = setup_model(self.input_size, num_speakers=num_speakers, c=self.tts_config)
# load model state # load model state
if use_cuda: cp = torch.load(self.model_file)
cp = torch.load(self.model_file)
else:
cp = torch.load(self.model_file, map_location=lambda storage, loc: storage)
# load the model # load the model
self.tts_model.load_state_dict(cp['model']) self.tts_model.load_state_dict(cp['model'])
if use_cuda: if use_cuda:
self.tts_model.cuda() self.tts_model.cuda()
self.tts_model.eval() self.tts_model.eval()
self.tts_model.decoder.max_decoder_steps = 3000 self.tts_model.decoder.max_decoder_steps = 3000
if 'r' in cp and self.tts_config.model in ["Tacotron", "TacotronGST"]:
self.tts_model.decoder.set_r(cp['r'])
def load_wavernn(self, lib_path, model_path, model_file, model_config, use_cuda): def load_wavernn(self, lib_path, model_path, model_file, model_config, use_cuda):
# TODO: set a function in wavernn code base for model setup and call it here. # TODO: set a function in wavernn code base for model setup and call it here.
@ -136,32 +134,27 @@ class Synthesizer(object):
def tts(self, text): def tts(self, text):
wavs = [] wavs = []
sens = self.split_into_sentences(text) sens = self.split_into_sentences(text)
print(sens)
if not sens: if not sens:
sens = [text+'.'] sens = [text+'.']
for sen in sens: for sen in sens:
if len(sen) < 3: # preprocess the given text
continue inputs = text_to_seqvec(sen, self.tts_config, self.use_cuda)
sen = sen.strip() # synthesize voice
print(sen) decoder_output, postnet_output, alignments, _ = run_model(
self.tts_model, inputs, self.tts_config, False, None, None)
# convert outputs to numpy
postnet_output, decoder_output, _ = parse_outputs(
postnet_output, decoder_output, alignments)
seq = np.array(self.input_adapter(sen)) if self.wavernn:
text_hat = sequence_to_phoneme(seq) postnet_output = postnet_output[0].data.cpu().numpy()
print(text_hat) wav = self.wavernn.generate(torch.FloatTensor(postnet_output.T).unsqueeze(0).cuda(), batched=self.config.is_wavernn_batched, target=11000, overlap=550)
else:
wav = inv_spectrogram(postnet_output, self.ap, self.tts_config)
# trim silence
wav = trim_silence(wav, self.ap)
chars_var = torch.from_numpy(seq).unsqueeze(0).long()
if self.use_cuda:
chars_var = chars_var.cuda()
decoder_out, postnet_out, alignments, stop_tokens = self.tts_model.inference(
chars_var)
postnet_out = postnet_out[0].data.cpu().numpy()
if self.tts_config.model == "Tacotron":
wav = self.ap.inv_spectrogram(postnet_out.T)
elif self.tts_config.model == "Tacotron2":
if self.wavernn:
wav = self.wavernn.generate(torch.FloatTensor(postnet_out.T).unsqueeze(0).cuda(), batched=self.config.is_wavernn_batched, target=11000, overlap=550)
else:
wav = self.ap.inv_mel_spectrogram(postnet_out.T)
wavs += list(wav) wavs += list(wav)
wavs += [0] * 10000 wavs += [0] * 10000

View File

@ -62,7 +62,15 @@ setup(
version=version, version=version,
url='https://github.com/mozilla/TTS', url='https://github.com/mozilla/TTS',
description='Text to Speech with Deep Learning', description='Text to Speech with Deep Learning',
packages=find_packages(), license='MPL-2.0',
package_dir={'': 'tts_namespace'},
packages=find_packages('tts_namespace'),
project_urls={
'Documentation': 'https://github.com/mozilla/TTS/wiki',
'Tracker': 'https://github.com/mozilla/TTS/issues',
'Repository': 'https://github.com/mozilla/TTS',
'Discussions': 'https://discourse.mozilla.org/c/tts',
},
cmdclass={ cmdclass={
'build_py': build_py, 'build_py': build_py,
'develop': develop, 'develop': develop,
@ -79,14 +87,10 @@ setup(
"flask", "flask",
# "lws", # "lws",
"tqdm", "tqdm",
"phonemizer",
"soundfile", "soundfile",
"phonemizer @ https://github.com/bootphon/phonemizer/tarball/master",
], ],
dependency_links=[ dependency_links=[
'http://github.com/bootphon/phonemizer/tarball/master#egg=phonemizer' "http://github.com/bootphon/phonemizer/tarball/master#egg=phonemizer-1.0.1"
], ]
extras_require={ )
"bin": [
"requests",
],
})

View File

@ -4,10 +4,10 @@ import argparse
import torch import torch
import string import string
from utils.synthesis import synthesis from TTS.utils.synthesis import synthesis
from utils.generic_utils import load_config, setup_model from TTS.utils.generic_utils import load_config, setup_model
from utils.text.symbols import symbols, phonemes from TTS.utils.text.symbols import symbols, phonemes
from utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor
def tts(model, def tts(model,

View File

@ -1,8 +1,8 @@
import unittest import unittest
import torch as T import torch as T
from utils.generic_utils import save_checkpoint, save_best_model from TTS.utils.generic_utils import save_checkpoint, save_best_model
from layers.tacotron import Prenet from TTS.layers.tacotron import Prenet
OUT_PATH = '/tmp/test.pth.tar' OUT_PATH = '/tmp/test.pth.tar'

View File

@ -1,5 +1,5 @@
{ {
"tts_path":"tests/outputs/", // tts model root folder "tts_path":"TTS/tests/outputs/", // tts model root folder
"tts_file":"checkpoint_10.pth.tar", // tts checkpoint file "tts_file":"checkpoint_10.pth.tar", // tts checkpoint file
"tts_config":"dummy_model_config.json", // tts config.json file "tts_config":"dummy_model_config.json", // tts config.json file
"tts_speakers": null, // json file listing speaker ids. null if no speaker embedding. "tts_speakers": null, // json file listing speaker ids. null if no speaker embedding.

View File

@ -1,7 +1,8 @@
import unittest import unittest
from utils.text import phonemes from TTS.utils.text import phonemes
class SymbolsTest(unittest.TestCase): class SymbolsTest(unittest.TestCase):
def test_uniqueness(self): def test_uniqueness(self): #pylint: disable=no-self-use
assert sorted(phonemes) == sorted(list(set(phonemes))) assert sorted(phonemes) == sorted(list(set(phonemes))), " {} vs {} ".format(len(phonemes), len(set(phonemes)))

View File

@ -1,9 +1,9 @@
import os import os
import unittest import unittest
from tests import get_tests_path, get_tests_input_path, get_tests_output_path from TTS.tests import get_tests_path, get_tests_input_path, get_tests_output_path
from utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor
from utils.generic_utils import load_config from TTS.utils.generic_utils import load_config
TESTS_PATH = get_tests_path() TESTS_PATH = get_tests_path()
OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests") OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests")

View File

@ -3,10 +3,10 @@ import unittest
import torch as T import torch as T
from server.synthesizer import Synthesizer from TTS.server.synthesizer import Synthesizer
from tests import get_tests_input_path, get_tests_output_path, get_tests_path from TTS.tests import get_tests_input_path, get_tests_output_path
from utils.text.symbols import phonemes, symbols from TTS.utils.text.symbols import phonemes, symbols
from utils.generic_utils import load_config, save_checkpoint, setup_model from TTS.utils.generic_utils import load_config, save_checkpoint, setup_model
class DemoServerTest(unittest.TestCase): class DemoServerTest(unittest.TestCase):
@ -20,5 +20,6 @@ class DemoServerTest(unittest.TestCase):
def test_in_out(self): def test_in_out(self):
self._create_random_model() self._create_random_model()
config = load_config(os.path.join(get_tests_input_path(), 'server_config.json')) config = load_config(os.path.join(get_tests_input_path(), 'server_config.json'))
config['tts_path'] = get_tests_output_path()
synthesizer = Synthesizer(config) synthesizer = Synthesizer(config)
synthesizer.tts("Better this test works!!") synthesizer.tts("Better this test works!!")

View File

@ -1,9 +1,9 @@
import unittest import unittest
import torch as T import torch as T
from layers.tacotron import Prenet, CBHG, Decoder, Encoder from TTS.layers.tacotron import Prenet, CBHG, Decoder, Encoder
from layers.losses import L1LossMasked from TTS.layers.losses import L1LossMasked
from utils.generic_utils import sequence_mask from TTS.utils.generic_utils import sequence_mask
#pylint: disable=unused-variable #pylint: disable=unused-variable

View File

@ -1,12 +1,14 @@
import os import os
import unittest import unittest
import shutil import shutil
import torch
import numpy as np
from torch.utils.data import DataLoader from torch.utils.data import DataLoader
from utils.generic_utils import load_config from TTS.utils.generic_utils import load_config
from utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor
from datasets import TTSDataset from TTS.datasets import TTSDataset
from datasets.preprocess import ljspeech from TTS.datasets.preprocess import ljspeech
#pylint: disable=unused-variable #pylint: disable=unused-variable
@ -128,12 +130,16 @@ class TestTTSDataset(unittest.TestCase):
item_idx = data[7] item_idx = data[7]
# check mel_spec consistency # check mel_spec consistency
wav = self.ap.load_wav(item_idx[0]) wav = np.asarray(self.ap.load_wav(item_idx[0]), dtype=np.float32)
mel = self.ap.melspectrogram(wav) mel = self.ap.melspectrogram(wav).astype('float32')
mel_dl = mel_input[0].cpu().numpy() mel = torch.FloatTensor(mel).contiguous()
assert (abs(mel.T).astype("float32") mel_dl = mel_input[0]
- abs(mel_dl[:-1]) # NOTE: Below needs to check == 0 but due to an unknown reason
).sum() == 0 # there is a slight difference between two matrices.
# TODO: Check this assert cond more in detail.
assert abs((abs(mel.T)
- abs(mel_dl[:-1])
).sum()) < 1e-5, (abs(mel.T) - abs(mel_dl[:-1])).sum()
# check mel-spec correctness # check mel-spec correctness
mel_spec = mel_input[0].cpu().numpy() mel_spec = mel_input[0].cpu().numpy()

View File

@ -1,8 +1,8 @@
import unittest import unittest
import os import os
from tests import get_tests_input_path from TTS.tests import get_tests_input_path
from datasets.preprocess import common_voice from TTS.datasets.preprocess import common_voice
class TestPreprocessors(unittest.TestCase): class TestPreprocessors(unittest.TestCase):

View File

@ -6,9 +6,9 @@ import numpy as np
from torch import optim from torch import optim
from torch import nn from torch import nn
from utils.generic_utils import load_config from TTS.utils.generic_utils import load_config
from layers.losses import MSELossMasked from TTS.layers.losses import MSELossMasked
from models.tacotron2 import Tacotron2 from TTS.models.tacotron2 import Tacotron2
#pylint: disable=unused-variable #pylint: disable=unused-variable

View File

@ -5,9 +5,9 @@ import unittest
from torch import optim from torch import optim
from torch import nn from torch import nn
from utils.generic_utils import load_config from TTS.utils.generic_utils import load_config
from layers.losses import L1LossMasked from TTS.layers.losses import L1LossMasked
from models.tacotron import Tacotron from TTS.models.tacotron import Tacotron
#pylint: disable=unused-variable #pylint: disable=unused-variable

View File

@ -1,7 +1,7 @@
import unittest import unittest
import torch as T import torch as T
from utils.text import * from TTS.utils.text import *
def test_phoneme_to_sequence(): def test_phoneme_to_sequence():
text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!" text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!"

140
train.py
View File

@ -10,24 +10,26 @@ import torch.nn as nn
from torch import optim from torch import optim
from torch.utils.data import DataLoader from torch.utils.data import DataLoader
from datasets.TTSDataset import MyDataset from TTS.datasets.TTSDataset import MyDataset
from distribute import (DistributedSampler, apply_gradient_allreduce, from distribute import (DistributedSampler, apply_gradient_allreduce,
init_distributed, reduce_tensor) init_distributed, reduce_tensor)
from layers.losses import L1LossMasked, MSELossMasked from TTS.layers.losses import L1LossMasked, MSELossMasked
from utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor
from utils.generic_utils import (NoamLR, check_update, count_parameters, from TTS.utils.generic_utils import (NoamLR, check_update, count_parameters,
create_experiment_folder, get_git_branch, create_experiment_folder, get_git_branch,
load_config, remove_experiment_folder, load_config, remove_experiment_folder,
save_best_model, save_checkpoint, weight_decay, save_best_model, save_checkpoint, weight_decay,
set_init_dict, copy_config_file, setup_model, set_init_dict, copy_config_file, setup_model,
split_dataset) split_dataset, gradual_training_scheduler)
from utils.logger import Logger from TTS.utils.logger import Logger
from utils.speakers import load_speaker_mapping, save_speaker_mapping, \ from TTS.utils.speakers import load_speaker_mapping, save_speaker_mapping, \
get_speakers get_speakers
from utils.synthesis import synthesis from TTS.utils.synthesis import synthesis
from utils.text.symbols import phonemes, symbols from TTS.utils.text.symbols import phonemes, symbols
from utils.visual import plot_alignment, plot_spectrogram from TTS.utils.visual import plot_alignment, plot_spectrogram
from datasets.preprocess import get_preprocessor_by_name from TTS.datasets.preprocess import get_preprocessor_by_name
from TTS.utils.radam import RAdam
torch.backends.cudnn.enabled = True torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = False torch.backends.cudnn.benchmark = False
@ -82,7 +84,7 @@ def setup_loader(ap, is_val=False, verbose=False):
def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
ap, epoch): ap, global_step, epoch):
data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0)) data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0))
if c.use_speaker_embedding: if c.use_speaker_embedding:
speaker_mapping = load_speaker_mapping(OUT_PATH) speaker_mapping = load_speaker_mapping(OUT_PATH)
@ -92,8 +94,13 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
avg_decoder_loss = 0 avg_decoder_loss = 0
avg_stop_loss = 0 avg_stop_loss = 0
avg_step_time = 0 avg_step_time = 0
avg_loader_time = 0
print("\n > Epoch {}/{}".format(epoch, c.epochs), flush=True) print("\n > Epoch {}/{}".format(epoch, c.epochs), flush=True)
batch_n_iter = int(len(data_loader.dataset) / (c.batch_size * num_gpus)) if use_cuda:
batch_n_iter = int(len(data_loader.dataset) / (c.batch_size * num_gpus))
else:
batch_n_iter = int(len(data_loader.dataset) / c.batch_size)
end_time = time.time()
for num_iter, data in enumerate(data_loader): for num_iter, data in enumerate(data_loader):
start_time = time.time() start_time = time.time()
@ -107,6 +114,7 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
stop_targets = data[6] stop_targets = data[6]
avg_text_length = torch.mean(text_lengths.float()) avg_text_length = torch.mean(text_lengths.float())
avg_spec_length = torch.mean(mel_lengths.float()) avg_spec_length = torch.mean(mel_lengths.float())
loader_time = time.time() - end_time
if c.use_speaker_embedding: if c.use_speaker_embedding:
speaker_ids = [speaker_mapping[speaker_name] speaker_ids = [speaker_mapping[speaker_name]
@ -120,8 +128,7 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
stop_targets.size(1) // c.r, -1) stop_targets.size(1) // c.r, -1)
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2)
current_step = num_iter + args.restore_step + \ global_step += 1
epoch * len(data_loader) + 1
# setup lr # setup lr
if c.lr_decay: if c.lr_decay:
@ -176,18 +183,20 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
optimizer_st.step() optimizer_st.step()
else: else:
grad_norm_st = 0 grad_norm_st = 0
step_time = time.time() - start_time step_time = time.time() - start_time
epoch_time += step_time epoch_time += step_time
if current_step % c.print_step == 0: if global_step % c.print_step == 0:
print( print(
" | > Step:{}/{} GlobalStep:{} TotalLoss:{:.5f} PostnetLoss:{:.5f} " " | > Step:{}/{} GlobalStep:{} TotalLoss:{:.5f} PostnetLoss:{:.5f} "
"DecoderLoss:{:.5f} StopLoss:{:.5f} GradNorm:{:.5f} " "DecoderLoss:{:.5f} StopLoss:{:.5f} GradNorm:{:.5f} "
"GradNormST:{:.5f} AvgTextLen:{:.1f} AvgSpecLen:{:.1f} StepTime:{:.2f} LR:{:.6f}".format( "GradNormST:{:.5f} AvgTextLen:{:.1f} AvgSpecLen:{:.1f} StepTime:{:.2f} "
num_iter, batch_n_iter, current_step, loss.item(), "LoaderTime:{:.2f} LR:{:.6f}".format(
num_iter, batch_n_iter, global_step, loss.item(),
postnet_loss.item(), decoder_loss.item(), stop_loss.item(), postnet_loss.item(), decoder_loss.item(), stop_loss.item(),
grad_norm, grad_norm_st, avg_text_length, avg_spec_length, step_time, current_lr), grad_norm, grad_norm_st, avg_text_length, avg_spec_length, step_time,
loader_time, current_lr),
flush=True) flush=True)
# aggregate losses from processes # aggregate losses from processes
@ -202,21 +211,24 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
avg_decoder_loss += float(decoder_loss.item()) avg_decoder_loss += float(decoder_loss.item())
avg_stop_loss += stop_loss if isinstance(stop_loss, float) else float(stop_loss.item()) avg_stop_loss += stop_loss if isinstance(stop_loss, float) else float(stop_loss.item())
avg_step_time += step_time avg_step_time += step_time
avg_loader_time += loader_time
# Plot Training Iter Stats # Plot Training Iter Stats
iter_stats = {"loss_posnet": postnet_loss.item(), # reduce TB load
"loss_decoder": decoder_loss.item(), if global_step % 10 == 0:
"lr": current_lr, iter_stats = {"loss_posnet": postnet_loss.item(),
"grad_norm": grad_norm, "loss_decoder": decoder_loss.item(),
"grad_norm_st": grad_norm_st, "lr": current_lr,
"step_time": step_time} "grad_norm": grad_norm,
tb_logger.tb_train_iter_stats(current_step, iter_stats) "grad_norm_st": grad_norm_st,
"step_time": step_time}
tb_logger.tb_train_iter_stats(global_step, iter_stats)
if current_step % c.save_step == 0: if global_step % c.save_step == 0:
if c.checkpoint: if c.checkpoint:
# save model # save model
save_checkpoint(model, optimizer, optimizer_st, save_checkpoint(model, optimizer, optimizer_st,
postnet_loss.item(), OUT_PATH, current_step, postnet_loss.item(), OUT_PATH, global_step,
epoch) epoch)
# Diagnostic visualizations # Diagnostic visualizations
@ -229,31 +241,34 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
"ground_truth": plot_spectrogram(gt_spec, ap), "ground_truth": plot_spectrogram(gt_spec, ap),
"alignment": plot_alignment(align_img) "alignment": plot_alignment(align_img)
} }
tb_logger.tb_train_figures(current_step, figures) tb_logger.tb_train_figures(global_step, figures)
# Sample audio # Sample audio
if c.model in ["Tacotron", "TacotronGST"]: if c.model in ["Tacotron", "TacotronGST"]:
train_audio = ap.inv_spectrogram(const_spec.T) train_audio = ap.inv_spectrogram(const_spec.T)
else: else:
train_audio = ap.inv_mel_spectrogram(const_spec.T) train_audio = ap.inv_mel_spectrogram(const_spec.T)
tb_logger.tb_train_audios(current_step, tb_logger.tb_train_audios(global_step,
{'TrainAudio': train_audio}, {'TrainAudio': train_audio},
c.audio["sample_rate"]) c.audio["sample_rate"])
end_time = time.time()
avg_postnet_loss /= (num_iter + 1) avg_postnet_loss /= (num_iter + 1)
avg_decoder_loss /= (num_iter + 1) avg_decoder_loss /= (num_iter + 1)
avg_stop_loss /= (num_iter + 1) avg_stop_loss /= (num_iter + 1)
avg_total_loss = avg_decoder_loss + avg_postnet_loss + avg_stop_loss avg_total_loss = avg_decoder_loss + avg_postnet_loss + avg_stop_loss
avg_step_time /= (num_iter + 1) avg_step_time /= (num_iter + 1)
avg_loader_time /= (num_iter + 1)
# print epoch stats # print epoch stats
print( print(
" | > EPOCH END -- GlobalStep:{} AvgTotalLoss:{:.5f} " " | > EPOCH END -- GlobalStep:{} AvgTotalLoss:{:.5f} "
"AvgPostnetLoss:{:.5f} AvgDecoderLoss:{:.5f} " "AvgPostnetLoss:{:.5f} AvgDecoderLoss:{:.5f} "
"AvgStopLoss:{:.5f} EpochTime:{:.2f} " "AvgStopLoss:{:.5f} EpochTime:{:.2f} "
"AvgStepTime:{:.2f}".format(current_step, avg_total_loss, "AvgStepTime:{:.2f} AvgLoaderTime:{:.2f}".format(global_step, avg_total_loss,
avg_postnet_loss, avg_decoder_loss, avg_postnet_loss, avg_decoder_loss,
avg_stop_loss, epoch_time, avg_step_time), avg_stop_loss, epoch_time, avg_step_time,
avg_loader_time),
flush=True) flush=True)
# Plot Epoch Stats # Plot Epoch Stats
@ -263,14 +278,13 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
"loss_decoder": avg_decoder_loss, "loss_decoder": avg_decoder_loss,
"stop_loss": avg_stop_loss, "stop_loss": avg_stop_loss,
"epoch_time": epoch_time} "epoch_time": epoch_time}
tb_logger.tb_train_epoch_stats(current_step, epoch_stats) tb_logger.tb_train_epoch_stats(global_step, epoch_stats)
if c.tb_model_param_stats: if c.tb_model_param_stats:
tb_logger.tb_model_weights(model, current_step) tb_logger.tb_model_weights(model, global_step)
return avg_postnet_loss, global_step
return avg_postnet_loss, current_step
def evaluate(model, criterion, criterion_st, ap, current_step, epoch): def evaluate(model, criterion, criterion_st, ap, global_step, epoch):
data_loader = setup_loader(ap, is_val=True) data_loader = setup_loader(ap, is_val=True)
if c.use_speaker_embedding: if c.use_speaker_embedding:
speaker_mapping = load_speaker_mapping(OUT_PATH) speaker_mapping = load_speaker_mapping(OUT_PATH)
@ -383,14 +397,14 @@ def evaluate(model, criterion, criterion_st, ap, current_step, epoch):
"ground_truth": plot_spectrogram(gt_spec, ap), "ground_truth": plot_spectrogram(gt_spec, ap),
"alignment": plot_alignment(align_img) "alignment": plot_alignment(align_img)
} }
tb_logger.tb_eval_figures(current_step, eval_figures) tb_logger.tb_eval_figures(global_step, eval_figures)
# Sample audio # Sample audio
if c.model in ["Tacotron", "TacotronGST"]: if c.model in ["Tacotron", "TacotronGST"]:
eval_audio = ap.inv_spectrogram(const_spec.T) eval_audio = ap.inv_spectrogram(const_spec.T)
else: else:
eval_audio = ap.inv_mel_spectrogram(const_spec.T) eval_audio = ap.inv_mel_spectrogram(const_spec.T)
tb_logger.tb_eval_audios(current_step, {"ValAudio": eval_audio}, c.audio["sample_rate"]) tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio}, c.audio["sample_rate"])
# compute average losses # compute average losses
avg_postnet_loss /= (num_iter + 1) avg_postnet_loss /= (num_iter + 1)
@ -401,7 +415,7 @@ def evaluate(model, criterion, criterion_st, ap, current_step, epoch):
epoch_stats = {"loss_postnet": avg_postnet_loss, epoch_stats = {"loss_postnet": avg_postnet_loss,
"loss_decoder": avg_decoder_loss, "loss_decoder": avg_decoder_loss,
"stop_loss": avg_stop_loss} "stop_loss": avg_stop_loss}
tb_logger.tb_eval_stats(current_step, epoch_stats) tb_logger.tb_eval_stats(global_step, epoch_stats)
if args.rank == 0 and epoch > c.test_delay_epochs: if args.rank == 0 and epoch > c.test_delay_epochs:
# test sentences # test sentences
@ -409,12 +423,14 @@ def evaluate(model, criterion, criterion_st, ap, current_step, epoch):
test_figures = {} test_figures = {}
print(" | > Synthesizing test sentences") print(" | > Synthesizing test sentences")
speaker_id = 0 if c.use_speaker_embedding else None speaker_id = 0 if c.use_speaker_embedding else None
style_wav = c.get("style_wav_for_test")
for idx, test_sentence in enumerate(test_sentences): for idx, test_sentence in enumerate(test_sentences):
try: try:
wav, alignment, decoder_output, postnet_output, stop_tokens = synthesis( wav, alignment, decoder_output, postnet_output, stop_tokens = synthesis(
model, test_sentence, c, use_cuda, ap, model, test_sentence, c, use_cuda, ap,
speaker_id=speaker_id) speaker_id=speaker_id,
file_path = os.path.join(AUDIO_PATH, str(current_step)) style_wav=style_wav)
file_path = os.path.join(AUDIO_PATH, str(global_step))
os.makedirs(file_path, exist_ok=True) os.makedirs(file_path, exist_ok=True)
file_path = os.path.join(file_path, file_path = os.path.join(file_path,
"TestSentence_{}.wav".format(idx)) "TestSentence_{}.wav".format(idx))
@ -425,8 +441,8 @@ def evaluate(model, criterion, criterion_st, ap, current_step, epoch):
except: except:
print(" !! Error creating Test Sentence -", idx) print(" !! Error creating Test Sentence -", idx)
traceback.print_exc() traceback.print_exc()
tb_logger.tb_test_audios(current_step, test_audios, c.audio['sample_rate']) tb_logger.tb_test_audios(global_step, test_audios, c.audio['sample_rate'])
tb_logger.tb_test_figures(current_step, test_figures) tb_logger.tb_test_figures(global_step, test_figures)
return avg_postnet_loss return avg_postnet_loss
@ -464,9 +480,9 @@ def main(args): #pylint: disable=redefined-outer-name
print(" | > Num output units : {}".format(ap.num_freq), flush=True) print(" | > Num output units : {}".format(ap.num_freq), flush=True)
optimizer = optim.Adam(model.parameters(), lr=c.lr, weight_decay=0) optimizer = RAdam(model.parameters(), lr=c.lr, weight_decay=0)
if c.stopnet and c.separate_stopnet: if c.stopnet and c.separate_stopnet:
optimizer_st = optim.Adam( optimizer_st = RAdam(
model.decoder.stopnet.parameters(), lr=c.lr, weight_decay=0) model.decoder.stopnet.parameters(), lr=c.lr, weight_decay=0)
else: else:
optimizer_st = None optimizer_st = None
@ -524,11 +540,19 @@ def main(args): #pylint: disable=redefined-outer-name
if 'best_loss' not in locals(): if 'best_loss' not in locals():
best_loss = float('inf') best_loss = float('inf')
global_step = args.restore_step
for epoch in range(0, c.epochs): for epoch in range(0, c.epochs):
train_loss, current_step = train(model, criterion, criterion_st, # set gradual training
optimizer, optimizer_st, scheduler, if c.gradual_training is not None:
ap, epoch) r, c.batch_size = gradual_training_scheduler(global_step, c)
val_loss = evaluate(model, criterion, criterion_st, ap, current_step, epoch) c.r = r
model.decoder.set_r(r)
print(" > Number of outputs per iteration:", model.decoder.r)
train_loss, global_step = train(model, criterion, criterion_st,
optimizer, optimizer_st, scheduler,
ap, global_step, epoch)
val_loss = evaluate(model, criterion, criterion_st, ap, global_step, epoch)
print( print(
" | > Training Loss: {:.5f} Validation Loss: {:.5f}".format( " | > Training Loss: {:.5f} Validation Loss: {:.5f}".format(
train_loss, val_loss), train_loss, val_loss),
@ -537,7 +561,7 @@ def main(args): #pylint: disable=redefined-outer-name
if c.run_eval: if c.run_eval:
target_loss = val_loss target_loss = val_loss
best_loss = save_best_model(model, optimizer, target_loss, best_loss, best_loss = save_best_model(model, optimizer, target_loss, best_loss,
OUT_PATH, current_step, epoch) OUT_PATH, global_step, epoch)
if __name__ == '__main__': if __name__ == '__main__':
@ -571,7 +595,7 @@ if __name__ == '__main__':
'--output_folder', '--output_folder',
type=str, type=str,
default='', default='',
help='folder name for traning outputs.' help='folder name for training outputs.'
) )
# DISTRIBUTED # DISTRIBUTED

29
tts_namespace/README.md Normal file
View File

@ -0,0 +1,29 @@
This folder contains a symlink called TTS to the parent folder:
lrwxr-xr-x TTS -> ..
This is used to appease the distribute/setuptools gods. When the project was
initially set up, the repository folder itself was considered a namespace, and
development was done with `sys.path` hacks. This means if you tried to install
TTS, `setup.py` would see the packages `models`, `utils`, `layers`... instead of
`TTS.models`, `TTS.utils`...
Installing TTS would then pollute the package namespace with generic names like
those above. In order to make things installable in both install and development
modes (`pip install /path/to/TTS` and `pip install -e /path/to/TTS`), we needed
to add an additional 'TTS' namespace to avoid this pollution. A virtual redirect
using `package_dir` in `setup.py` is not enough because it breaks the editable
installation, which can only handle the simplest of `package_dir` redirects.
Our solution is to use a symlink in order to add the extra `TTS` namespace. In
`setup.py`, we only look for packages inside `tts_namespace` (this folder),
which contains a symlink called TTS pointing to the repository root. The final
result is that `setuptools.find_packages` will find `TTS.models`, `TTS.utils`...
With this hack, `pip install -e` will then add a symlink to the `tts_namespace`
in your `site-packages` folder, which works properly. It's important not to add
anything else in this folder because it will pollute the package namespace when
installing the project.
This does not work if you check out your project on a filesystem that does not
support symlinks.
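To make the wiring concrete, here is a minimal sketch of the packaging arguments this layout relies on, mirroring the `setup.py` hunk above; only `package_dir`/`packages` reflect the actual change, the name/version metadata are placeholders, not taken from this commit.

```python
# Minimal sketch of the tts_namespace redirect; only package_dir/packages
# mirror the actual setup.py change, the metadata values are placeholders.
from setuptools import setup, find_packages

setup(
    name='TTS',                               # placeholder metadata
    version='0.0.0',
    package_dir={'': 'tts_namespace'},        # packages live under the symlinked folder
    packages=find_packages('tts_namespace'),  # finds TTS.models, TTS.utils, ...
)
```

With this layout, both `pip install /path/to/TTS` and `pip install -e /path/to/TTS` resolve `import TTS.models` through the `TTS -> ..` symlink, as described above.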

1
tts_namespace/TTS Symbolic link
View File

@ -0,0 +1 @@
..

View File

@@ -113,8 +113,10 @@ class AudioProcessor(object):
     def _stft_parameters(self, ):
         """Compute necessary stft parameters with given time values"""
         n_fft = (self.num_freq - 1) * 2
+        factor = self.frame_length_ms / self.frame_shift_ms
+        assert (factor).is_integer(), " [!] frame_shift_ms should divide frame_length_ms"
         hop_length = int(self.frame_shift_ms / 1000.0 * self.sample_rate)
-        win_length = int(self.frame_length_ms / 1000.0 * self.sample_rate)
+        win_length = int(hop_length * factor)
         return n_fft, hop_length, win_length

     def _amp_to_db(self, x):

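As a quick sanity check of the new `_stft_parameters` derivation, here is a standalone restatement with example values. The 22050 Hz / 50 ms / 12.5 ms settings are assumptions for illustration, not values taken from this diff:

# Standalone restatement of the new hop/win length computation with assumed values.
sample_rate = 22050       # assumed
frame_length_ms = 50.0    # assumed
frame_shift_ms = 12.5     # assumed
num_freq = 1025           # assumed

n_fft = (num_freq - 1) * 2                               # 2048
factor = frame_length_ms / frame_shift_ms                # 4.0; must be an integer
assert factor.is_integer(), " [!] frame_shift_ms should divide frame_length_ms"
hop_length = int(frame_shift_ms / 1000.0 * sample_rate)  # 275
win_length = int(hop_length * factor)                    # 1100 (old formula gave 1102)
print(n_fft, hop_length, win_length)

The practical effect is that `win_length` is now an exact multiple of `hop_length`, so the two values can no longer drift apart through independent rounding.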
View File

@@ -121,7 +121,8 @@ def save_checkpoint(model, optimizer, optimizer_st, model_loss, out_path,
         'step': current_step,
         'epoch': epoch,
         'linear_loss': model_loss,
-        'date': datetime.date.today().strftime("%B %d, %Y")
+        'date': datetime.date.today().strftime("%B %d, %Y"),
+        'r': model.decoder.r
     }
     torch.save(state, checkpoint_path)
@@ -136,7 +137,8 @@ def save_best_model(model, optimizer, model_loss, best_loss, out_path,
         'step': current_step,
         'epoch': epoch,
         'linear_loss': model_loss,
-        'date': datetime.date.today().strftime("%B %d, %Y")
+        'date': datetime.date.today().strftime("%B %d, %Y"),
+        'r': model.decoder.r
     }
         best_loss = model_loss
         bestmodel_path = 'best_model.pth.tar'
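Since checkpoints now carry the decoder's reduction factor, the loading side can re-apply it. A hypothetical helper sketch; the function name and fallback behaviour are mine, not part of this PR:

import torch

def restore_decoder_r(model, checkpoint_path):
    """Hypothetical sketch: re-apply the saved reduction factor after loading."""
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    # Checkpoints written before this change have no 'r'; keep the model's current value then.
    r = checkpoint.get('r', model.decoder.r)
    model.decoder.set_r(r)
    return r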
@@ -248,7 +250,7 @@ def set_init_dict(model_dict, checkpoint, c):

 def setup_model(num_chars, num_speakers, c):
     print(" > Using model: {}".format(c.model))
-    MyModel = importlib.import_module('models.' + c.model.lower())
+    MyModel = importlib.import_module('TTS.models.' + c.model.lower())
     MyModel = getattr(MyModel, c.model)
     if c.model.lower() in ["tacotron", "tacotrongst"]:
         model = MyModel(
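For readers unfamiliar with the dynamic lookup above, a small self-contained restatement. The helper name is mine; the class-name convention simply mirrors how the config's `model` field is used here:

import importlib

def load_model_class(model_name):
    # e.g. 'Tacotron' -> module 'TTS.models.tacotron', class 'Tacotron'
    module = importlib.import_module('TTS.models.' + model_name.lower())
    return getattr(module, model_name)

# MyModel = load_model_class(c.model)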
@@ -305,3 +307,10 @@ def split_dataset(items):
     else:
         return items[:eval_split_size], items[eval_split_size:]
+
+
+def gradual_training_scheduler(global_step, config):
+    new_values = None
+    for values in config.gradual_training:
+        if global_step >= values[0]:
+            new_values = values
+    return new_values[1], new_values[2]

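To show how `gradual_training_scheduler` reads its config, a small example. The schedule values below are made up; each entry is assumed to be `[start_step, r, batch_size]`, matching how the return values are consumed in `train.py`:

# Example schedule; values are illustrative only.
class DummyConfig:
    gradual_training = [[0, 7, 64], [10000, 5, 64], [50000, 3, 32], [130000, 2, 32]]

config = DummyConfig()
for step in (0, 20000, 200000):
    r, batch_size = gradual_training_scheduler(step, config)
    print(step, '->', 'r =', r, 'batch_size =', batch_size)
# 0      -> r = 7 batch_size = 64
# 20000  -> r = 5 batch_size = 64
# 200000 -> r = 2 batch_size = 32

Note that the function assumes the schedule has an entry starting at step 0; otherwise `new_values` stays `None` and the final indexing fails.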
154
utils/radam.py Normal file
View File

@@ -0,0 +1,154 @@
import math
import torch
from torch.optim.optimizer import Optimizer
# adapted from https://github.com/LiyuanLucasLiu/RAdam
class RAdam(Optimizer):
def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
self.buffer = [[None, None, None] for ind in range(10)]
super(RAdam, self).__init__(params, defaults)
def __setstate__(self, state): # pylint: disable= useless-super-delegation
super(RAdam, self).__setstate__(state)
def step(self, closure=None):
loss = None
if closure is not None:
loss = closure()
for group in self.param_groups:
for p in group['params']:
if p.grad is None:
continue
grad = p.grad.data.float()
if grad.is_sparse:
raise RuntimeError(
'RAdam does not support sparse gradients')
p_data_fp32 = p.data.float()
state = self.state[p]
if not state:
state['step'] = 0
state['exp_avg'] = torch.zeros_like(p_data_fp32)
state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
else:
state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
state['exp_avg_sq'] = state['exp_avg_sq'].type_as(
p_data_fp32)
exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
beta1, beta2 = group['betas']
exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
exp_avg.mul_(beta1).add_(1 - beta1, grad)
state['step'] += 1
buffered = self.buffer[int(state['step'] % 10)]
if state['step'] == buffered[0]:
N_sma, step_size = buffered[1], buffered[2]
else:
buffered[0] = state['step']
beta2_t = beta2 ** state['step']
N_sma_max = 2 / (1 - beta2) - 1
N_sma = N_sma_max - 2 * \
state['step'] * beta2_t / (1 - beta2_t)
buffered[1] = N_sma
# more conservative since it's an approximated value
if N_sma >= 5:
step_size = group['lr'] * math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (
N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step'])
else:
step_size = group['lr'] / (1 - beta1 ** state['step'])
buffered[2] = step_size
if group['weight_decay'] != 0:
p_data_fp32.add_(-group['weight_decay']
* group['lr'], p_data_fp32)
# more conservative since it's an approximated value
if N_sma >= 5:
denom = exp_avg_sq.sqrt().add_(group['eps'])
p_data_fp32.addcdiv_(-step_size, exp_avg, denom)
else:
p_data_fp32.add_(-step_size, exp_avg)
p.data.copy_(p_data_fp32)
return loss
class PlainRAdam(Optimizer):
def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
super(PlainRAdam, self).__init__(params, defaults)
def __setstate__(self, state): # pylint: disable= useless-super-delegation
super(PlainRAdam, self).__setstate__(state)
def step(self, closure=None):
loss = None
if closure is not None:
loss = closure()
for group in self.param_groups:
for p in group['params']:
if p.grad is None:
continue
grad = p.grad.data.float()
if grad.is_sparse:
raise RuntimeError(
'RAdam does not support sparse gradients')
p_data_fp32 = p.data.float()
state = self.state[p]
if not state:
state['step'] = 0
state['exp_avg'] = torch.zeros_like(p_data_fp32)
state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
else:
state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
state['exp_avg_sq'] = state['exp_avg_sq'].type_as(
p_data_fp32)
exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
beta1, beta2 = group['betas']
exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
exp_avg.mul_(beta1).add_(1 - beta1, grad)
state['step'] += 1
beta2_t = beta2 ** state['step']
N_sma_max = 2 / (1 - beta2) - 1
N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)
if group['weight_decay'] != 0:
p_data_fp32.add_(-group['weight_decay']
* group['lr'], p_data_fp32)
# more conservative since it's an approximated value
if N_sma >= 5:
step_size = group['lr'] * math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (
N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step'])
denom = exp_avg_sq.sqrt().add_(group['eps'])
p_data_fp32.addcdiv_(-step_size, exp_avg, denom)
else:
step_size = group['lr'] / (1 - beta1 ** state['step'])
p_data_fp32.add_(-step_size, exp_avg)
p.data.copy_(p_data_fp32)
return loss

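A minimal usage sketch for the new optimizer, written against the PyTorch API of the time (the `addcmul_`/`add_` call signatures above predate the later keyword-only forms). The toy model and hyper-parameters are placeholders, not values from this PR:

import torch
import torch.nn as nn
from TTS.utils.radam import RAdam

model = nn.Linear(80, 80)                       # stand-in for the real TTS model
optimizer = RAdam(model.parameters(), lr=1e-4, weight_decay=0.0)

loss = model(torch.randn(8, 80)).pow(2).mean()  # dummy loss
loss.backward()
optimizer.step()
optimizer.zero_grad()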
View File

@@ -1,7 +1,7 @@
 import os
 import json

-from datasets.preprocess import get_preprocessor_by_name
+from TTS.datasets.preprocess import get_preprocessor_by_name

 def make_speakers_json_path(out_path):

View File

@@ -50,7 +50,7 @@ def parse_outputs(postnet_output, decoder_output, alignments):
     return postnet_output, decoder_output, alignment


-def trim_silence(wav):
+def trim_silence(wav, ap):
     return wav[:ap.find_endpoint(wav)]
@@ -114,5 +114,5 @@ def synthesis(model,
         wav = inv_spectrogram(postnet_output, ap, CONFIG)
     # trim silence
     if do_trim_silence:
-        wav = trim_silence(wav)
+        wav = trim_silence(wav, ap)
     return wav, alignment, decoder_output, postnet_output, stop_tokens

View File

@@ -3,8 +3,8 @@
 import re
 import phonemizer
 from phonemizer.phonemize import phonemize
-from utils.text import cleaners
-from utils.text.symbols import symbols, phonemes, _phoneme_punctuations, _bos, \
+from TTS.utils.text import cleaners
+from TTS.utils.text.symbols import symbols, phonemes, _phoneme_punctuations, _bos, \
     _eos

 # Mappings from symbol to numeric ID and vice versa:
@@ -17,7 +17,7 @@ _ID_TO_PHONEMES = {i: s for i, s in enumerate(phonemes)}
 # Regular expression matching text enclosed in curly braces:
 _CURLY_RE = re.compile(r'(.*?)\{(.+?)\}(.*)')

-# Regular expression matchinf punctuations, ignoring empty space
+# Regular expression matching punctuations, ignoring empty space
 PHONEME_PUNCTUATION_PATTERN = r'['+_phoneme_punctuations+']+'
@@ -47,7 +47,7 @@ def text2phone(text, language):

 def pad_with_eos_bos(phoneme_sequence):
-    return [_PHONEMES_TO_ID[_bos]] + phoneme_sequence + [_PHONEMES_TO_ID[_eos]]
+    return [_PHONEMES_TO_ID[_bos]] + list(phoneme_sequence) + [_PHONEMES_TO_ID[_eos]]


 def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False):

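The `list(...)` wrapper above matters when the phoneme sequence arrives as something other than a plain Python list. A toy reproduction; the IDs are made up:

import numpy as np

bos_id, eos_id = 131, 132        # assumed IDs, illustration only
seq = np.array([4, 8, 15])       # e.g. a sequence that arrives as an ndarray

# [bos_id] + seq would do element-wise addition instead of concatenation;
# with a tuple it would raise a TypeError. Wrapping in list() handles both.
padded = [bos_id] + list(seq) + [eos_id]
print(padded)                    # [131, 4, 8, 15, 132]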
View File

@@ -18,7 +18,7 @@ _vowels = 'iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻ'
 _non_pulmonic_consonants = 'ʘɓǀɗǃʄǂɠǁʛ'
 _pulmonic_consonants = 'pbtdʈɖcɟkɡʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟ'
 _suprasegmentals = 'ˈˌːˑ'
-_other_symbols = 'ʍwɥʜʢʡɕʑɺɧ '
+_other_symbols = 'ʍwɥʜʢʡɕʑɺɧ'
 _diacrilics = 'ɚ˞ɫ'

 _phonemes = sorted(list(_vowels + _non_pulmonic_consonants + _pulmonic_consonants + _suprasegmentals + _other_symbols + _diacrilics))

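The one-character change above removes a trailing space from `_other_symbols`; with it, the space character lands in the sorted phoneme inventory and gets its own symbol ID, which is presumably why it was dropped. A quick check:

_other_symbols_old = 'ʍwɥʜʢʡɕʑɺɧ '   # trailing space, as before this change
_other_symbols_new = 'ʍwɥʜʢʡɕʑɺɧ'

print(' ' in sorted(_other_symbols_old))  # True  -> whitespace becomes a "phoneme"
print(' ' in sorted(_other_symbols_new))  # False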
View File

@@ -1,14 +1,19 @@
+import torch
 import librosa
 import matplotlib
 matplotlib.use('Agg')
 import matplotlib.pyplot as plt
-from utils.text import phoneme_to_sequence, sequence_to_phoneme
+from TTS.utils.text import phoneme_to_sequence, sequence_to_phoneme


-def plot_alignment(alignment, info=None):
-    fig, ax = plt.subplots(figsize=(16, 10))
+def plot_alignment(alignment, info=None, fig_size=(16, 10), title=None):
+    if isinstance(alignment, torch.Tensor):
+        alignment_ = alignment.detach().cpu().numpy().squeeze()
+    else:
+        alignment_ = alignment
+    fig, ax = plt.subplots(figsize=fig_size)
     im = ax.imshow(
-        alignment.T, aspect='auto', origin='lower', interpolation='none')
+        alignment_.T, aspect='auto', origin='lower', interpolation='none')
     fig.colorbar(im, ax=ax)
     xlabel = 'Decoder timestep'
     if info is not None:
@@ -17,12 +22,18 @@ def plot_alignment(alignment, info=None):
     plt.ylabel('Encoder timestep')
     # plt.yticks(range(len(text)), list(text))
     plt.tight_layout()
+    if title is not None:
+        plt.title(title)
     return fig


-def plot_spectrogram(linear_output, audio):
-    spectrogram = audio._denormalize(linear_output)
-    fig = plt.figure(figsize=(16, 10))
+def plot_spectrogram(linear_output, audio, fig_size=(16, 10)):
+    if isinstance(linear_output, torch.Tensor):
+        linear_output_ = linear_output.detach().cpu().numpy().squeeze()
+    else:
+        linear_output_ = linear_output
+    spectrogram = audio._denormalize(linear_output_)  # pylint: disable=protected-access
+    fig = plt.figure(figsize=fig_size)
     plt.imshow(spectrogram.T, aspect="auto", origin="lower")
     plt.colorbar()
     plt.tight_layout()
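With these changes the plotting helpers accept either torch tensors or numpy arrays, so training code can pass model outputs directly. A usage sketch; the random alignment is a placeholder, not real model output:

import torch
from TTS.utils.visual import plot_alignment

fake_alignment = torch.rand(1, 120, 40)   # [batch, decoder steps, encoder steps]
fig = plot_alignment(fake_alignment, fig_size=(12, 8), title='step 1000')
fig.savefig('alignment.png')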