From 5482a0f62d3da9821b4689f4e9268580904a081f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 19 May 2021 14:00:44 +0200 Subject: [PATCH 01/36] type def for gradual_training --- TTS/tts/configs/tacotron_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/tts/configs/tacotron_config.py b/TTS/tts/configs/tacotron_config.py index d3a54269..a567cd88 100644 --- a/TTS/tts/configs/tacotron_config.py +++ b/TTS/tts/configs/tacotron_config.py @@ -122,7 +122,7 @@ class TacotronConfig(BaseTTSConfig): gst_style_input: str = None # model specific params r: int = 2 - gradual_training: List[List] = None + gradual_training: List[List[int]] = None memory_size: int = -1 prenet_type: str = "original" prenet_dropout: bool = True From b8f50d3d86f77aeba453ff6deabaf447215a5331 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 21 May 2021 00:30:39 +0200 Subject: [PATCH 02/36] replace unidecode with anyascii --- TTS/tts/utils/text/cleaners.py | 16 ++-------------- requirements.txt | 2 +- tests/tts_tests/test_glow_tts_train.py | 2 +- tests/tts_tests/test_speedy_speech_train.py | 2 +- tests/vocoder_tests/test_melgan_train.py | 1 + 5 files changed, 6 insertions(+), 17 deletions(-) diff --git a/TTS/tts/utils/text/cleaners.py b/TTS/tts/utils/text/cleaners.py index 2eddcdb8..3d2caa97 100644 --- a/TTS/tts/utils/text/cleaners.py +++ b/TTS/tts/utils/text/cleaners.py @@ -1,18 +1,6 @@ -""" -Cleaners are transformations that run over the input text at both training and eval time. - -Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" -hyperparameter. Some cleaners are English-specific. You'll typically want to use: - 1. "english_cleaners" for English text - 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using - the Unidecode library (https://pypi.python.org/pypi/Unidecode) - 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update - the symbols in symbols.py to match your data). 
-""" - import re -from unidecode import unidecode +from anyascii import anyascii from TTS.tts.utils.text.chinese_mandarin.numbers import replace_numbers_to_characters_in_text @@ -47,7 +35,7 @@ def collapse_whitespace(text): def convert_to_ascii(text): - return unidecode(text) + return anyascii(text) def remove_aux_symbols(text): diff --git a/requirements.txt b/requirements.txt index c6ce7672..b376eb1b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,5 +17,5 @@ torch>=1.7 tqdm numba==0.52 umap-learn==0.4.6 -unidecode==0.4.20 +anyascii coqpit diff --git a/tests/tts_tests/test_glow_tts_train.py b/tests/tts_tests/test_glow_tts_train.py index 00c7e852..2e675d13 100644 --- a/tests/tts_tests/test_glow_tts_train.py +++ b/tests/tts_tests/test_glow_tts_train.py @@ -17,7 +17,7 @@ config = GlowTTSConfig( text_cleaner="english_cleaners", use_phonemes=True, phoneme_language="zh-CN", - phoneme_cache_path='tests/data/ljspeech/phoneme_cache/', + phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", run_eval=True, test_delay_epochs=-1, epochs=1, diff --git a/tests/tts_tests/test_speedy_speech_train.py b/tests/tts_tests/test_speedy_speech_train.py index cc2845c2..3f508117 100644 --- a/tests/tts_tests/test_speedy_speech_train.py +++ b/tests/tts_tests/test_speedy_speech_train.py @@ -17,7 +17,7 @@ config = SpeedySpeechConfig( text_cleaner="english_cleaners", use_phonemes=True, phoneme_language="zh-CN", - phoneme_cache_path='tests/data/ljspeech/phoneme_cache/', + phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", run_eval=True, test_delay_epochs=-1, epochs=1, diff --git a/tests/vocoder_tests/test_melgan_train.py b/tests/vocoder_tests/test_melgan_train.py index de48ca24..3ff65b5a 100644 --- a/tests/vocoder_tests/test_melgan_train.py +++ b/tests/vocoder_tests/test_melgan_train.py @@ -19,6 +19,7 @@ config = MelganConfig( seq_len=2048, eval_split_size=1, print_step=1, + discriminator_model_params={"base_channels": 16, "max_channels": 256, "downsample_factors": [4, 4, 4]}, print_eval=True, data_path="tests/data/ljspeech", output_path=output_path, From 0536aa6d0f41b125dd96811a8a5b04fac70d6652 Mon Sep 17 00:00:00 2001 From: Katsuya Iida Date: Sat, 22 May 2021 17:12:19 +0900 Subject: [PATCH 03/36] Japanese Tacotron 2 model --- TTS/tts/configs/kokoro_tacotron2.json | 173 ++++++++++++ TTS/tts/datasets/preprocess.py | 14 + TTS/tts/utils/japanese/__init__.py | 1 + TTS/tts/utils/japanese/text.py | 380 ++++++++++++++++++++++++++ TTS/tts/utils/japanese/text_test.py | 22 ++ TTS/tts/utils/text/__init__.py | 5 + requirements.txt | 2 + 7 files changed, 597 insertions(+) create mode 100644 TTS/tts/configs/kokoro_tacotron2.json create mode 100644 TTS/tts/utils/japanese/__init__.py create mode 100644 TTS/tts/utils/japanese/text.py create mode 100644 TTS/tts/utils/japanese/text_test.py diff --git a/TTS/tts/configs/kokoro_tacotron2.json b/TTS/tts/configs/kokoro_tacotron2.json new file mode 100644 index 00000000..f5d41194 --- /dev/null +++ b/TTS/tts/configs/kokoro_tacotron2.json @@ -0,0 +1,173 @@ +{ + "model": "Tacotron2", + "run_name": "kokoro-ddc", + "run_description": "tacotron2 with DDC and differential spectral loss.", + + // AUDIO PARAMETERS + "audio":{ + // stft parameters + "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. + "win_length": 1024, // stft window length in ms. + "hop_length": 256, // stft window hop-lengh in ms. + "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. + "frame_shift_ms": null, // stft window hop-lengh in ms. 
If null, 'hop_length' is used. + + // Audio processing parameters + "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. + "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. + "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. + + // Silence trimming + "do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (true), TWEB (false), Nancy (true) + "trim_db": 60, // threshold for timming silence. Set this according to your dataset. + + // Griffin-Lim + "power": 1.5, // value to sharpen wav signals after GL algorithm. + "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. + + // MelSpectrogram parameters + "num_mels": 80, // size of the mel spec frame. + "mel_fmin": 50.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 7600.0, // maximum freq level for mel-spec. Tune for dataset!! + "spec_gain": 1, + + // Normalization parameters + "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. + "min_level_db": -100, // lower bound for normalization + "symmetric_norm": true, // move normalization to range [-1, 1] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "stats_path": "./scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored + }, + + // VOCABULARY PARAMETERS + // if custom character set is not defined, + // default set in symbols.py is used + "characters":{ + "pad": "_", + "eos": "~", + "bos": "^", + "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ", + "punctuations": "!'(),-.:;? ", + "phonemes": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" + }, + + // DISTRIBUTED TRAINING + "distributed":{ + "backend": "nccl", + "url": "tcp:\/\/localhost:54321" + }, + + "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. + + // TRAINING + "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + "eval_batch_size":16, + "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. + "gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed. + "mixed_precision": true, // level of optimization with NVIDIA's apex feature for automatic mixed FP16/FP32 precision (AMP), NOTE: currently only O1 is supported, and use "O1" to activate. + + // LOSS SETTINGS + "loss_masking": true, // enable / disable loss masking against the sequence padding. + "decoder_loss_alpha": 0.5, // original decoder loss weight. If > 0, it is enabled + "postnet_loss_alpha": 0.25, // original postnet loss weight. If > 0, it is enabled + "postnet_diff_spec_alpha": 0.25, // differential spectral loss weight. 
If > 0, it is enabled + "decoder_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled + "decoder_ssim_alpha": 0.5, // decoder ssim loss weight. If > 0, it is enabled + "postnet_ssim_alpha": 0.25, // postnet ssim loss weight. If > 0, it is enabled + "ga_alpha": 5.0, // weight for guided attention loss. If > 0, guided attention is enabled. + "stopnet_pos_weight": 15.0, // pos class weight for stopnet loss since there are way more negative samples than positive samples. + + + // VALIDATION + "run_eval": true, + "test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time. + "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences. + + // OPTIMIZER + "noam_schedule": false, // use noam warmup and lr schedule. + "grad_clip": 1.0, // upper limit for gradients for clipping. + "epochs": 1000, // total number of epochs to train. + "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. + "wd": 0.000001, // Weight decay weight. + "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" + "seq_len_norm": false, // Normalize eash sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths. + + // TACOTRON PRENET + "memory_size": -1, // ONLY TACOTRON - size of the memory queue used fro storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame. + "prenet_type": "original", // "original" or "bn". + "prenet_dropout": true, // enable/disable dropout at prenet. + + // TACOTRON ATTENTION + "attention_type": "original", // 'original' , 'graves', 'dynamic_convolution' + "attention_heads": 4, // number of attention heads (only for 'graves') + "attention_norm": "sigmoid", // softmax or sigmoid. + "windowing": false, // Enables attention windowing. Used only in eval mode. + "use_forward_attn": false, // if it uses forward attention. In general, it aligns faster. + "forward_attn_mask": false, // Additional masking forcing monotonicity only in eval mode. + "transition_agent": false, // enable/disable transition agent of forward attention. + "location_attn": true, // enable_disable location sensitive attention. It is enabled for TACOTRON by default. + "bidirectional_decoder": false, // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset. + "double_decoder_consistency": true, // use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/ + "ddc_r": 7, // reduction rate for coarse decoder. + + // STOPNET + "stopnet": true, // Train stopnet predicting the end of synthesis. + "separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER. + + // TENSORBOARD and LOGGING + "print_step": 25, // Number of steps to log training on console. + "tb_plot_step": 100, // Number of steps to plot TB training figures. + "print_eval": false, // If True, it prints intermediate loss values in evalulation. + "save_step": 10000, // Number of training steps expected to save traninpg stats and checkpoints. 
+ "checkpoint": true, // If true, it saves checkpoints per "save_step" + "keep_all_best": false, // If true, keeps all best_models after keep_after steps + "keep_after": 10000, // Global step after which to keep best models if keep_all_best is true + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + + // DATA LOADING + "text_cleaner": "basic_cleaners", + "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. + "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_val_loader_workers": 4, // number of evaluation data loader processes. + "batch_group_size": 4, //Number of batches to shuffle after bucketing. + "min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training + "max_seq_len": 153, // DATASET-RELATED: maximum text length + "compute_input_seq_cache": false, // if true, text sequences are computed before starting training. If phonemes are enabled, they are also computed at this stage. + "use_noise_augment": true, + + // PATHS + "output_path": "./Models/Kokoro/", + + // PHONEMES + "phoneme_cache_path": "./phoneme_cache/", // phoneme computation is slow, therefore, it caches results in the given folder. + "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation. + "phoneme_language": "ja-jp", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages + + // MULTI-SPEAKER and GST + "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. + "use_gst": false, // use global style tokens + "use_external_speaker_embedding_file": false, // if true, forces the model to use external embedding per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 + "external_speaker_embedding_file": "../../speakers-vctk-en.json", // if not null and use_external_speaker_embedding_file is true, it is used to load a specific embedding file and thus uses these embeddings instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 + "gst": { // gst parameter if gst is enabled + "gst_style_input": null, // Condition the style input either on a + // -> wave file [path to wave] or + // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15} + // with the dictionary being len(dict) <= len(gst_style_tokens). + "gst_embedding_dim": 512, + "gst_num_heads": 4, + "gst_style_tokens": 10, + "gst_use_speaker_embedding": false + }, + + // DATASETS + "datasets": // List of datasets. They all merged and they get different speaker_ids. 
+ [ + { + "name": "kokoro", + "path": "./kokoro-speech-v1_1-small/", + "meta_file_train": "metadata.csv", // for vtck if list, ignore speakers id in list for train, its useful for test cloning with new speakers + "meta_file_val": null + } + ] +} diff --git a/TTS/tts/datasets/preprocess.py b/TTS/tts/datasets/preprocess.py index 72ab160e..271b1734 100644 --- a/TTS/tts/datasets/preprocess.py +++ b/TTS/tts/datasets/preprocess.py @@ -424,3 +424,17 @@ def baker(root_path: str, meta_file: str) -> List[List[str]]: wav_path = os.path.join(root_path, "clips_22", wav_name) items.append([text, wav_path, speaker_name]) return items + + +def kokoro(root_path, meta_file): + """Japanese single-speaker dataset from https://github.com/kaiidams/Kokoro-Speech-Dataset""" + txt_file = os.path.join(root_path, meta_file) + items = [] + speaker_name = "kokoro" + with open(txt_file, "r") as ttf: + for line in ttf: + cols = line.split("|") + wav_file = os.path.join(root_path, "wavs", cols[0] + '.wav') + text = cols[2].replace(" ", "") + items.append([text, wav_file, speaker_name]) + return items diff --git a/TTS/tts/utils/japanese/__init__.py b/TTS/tts/utils/japanese/__init__.py new file mode 100644 index 00000000..0ce7a99d --- /dev/null +++ b/TTS/tts/utils/japanese/__init__.py @@ -0,0 +1 @@ +from .text import japanese_text2phone \ No newline at end of file diff --git a/TTS/tts/utils/japanese/text.py b/TTS/tts/utils/japanese/text.py new file mode 100644 index 00000000..4c8936ac --- /dev/null +++ b/TTS/tts/utils/japanese/text.py @@ -0,0 +1,380 @@ +# Convert Japanese text to phonemes which is +# compatible with Julius https://github.com/julius-speech/segmentation-kit + +import re +import MeCab +from typing import List, Tuple + +_CONVRULES = [ + # Conversion of 2 letters + 'アァ/ a a', + 'イィ/ i i', + 'イェ/ i e', + 'イャ/ y a', + 'ウゥ/ u:', + 'エェ/ e e', + 'オォ/ o:', + 'カァ/ k a:', + 'キィ/ k i:', + 'クゥ/ k u:', + 'クャ/ ky a', + 'クュ/ ky u', + 'クョ/ ky o', + 'ケェ/ k e:', + 'コォ/ k o:', + 'ガァ/ g a:', + 'ギィ/ g i:', + 'グゥ/ g u:', + 'グャ/ gy a', + 'グュ/ gy u', + 'グョ/ gy o', + 'ゲェ/ g e:', + 'ゴォ/ g o:', + 'サァ/ s a:', + 'シィ/ sh i:', + 'スゥ/ s u:', + 'スャ/ sh a', + 'スュ/ sh u', + 'スョ/ sh o', + 'セェ/ s e:', + 'ソォ/ s o:', + 'ザァ/ z a:', + 'ジィ/ j i:', + 'ズゥ/ z u:', + 'ズャ/ zy a', + 'ズュ/ zy u', + 'ズョ/ zy o', + 'ゼェ/ z e:', + 'ゾォ/ z o:', + 'タァ/ t a:', + 'チィ/ ch i:', + 'ツァ/ ts a', + 'ツィ/ ts i', + 'ツゥ/ ts u:', + 'ツャ/ ch a', + 'ツュ/ ch u', + 'ツョ/ ch o', + 'ツェ/ ts e', + 'ツォ/ ts o', + 'テェ/ t e:', + 'トォ/ t o:', + 'ダァ/ d a:', + 'ヂィ/ j i:', + 'ヅゥ/ d u:', + 'ヅャ/ zy a', + 'ヅュ/ zy u', + 'ヅョ/ zy o', + 'デェ/ d e:', + 'ドォ/ d o:', + 'ナァ/ n a:', + 'ニィ/ n i:', + 'ヌゥ/ n u:', + 'ヌャ/ ny a', + 'ヌュ/ ny u', + 'ヌョ/ ny o', + 'ネェ/ n e:', + 'ノォ/ n o:', + 'ハァ/ h a:', + 'ヒィ/ h i:', + 'フゥ/ f u:', + 'フャ/ hy a', + 'フュ/ hy u', + 'フョ/ hy o', + 'ヘェ/ h e:', + 'ホォ/ h o:', + 'バァ/ b a:', + 'ビィ/ b i:', + 'ブゥ/ b u:', + 'フャ/ hy a', + 'ブュ/ by u', + 'フョ/ hy o', + 'ベェ/ b e:', + 'ボォ/ b o:', + 'パァ/ p a:', + 'ピィ/ p i:', + 'プゥ/ p u:', + 'プャ/ py a', + 'プュ/ py u', + 'プョ/ py o', + 'ペェ/ p e:', + 'ポォ/ p o:', + 'マァ/ m a:', + 'ミィ/ m i:', + 'ムゥ/ m u:', + 'ムャ/ my a', + 'ムュ/ my u', + 'ムョ/ my o', + 'メェ/ m e:', + 'モォ/ m o:', + 'ヤァ/ y a:', + 'ユゥ/ y u:', + 'ユャ/ y a:', + 'ユュ/ y u:', + 'ユョ/ y o:', + 'ヨォ/ y o:', + 'ラァ/ r a:', + 'リィ/ r i:', + 'ルゥ/ r u:', + 'ルャ/ ry a', + 'ルュ/ ry u', + 'ルョ/ ry o', + 'レェ/ r e:', + 'ロォ/ r o:', + 'ワァ/ w a:', + 'ヲォ/ o:', + 'ディ/ d i', + 'デェ/ d e:', + 'デャ/ dy a', + 'デュ/ dy u', + 'デョ/ dy o', + 'ティ/ t i', + 'テェ/ t e:', + 'テャ/ ty a', + 'テュ/ ty u', + 'テョ/ ty o', + 'スィ/ s i', + 'ズァ/ z u a', + 'ズィ/ z i', + 
'ズゥ/ z u', + 'ズャ/ zy a', + 'ズュ/ zy u', + 'ズョ/ zy o', + 'ズェ/ z e', + 'ズォ/ z o', + 'キャ/ ky a', + 'キュ/ ky u', + 'キョ/ ky o', + 'シャ/ sh a', + 'シュ/ sh u', + 'シェ/ sh e', + 'ショ/ sh o', + 'チャ/ ch a', + 'チュ/ ch u', + 'チェ/ ch e', + 'チョ/ ch o', + 'トゥ/ t u', + 'トャ/ ty a', + 'トュ/ ty u', + 'トョ/ ty o', + 'ドァ/ d o a', + 'ドゥ/ d u', + 'ドャ/ dy a', + 'ドュ/ dy u', + 'ドョ/ dy o', + 'ドォ/ d o:', + 'ニャ/ ny a', + 'ニュ/ ny u', + 'ニョ/ ny o', + 'ヒャ/ hy a', + 'ヒュ/ hy u', + 'ヒョ/ hy o', + 'ミャ/ my a', + 'ミュ/ my u', + 'ミョ/ my o', + 'リャ/ ry a', + 'リュ/ ry u', + 'リョ/ ry o', + 'ギャ/ gy a', + 'ギュ/ gy u', + 'ギョ/ gy o', + 'ヂェ/ j e', + 'ヂャ/ j a', + 'ヂュ/ j u', + 'ヂョ/ j o', + 'ジェ/ j e', + 'ジャ/ j a', + 'ジュ/ j u', + 'ジョ/ j o', + 'ビャ/ by a', + 'ビュ/ by u', + 'ビョ/ by o', + 'ピャ/ py a', + 'ピュ/ py u', + 'ピョ/ py o', + 'ウァ/ u a', + 'ウィ/ w i', + 'ウェ/ w e', + 'ウォ/ w o', + 'ファ/ f a', + 'フィ/ f i', + 'フゥ/ f u', + 'フャ/ hy a', + 'フュ/ hy u', + 'フョ/ hy o', + 'フェ/ f e', + 'フォ/ f o', + 'ヴァ/ b a', + 'ヴィ/ b i', + 'ヴェ/ b e', + 'ヴォ/ b o', + 'ヴュ/ by u', + + # Conversion of 1 letter + 'ア/ a', + 'イ/ i', + 'ウ/ u', + 'エ/ e', + 'オ/ o', + 'カ/ k a', + 'キ/ k i', + 'ク/ k u', + 'ケ/ k e', + 'コ/ k o', + 'サ/ s a', + 'シ/ sh i', + 'ス/ s u', + 'セ/ s e', + 'ソ/ s o', + 'タ/ t a', + 'チ/ ch i', + 'ツ/ ts u', + 'テ/ t e', + 'ト/ t o', + 'ナ/ n a', + 'ニ/ n i', + 'ヌ/ n u', + 'ネ/ n e', + 'ノ/ n o', + 'ハ/ h a', + 'ヒ/ h i', + 'フ/ f u', + 'ヘ/ h e', + 'ホ/ h o', + 'マ/ m a', + 'ミ/ m i', + 'ム/ m u', + 'メ/ m e', + 'モ/ m o', + 'ラ/ r a', + 'リ/ r i', + 'ル/ r u', + 'レ/ r e', + 'ロ/ r o', + 'ガ/ g a', + 'ギ/ g i', + 'グ/ g u', + 'ゲ/ g e', + 'ゴ/ g o', + 'ザ/ z a', + 'ジ/ j i', + 'ズ/ z u', + 'ゼ/ z e', + 'ゾ/ z o', + 'ダ/ d a', + 'ヂ/ j i', + 'ヅ/ z u', + 'デ/ d e', + 'ド/ d o', + 'バ/ b a', + 'ビ/ b i', + 'ブ/ b u', + 'ベ/ b e', + 'ボ/ b o', + 'パ/ p a', + 'ピ/ p i', + 'プ/ p u', + 'ペ/ p e', + 'ポ/ p o', + 'ヤ/ y a', + 'ユ/ y u', + 'ヨ/ y o', + 'ワ/ w a', + 'ヰ/ i', + 'ヱ/ e', + 'ヲ/ o', + 'ン/ N', + 'ッ/ q', + 'ヴ/ b u', + 'ー/:', + + # Try converting broken text + 'ァ/ a', + 'ィ/ i', + 'ゥ/ u', + 'ェ/ e', + 'ォ/ o', + 'ヮ/ w a', + 'ォ/ o', + + # Symbols + '、/ ,', + '。/ .', + '!/ !', + '?/ ?', + '・/ ,' +] + +_COLON_RX = re.compile(':+') +_REJECT_RX = re.compile('[^ a-zA-Z:,.?]') + +def _makerulemap(): + l = [tuple(x.split('/')) for x in _CONVRULES] + return tuple( + {k: v for k, v in l if len(k) == i} + for i in (1, 2) + ) + +_RULEMAP1, _RULEMAP2 = _makerulemap() + +def kata2phoneme(text: str) -> str: + """Convert katakana text to phonemes. 
+ """ + text = text.strip() + res = '' + while text: + if len(text) >= 2: + x = _RULEMAP2.get(text[:2]) + if x is not None: + text = text[2:] + res += x + continue + x = _RULEMAP1.get(text[0]) + if x is not None: + text = text[1:] + res += x + continue + res += ' ' + text[0] + text = text[1:] + res = _COLON_RX.sub(':', res) + return res[1:] + +_KATAKANA = ''.join(chr(ch) for ch in range(ord('ァ'), ord('ン') + 1)) +_HIRAGANA = ''.join(chr(ch) for ch in range(ord('ぁ'), ord('ん') + 1)) +_HIRA2KATATRANS = str.maketrans(_HIRAGANA, _KATAKANA) + +def hira2kata(text: str) -> str: + text = text.translate(_HIRA2KATATRANS) + return text.replace('う゛', 'ヴ') + +_SYMBOL_TOKENS = set(list('・、。?!')) +_NO_YOMI_TOKENS = set(list('「」『』―()[][] …')) +_TAGGER = MeCab.Tagger() + +def text2kata(text: str) -> str: + parsed = _TAGGER.parse(text) + res = [] + for line in parsed.split('\n'): + if line == 'EOS': + break + parts = line.split('\t') + + word, yomi = parts[0], parts[1] + if yomi: + res.append(yomi) + else: + if word in _SYMBOL_TOKENS: + res.append(word) + elif word == 'っ' or word == 'ッ': + res.append('ッ') + elif word in _NO_YOMI_TOKENS: + pass + else: + res.append(word) + return hira2kata(''.join(res)) + +def japanese_text2phone(text: str) -> str: + """Convert Japanese text to phonemes. + """ + res = text2kata(text) + res = kata2phoneme(res) + return res.replace(' ', '') \ No newline at end of file diff --git a/TTS/tts/utils/japanese/text_test.py b/TTS/tts/utils/japanese/text_test.py new file mode 100644 index 00000000..7a04925a --- /dev/null +++ b/TTS/tts/utils/japanese/text_test.py @@ -0,0 +1,22 @@ +import unittest +from . import japanese_text2phone + +_TEST_CASES = ''' +どちらに行きますか?/dochiraniikimasuka? +今日は温泉に、行きます。/kyo:waoNseNni,ikimasu. +「A」から「Z」までです。/AkaraZmadedesu. +そうですね!/so:desune! +クジラは哺乳類です。/kujirawahonyu:ruidesu. +ヴィディオを見ます。/bidioomimasu. +ky o: w a o N s e N n i , i k i m a s u ./kyo:waoNseNni,ikimasu. +''' + +class TestText(unittest.TestCase): + + def test_text2phone(self): + for line in _TEST_CASES.strip().split('\n'): + text, phone = line.split('/') + self.assertEqual(japanese_text2phone(text), phone) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py index 9367e6e2..9b63e7f1 100644 --- a/TTS/tts/utils/text/__init__.py +++ b/TTS/tts/utils/text/__init__.py @@ -39,6 +39,11 @@ def text2phone(text, language): if language == "zh-CN": ph = chinese_text_to_phonemes(text) return ph + elif language == "ja-jp": + from TTS.tts.utils.japanese import japanese_text2phone + ph = japanese_text2phone(text) + return ph + raise ValueError(f" [!] 
Language {language} is not supported for phonemization.") diff --git a/requirements.txt b/requirements.txt index c6ce7672..7f45f9e0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,3 +19,5 @@ numba==0.52 umap-learn==0.4.6 unidecode==0.4.20 coqpit +mecab-python3 +unidic-lite From f921a05bdb6ce6fc950c290b8a3aec613a7f70fe Mon Sep 17 00:00:00 2001 From: Katsuya Iida Date: Wed, 26 May 2021 19:02:16 +0900 Subject: [PATCH 04/36] Fixed lint errors --- TTS/tts/utils/japanese/__init__.py | 2 +- TTS/tts/utils/japanese/text.py | 5 ++--- TTS/tts/utils/japanese/text_test.py | 2 +- TTS/tts/utils/text/__init__.py | 3 ++- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/TTS/tts/utils/japanese/__init__.py b/TTS/tts/utils/japanese/__init__.py index 0ce7a99d..30d963e8 100644 --- a/TTS/tts/utils/japanese/__init__.py +++ b/TTS/tts/utils/japanese/__init__.py @@ -1 +1 @@ -from .text import japanese_text2phone \ No newline at end of file +from .text import japanese_text2phone diff --git a/TTS/tts/utils/japanese/text.py b/TTS/tts/utils/japanese/text.py index 4c8936ac..3a705352 100644 --- a/TTS/tts/utils/japanese/text.py +++ b/TTS/tts/utils/japanese/text.py @@ -3,7 +3,6 @@ import re import MeCab -from typing import List, Tuple _CONVRULES = [ # Conversion of 2 letters @@ -364,7 +363,7 @@ def text2kata(text: str) -> str: else: if word in _SYMBOL_TOKENS: res.append(word) - elif word == 'っ' or word == 'ッ': + elif word in ('っ', 'ッ'): res.append('ッ') elif word in _NO_YOMI_TOKENS: pass @@ -377,4 +376,4 @@ def japanese_text2phone(text: str) -> str: """ res = text2kata(text) res = kata2phoneme(res) - return res.replace(' ', '') \ No newline at end of file + return res.replace(' ', '') diff --git a/TTS/tts/utils/japanese/text_test.py b/TTS/tts/utils/japanese/text_test.py index 7a04925a..d3ade826 100644 --- a/TTS/tts/utils/japanese/text_test.py +++ b/TTS/tts/utils/japanese/text_test.py @@ -19,4 +19,4 @@ class TestText(unittest.TestCase): self.assertEqual(japanese_text2phone(text), phone) if __name__ == '__main__': - unittest.main() \ No newline at end of file + unittest.main() diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py index 9b63e7f1..d7423102 100644 --- a/TTS/tts/utils/text/__init__.py +++ b/TTS/tts/utils/text/__init__.py @@ -39,7 +39,8 @@ def text2phone(text, language): if language == "zh-CN": ph = chinese_text_to_phonemes(text) return ph - elif language == "ja-jp": + + if language == "ja-jp": from TTS.tts.utils.japanese import japanese_text2phone ph = japanese_text2phone(text) return ph From c6f22aaa67f98aebbf8900c9244b4814e80bac86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 27 May 2021 11:46:33 +0200 Subject: [PATCH 05/36] fix #509 --- TTS/tts/configs/shared_configs.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/TTS/tts/configs/shared_configs.py b/TTS/tts/configs/shared_configs.py index 6c710ca2..4690e76f 100644 --- a/TTS/tts/configs/shared_configs.py +++ b/TTS/tts/configs/shared_configs.py @@ -80,12 +80,12 @@ class CharactersConfig(Coqpit): ): """Check config fields""" c = asdict(self) - check_argument("pad", c, "characters", restricted=True) - check_argument("eos", c, "characters", restricted=True) - check_argument("bos", c, "characters", restricted=True) - check_argument("characters", c, "characters", restricted=True) + check_argument("pad", c, prerequest="characters", restricted=True) + check_argument("eos", c, prerequest="characters", restricted=True) + check_argument("bos", c, 
prerequest="characters", restricted=True) + check_argument("characters", c, prerequest="characters", restricted=True) check_argument("phonemes", c, restricted=True) - check_argument("punctuations", c, "characters", restricted=True) + check_argument("punctuations", c, prerequest="characters", restricted=True) @dataclass From e08c58db3b23a832b68d5aa605ff5d0a308e61cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 27 May 2021 13:11:01 +0200 Subject: [PATCH 06/36] bump up version to v0.14.1 --- TTS/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/_version.py b/TTS/_version.py index 311f216e..f4956698 100644 --- a/TTS/_version.py +++ b/TTS/_version.py @@ -1 +1 @@ -__version__ = "0.0.14" +__version__ = "0.0.14.1" From 925c08cf95386c936e7e7f979f6b536b2440ec5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 21 May 2021 00:30:39 +0200 Subject: [PATCH 07/36] replace unidecode with anyascii --- TTS/tts/utils/text/cleaners.py | 16 ++-------------- requirements.txt | 2 +- tests/tts_tests/test_glow_tts_train.py | 2 +- tests/tts_tests/test_speedy_speech_train.py | 2 +- tests/vocoder_tests/test_melgan_train.py | 1 + 5 files changed, 6 insertions(+), 17 deletions(-) diff --git a/TTS/tts/utils/text/cleaners.py b/TTS/tts/utils/text/cleaners.py index 2eddcdb8..3d2caa97 100644 --- a/TTS/tts/utils/text/cleaners.py +++ b/TTS/tts/utils/text/cleaners.py @@ -1,18 +1,6 @@ -""" -Cleaners are transformations that run over the input text at both training and eval time. - -Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" -hyperparameter. Some cleaners are English-specific. You'll typically want to use: - 1. "english_cleaners" for English text - 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using - the Unidecode library (https://pypi.python.org/pypi/Unidecode) - 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update - the symbols in symbols.py to match your data). 
-""" - import re -from unidecode import unidecode +from anyascii import anyascii from TTS.tts.utils.text.chinese_mandarin.numbers import replace_numbers_to_characters_in_text @@ -47,7 +35,7 @@ def collapse_whitespace(text): def convert_to_ascii(text): - return unidecode(text) + return anyascii(text) def remove_aux_symbols(text): diff --git a/requirements.txt b/requirements.txt index c6ce7672..b376eb1b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,5 +17,5 @@ torch>=1.7 tqdm numba==0.52 umap-learn==0.4.6 -unidecode==0.4.20 +anyascii coqpit diff --git a/tests/tts_tests/test_glow_tts_train.py b/tests/tts_tests/test_glow_tts_train.py index 00c7e852..2e675d13 100644 --- a/tests/tts_tests/test_glow_tts_train.py +++ b/tests/tts_tests/test_glow_tts_train.py @@ -17,7 +17,7 @@ config = GlowTTSConfig( text_cleaner="english_cleaners", use_phonemes=True, phoneme_language="zh-CN", - phoneme_cache_path='tests/data/ljspeech/phoneme_cache/', + phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", run_eval=True, test_delay_epochs=-1, epochs=1, diff --git a/tests/tts_tests/test_speedy_speech_train.py b/tests/tts_tests/test_speedy_speech_train.py index cc2845c2..3f508117 100644 --- a/tests/tts_tests/test_speedy_speech_train.py +++ b/tests/tts_tests/test_speedy_speech_train.py @@ -17,7 +17,7 @@ config = SpeedySpeechConfig( text_cleaner="english_cleaners", use_phonemes=True, phoneme_language="zh-CN", - phoneme_cache_path='tests/data/ljspeech/phoneme_cache/', + phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", run_eval=True, test_delay_epochs=-1, epochs=1, diff --git a/tests/vocoder_tests/test_melgan_train.py b/tests/vocoder_tests/test_melgan_train.py index de48ca24..3ff65b5a 100644 --- a/tests/vocoder_tests/test_melgan_train.py +++ b/tests/vocoder_tests/test_melgan_train.py @@ -19,6 +19,7 @@ config = MelganConfig( seq_len=2048, eval_split_size=1, print_step=1, + discriminator_model_params={"base_channels": 16, "max_channels": 256, "downsample_factors": [4, 4, 4]}, print_eval=True, data_path="tests/data/ljspeech", output_path=output_path, From c4987e9d4e503628df5661c0945b2047ea046b1f Mon Sep 17 00:00:00 2001 From: Katsuya Iida Date: Fri, 28 May 2021 00:22:57 +0900 Subject: [PATCH 08/36] Move import at the head of the file. 
--- TTS/tts/utils/text/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py index d7423102..f6b46783 100644 --- a/TTS/tts/utils/text/__init__.py +++ b/TTS/tts/utils/text/__init__.py @@ -6,6 +6,7 @@ from packaging import version from TTS.tts.utils.text import cleaners from TTS.tts.utils.text.chinese_mandarin.phonemizer import chinese_text_to_phonemes +from TTS.tts.utils.japanese import japanese_text2phone from TTS.tts.utils.text.symbols import _bos, _eos, _punctuations, make_symbols, phonemes, symbols # pylint: disable=unnecessary-comprehension @@ -41,7 +42,6 @@ def text2phone(text, language): return ph if language == "ja-jp": - from TTS.tts.utils.japanese import japanese_text2phone ph = japanese_text2phone(text) return ph From d0c9c1ca5c28d37845ea7a19d399851c5bfd5429 Mon Sep 17 00:00:00 2001 From: Katsuya Iida Date: Sat, 29 May 2021 09:21:47 +0900 Subject: [PATCH 09/36] Move TTS/tts/utils/japanese --- TTS/tts/utils/japanese/__init__.py | 1 - TTS/tts/utils/text/__init__.py | 4 ++-- TTS/tts/utils/text/japanese/__init__.py | 0 .../utils/{japanese/text.py => text/japanese/phonemizer.py} | 2 +- .../text_test.py => text/japanese/phonemizer_test.py} | 6 +++--- 5 files changed, 6 insertions(+), 7 deletions(-) delete mode 100644 TTS/tts/utils/japanese/__init__.py create mode 100644 TTS/tts/utils/text/japanese/__init__.py rename TTS/tts/utils/{japanese/text.py => text/japanese/phonemizer.py} (99%) rename TTS/tts/utils/{japanese/text_test.py => text/japanese/phonemizer_test.py} (77%) diff --git a/TTS/tts/utils/japanese/__init__.py b/TTS/tts/utils/japanese/__init__.py deleted file mode 100644 index 30d963e8..00000000 --- a/TTS/tts/utils/japanese/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .text import japanese_text2phone diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py index f6b46783..f9f44167 100644 --- a/TTS/tts/utils/text/__init__.py +++ b/TTS/tts/utils/text/__init__.py @@ -6,7 +6,7 @@ from packaging import version from TTS.tts.utils.text import cleaners from TTS.tts.utils.text.chinese_mandarin.phonemizer import chinese_text_to_phonemes -from TTS.tts.utils.japanese import japanese_text2phone +from TTS.tts.utils.text.japanese.phonemizer import japanese_text_to_phonemes from TTS.tts.utils.text.symbols import _bos, _eos, _punctuations, make_symbols, phonemes, symbols # pylint: disable=unnecessary-comprehension @@ -42,7 +42,7 @@ def text2phone(text, language): return ph if language == "ja-jp": - ph = japanese_text2phone(text) + ph = japanese_text_to_phonemes(text) return ph raise ValueError(f" [!] Language {language} is not supported for phonemization.") diff --git a/TTS/tts/utils/text/japanese/__init__.py b/TTS/tts/utils/text/japanese/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/TTS/tts/utils/japanese/text.py b/TTS/tts/utils/text/japanese/phonemizer.py similarity index 99% rename from TTS/tts/utils/japanese/text.py rename to TTS/tts/utils/text/japanese/phonemizer.py index 3a705352..f09d5b05 100644 --- a/TTS/tts/utils/japanese/text.py +++ b/TTS/tts/utils/text/japanese/phonemizer.py @@ -371,7 +371,7 @@ def text2kata(text: str) -> str: res.append(word) return hira2kata(''.join(res)) -def japanese_text2phone(text: str) -> str: +def japanese_text_to_phonemes(text: str) -> str: """Convert Japanese text to phonemes. 
""" res = text2kata(text) diff --git a/TTS/tts/utils/japanese/text_test.py b/TTS/tts/utils/text/japanese/phonemizer_test.py similarity index 77% rename from TTS/tts/utils/japanese/text_test.py rename to TTS/tts/utils/text/japanese/phonemizer_test.py index d3ade826..f07c0901 100644 --- a/TTS/tts/utils/japanese/text_test.py +++ b/TTS/tts/utils/text/japanese/phonemizer_test.py @@ -1,5 +1,5 @@ import unittest -from . import japanese_text2phone +from .phonemizer import japanese_text_to_phonemes _TEST_CASES = ''' どちらに行きますか?/dochiraniikimasuka? @@ -13,10 +13,10 @@ ky o: w a o N s e N n i , i k i m a s u ./kyo:waoNseNni,ikimasu. class TestText(unittest.TestCase): - def test_text2phone(self): + def test_japanese_text_to_phonemes(self): for line in _TEST_CASES.strip().split('\n'): text, phone = line.split('/') - self.assertEqual(japanese_text2phone(text), phone) + self.assertEqual(japanese_text_to_phonemes(text), phone) if __name__ == '__main__': unittest.main() From 29d61741ecdc9c377cf3ff3bda622233304e7127 Mon Sep 17 00:00:00 2001 From: Katsuya Iida Date: Sat, 29 May 2021 19:03:23 +0900 Subject: [PATCH 10/36] Copied recipe --- recipes/kokoro/tacotron2-DDC/run.sh | 22 +++++ .../kokoro/tacotron2-DDC/tacotron2-DDC.json | 91 +++++++++++++++++++ 2 files changed, 113 insertions(+) create mode 100644 recipes/kokoro/tacotron2-DDC/run.sh create mode 100644 recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json diff --git a/recipes/kokoro/tacotron2-DDC/run.sh b/recipes/kokoro/tacotron2-DDC/run.sh new file mode 100644 index 00000000..eaa05b60 --- /dev/null +++ b/recipes/kokoro/tacotron2-DDC/run.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# take the scripts's parent's directory to prefix all the output paths. +RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +echo $RUN_DIR +# download LJSpeech dataset +wget http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2 +# extract +tar -xjf LJSpeech-1.1.tar.bz2 +# create train-val splits +shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv +head -n 12000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv +tail -n 1100 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv +mv LJSpeech-1.1 $RUN_DIR/ +rm LJSpeech-1.1.tar.bz2 +# compute dataset mean and variance for normalization +python TTS/bin/compute_statistics.py $RUN_DIR/tacotron2-DDC.json $RUN_DIR/scale_stats.npy --data_path $RUN_DIR/LJSpeech-1.1/wavs/ +# training .... 
+# change the GPU id if needed +CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tacotron.py --config_path $RUN_DIR/tacotron2-DDC.json \ + --coqpit.output_path $RUN_DIR \ + --coqpit.datasets.0.path $RUN_DIR/LJSpeech-1.1/ \ + --coqpit.audio.stats_path $RUN_DIR/scale_stats.npy \ \ No newline at end of file diff --git a/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json b/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json new file mode 100644 index 00000000..9cdbbd3b --- /dev/null +++ b/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json @@ -0,0 +1,91 @@ +{ + "datasets": [ + { + "name": "ljspeech", + "path": "DEFINE THIS", + "meta_file_train": "metadata.csv", + "meta_file_val": null + } + ], + "audio": { + "fft_size": 1024, + "win_length": 1024, + "hop_length": 256, + "frame_length_ms": null, + "frame_shift_ms": null, + "sample_rate": 22050, + "preemphasis": 0.0, + "ref_level_db": 20, + "do_trim_silence": true, + "trim_db": 60, + "power": 1.5, + "griffin_lim_iters": 60, + "num_mels": 80, + "mel_fmin": 50.0, + "mel_fmax": 7600.0, + "spec_gain": 1, + "signal_norm": true, + "min_level_db": -100, + "symmetric_norm": true, + "max_norm": 4.0, + "clip_norm": true, + "stats_path": "scale_stats.npy" + }, + "gst":{ + "gst_embedding_dim": 256, + "gst_num_heads": 4, + "gst_num_style_tokens": 10 + }, + "model": "Tacotron2", + "run_name": "ljspeech-ddc", + "run_description": "tacotron2 with double decoder consistency.", + "batch_size": 64, + "eval_batch_size": 16, + "mixed_precision": true, + "loss_masking": true, + "decoder_loss_alpha": 0.25, + "postnet_loss_alpha": 0.25, + "postnet_diff_spec_alpha": 0.25, + "decoder_diff_spec_alpha": 0.25, + "decoder_ssim_alpha": 0.25, + "postnet_ssim_alpha": 0.25, + "ga_alpha": 5.0, + "stopnet_pos_weight": 15.0, + "run_eval": true, + "test_delay_epochs": 10, + "test_sentences_file": null, + "noam_schedule": true, + "grad_clip": 0.05, + "epochs": 1000, + "lr": 0.001, + "wd": 1e-06, + "warmup_steps": 4000, + "memory_size": -1, + "prenet_type": "original", + "prenet_dropout": true, + "attention_type": "original", + "location_attn": true, + "double_decoder_consistency": true, + "ddc_r": 6, + "attention_norm": "sigmoid", + "r": 6, + "gradual_training": [[0, 6, 64], [10000, 4, 32], [50000, 3, 32], [100000, 2, 32]], + "stopnet": true, + "separate_stopnet": true, + "print_step": 25, + "tb_plot_step": 100, + "print_eval": false, + "save_step": 10000, + "checkpoint": true, + "text_cleaner": "phoneme_cleaners", + "num_loader_workers": 4, + "num_val_loader_workers": 4, + "batch_group_size": 4, + "min_seq_len": 6, + "max_seq_len": 180, + "compute_input_seq_cache": true, + "output_path": "DEFINE THIS", + "phoneme_cache_path": "DEFINE THIS", + "use_phonemes": false, + "phoneme_language": "en-us" +} \ No newline at end of file From c4a5a73f186c40dc80c043edf4300198781769d6 Mon Sep 17 00:00:00 2001 From: Katsuya Iida Date: Sat, 29 May 2021 19:17:27 +0900 Subject: [PATCH 11/36] update Kokoro config --- TTS/tts/configs/kokoro_tacotron2.json | 173 ------------------ .../kokoro/tacotron2-DDC/tacotron2-DDC.json | 82 ++++++--- 2 files changed, 58 insertions(+), 197 deletions(-) delete mode 100644 TTS/tts/configs/kokoro_tacotron2.json diff --git a/TTS/tts/configs/kokoro_tacotron2.json b/TTS/tts/configs/kokoro_tacotron2.json deleted file mode 100644 index f5d41194..00000000 --- a/TTS/tts/configs/kokoro_tacotron2.json +++ /dev/null @@ -1,173 +0,0 @@ -{ - "model": "Tacotron2", - "run_name": "kokoro-ddc", - "run_description": "tacotron2 with DDC and differential spectral loss.", - - // AUDIO PARAMETERS - 
"audio":{ - // stft parameters - "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. - "win_length": 1024, // stft window length in ms. - "hop_length": 256, // stft window hop-lengh in ms. - "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. - "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. - - // Audio processing parameters - "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. - "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. - "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. - - // Silence trimming - "do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (true), TWEB (false), Nancy (true) - "trim_db": 60, // threshold for timming silence. Set this according to your dataset. - - // Griffin-Lim - "power": 1.5, // value to sharpen wav signals after GL algorithm. - "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. - - // MelSpectrogram parameters - "num_mels": 80, // size of the mel spec frame. - "mel_fmin": 50.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! - "mel_fmax": 7600.0, // maximum freq level for mel-spec. Tune for dataset!! - "spec_gain": 1, - - // Normalization parameters - "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. - "min_level_db": -100, // lower bound for normalization - "symmetric_norm": true, // move normalization to range [-1, 1] - "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] - "clip_norm": true, // clip normalized values into the range. - "stats_path": "./scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored - }, - - // VOCABULARY PARAMETERS - // if custom character set is not defined, - // default set in symbols.py is used - "characters":{ - "pad": "_", - "eos": "~", - "bos": "^", - "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ", - "punctuations": "!'(),-.:;? ", - "phonemes": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" - }, - - // DISTRIBUTED TRAINING - "distributed":{ - "backend": "nccl", - "url": "tcp:\/\/localhost:54321" - }, - - "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. - - // TRAINING - "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. - "eval_batch_size":16, - "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. - "gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed. - "mixed_precision": true, // level of optimization with NVIDIA's apex feature for automatic mixed FP16/FP32 precision (AMP), NOTE: currently only O1 is supported, and use "O1" to activate. 
- - // LOSS SETTINGS - "loss_masking": true, // enable / disable loss masking against the sequence padding. - "decoder_loss_alpha": 0.5, // original decoder loss weight. If > 0, it is enabled - "postnet_loss_alpha": 0.25, // original postnet loss weight. If > 0, it is enabled - "postnet_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled - "decoder_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled - "decoder_ssim_alpha": 0.5, // decoder ssim loss weight. If > 0, it is enabled - "postnet_ssim_alpha": 0.25, // postnet ssim loss weight. If > 0, it is enabled - "ga_alpha": 5.0, // weight for guided attention loss. If > 0, guided attention is enabled. - "stopnet_pos_weight": 15.0, // pos class weight for stopnet loss since there are way more negative samples than positive samples. - - - // VALIDATION - "run_eval": true, - "test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time. - "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences. - - // OPTIMIZER - "noam_schedule": false, // use noam warmup and lr schedule. - "grad_clip": 1.0, // upper limit for gradients for clipping. - "epochs": 1000, // total number of epochs to train. - "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. - "wd": 0.000001, // Weight decay weight. - "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" - "seq_len_norm": false, // Normalize eash sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths. - - // TACOTRON PRENET - "memory_size": -1, // ONLY TACOTRON - size of the memory queue used fro storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame. - "prenet_type": "original", // "original" or "bn". - "prenet_dropout": true, // enable/disable dropout at prenet. - - // TACOTRON ATTENTION - "attention_type": "original", // 'original' , 'graves', 'dynamic_convolution' - "attention_heads": 4, // number of attention heads (only for 'graves') - "attention_norm": "sigmoid", // softmax or sigmoid. - "windowing": false, // Enables attention windowing. Used only in eval mode. - "use_forward_attn": false, // if it uses forward attention. In general, it aligns faster. - "forward_attn_mask": false, // Additional masking forcing monotonicity only in eval mode. - "transition_agent": false, // enable/disable transition agent of forward attention. - "location_attn": true, // enable_disable location sensitive attention. It is enabled for TACOTRON by default. - "bidirectional_decoder": false, // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset. - "double_decoder_consistency": true, // use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/ - "ddc_r": 7, // reduction rate for coarse decoder. - - // STOPNET - "stopnet": true, // Train stopnet predicting the end of synthesis. - "separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER. - - // TENSORBOARD and LOGGING - "print_step": 25, // Number of steps to log training on console. 
- "tb_plot_step": 100, // Number of steps to plot TB training figures. - "print_eval": false, // If True, it prints intermediate loss values in evalulation. - "save_step": 10000, // Number of training steps expected to save traninpg stats and checkpoints. - "checkpoint": true, // If true, it saves checkpoints per "save_step" - "keep_all_best": false, // If true, keeps all best_models after keep_after steps - "keep_after": 10000, // Global step after which to keep best models if keep_all_best is true - "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. - - // DATA LOADING - "text_cleaner": "basic_cleaners", - "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. - "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 4, // number of evaluation data loader processes. - "batch_group_size": 4, //Number of batches to shuffle after bucketing. - "min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training - "max_seq_len": 153, // DATASET-RELATED: maximum text length - "compute_input_seq_cache": false, // if true, text sequences are computed before starting training. If phonemes are enabled, they are also computed at this stage. - "use_noise_augment": true, - - // PATHS - "output_path": "./Models/Kokoro/", - - // PHONEMES - "phoneme_cache_path": "./phoneme_cache/", // phoneme computation is slow, therefore, it caches results in the given folder. - "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation. - "phoneme_language": "ja-jp", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages - - // MULTI-SPEAKER and GST - "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. - "use_gst": false, // use global style tokens - "use_external_speaker_embedding_file": false, // if true, forces the model to use external embedding per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 - "external_speaker_embedding_file": "../../speakers-vctk-en.json", // if not null and use_external_speaker_embedding_file is true, it is used to load a specific embedding file and thus uses these embeddings instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 - "gst": { // gst parameter if gst is enabled - "gst_style_input": null, // Condition the style input either on a - // -> wave file [path to wave] or - // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15} - // with the dictionary being len(dict) <= len(gst_style_tokens). - "gst_embedding_dim": 512, - "gst_num_heads": 4, - "gst_style_tokens": 10, - "gst_use_speaker_embedding": false - }, - - // DATASETS - "datasets": // List of datasets. They all merged and they get different speaker_ids. 
- [ - { - "name": "kokoro", - "path": "./kokoro-speech-v1_1-small/", - "meta_file_train": "metadata.csv", // for vtck if list, ignore speakers id in list for train, its useful for test cloning with new speakers - "meta_file_val": null - } - ] -} diff --git a/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json b/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json index 9cdbbd3b..1aaec547 100644 --- a/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json +++ b/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json @@ -1,8 +1,8 @@ { "datasets": [ { - "name": "ljspeech", - "path": "DEFINE THIS", + "name": "kokoro", + "path": "./kokoro-speech-v1_1-tiny/", "meta_file_train": "metadata.csv", "meta_file_val": null } @@ -32,44 +32,61 @@ "stats_path": "scale_stats.npy" }, "gst":{ - "gst_embedding_dim": 256, + "gst_style_input": null, + + + + "gst_embedding_dim": 512, "gst_num_heads": 4, - "gst_num_style_tokens": 10 - }, + "gst_style_tokens": 10, + "gst_use_speaker_embedding": false + }, "model": "Tacotron2", - "run_name": "ljspeech-ddc", - "run_description": "tacotron2 with double decoder consistency.", - "batch_size": 64, + "run_name": "kokoro-ddc", + "run_description": "tacotron2 with DDC and differential spectral loss.", + "batch_size": 32, "eval_batch_size": 16, "mixed_precision": true, + "distributed": { + "backend": "nccl", + "url": "tcp:\/\/localhost:54321" + }, + "reinit_layers": [], "loss_masking": true, - "decoder_loss_alpha": 0.25, + "decoder_loss_alpha": 0.5, "postnet_loss_alpha": 0.25, "postnet_diff_spec_alpha": 0.25, "decoder_diff_spec_alpha": 0.25, - "decoder_ssim_alpha": 0.25, + "decoder_ssim_alpha": 0.5, "postnet_ssim_alpha": 0.25, "ga_alpha": 5.0, "stopnet_pos_weight": 15.0, "run_eval": true, "test_delay_epochs": 10, "test_sentences_file": null, - "noam_schedule": true, - "grad_clip": 0.05, + "noam_schedule": false, + "grad_clip": 1.0, "epochs": 1000, - "lr": 0.001, - "wd": 1e-06, + "lr": 0.0001, + "wd": 0.000001, "warmup_steps": 4000, + "seq_len_norm": false, "memory_size": -1, "prenet_type": "original", "prenet_dropout": true, "attention_type": "original", + "windowing": false, + "use_forward_attn": false, + "forward_attn_mask": false, + "transition_agent": false, "location_attn": true, + "bidirectional_decoder": false, "double_decoder_consistency": true, - "ddc_r": 6, + "ddc_r": 7, + "attention_heads": 4, "attention_norm": "sigmoid", - "r": 6, - "gradual_training": [[0, 6, 64], [10000, 4, 32], [50000, 3, 32], [100000, 2, 32]], + "r": 7, + "gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], "stopnet": true, "separate_stopnet": true, "print_step": 25, @@ -77,15 +94,32 @@ "print_eval": false, "save_step": 10000, "checkpoint": true, - "text_cleaner": "phoneme_cleaners", + "keep_all_best": false, + "keep_after": 10000, + "tb_model_param_stats": false, + "text_cleaner": "basic_cleaners", + "enable_eos_bos_chars": false, "num_loader_workers": 4, "num_val_loader_workers": 4, "batch_group_size": 4, "min_seq_len": 6, - "max_seq_len": 180, - "compute_input_seq_cache": true, - "output_path": "DEFINE THIS", - "phoneme_cache_path": "DEFINE THIS", - "use_phonemes": false, - "phoneme_language": "en-us" + "max_seq_len": 153, + "compute_input_seq_cache": false, + "use_noise_augment": true, + "output_path": "./Models/Kokoro/", + "phoneme_cache_path": "./phoneme_cache/", + "use_phonemes": true, + "phoneme_language": "ja-jp", + "characters": { + "pad": "_", + "eos": "~", + "bos": "^", + "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? 
", + "punctuations": "!'(),-.:;? ", + "phonemes": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" + }, + "use_speaker_embedding": false, + "use_gst": false, + "use_external_speaker_embedding_file": false, + "external_speaker_embedding_file": "../../speakers-vctk-en.json" } \ No newline at end of file From 88f3255962073d84d1c7d559b956a0330a6fd11d Mon Sep 17 00:00:00 2001 From: Katsuya Iida Date: Sat, 29 May 2021 19:39:51 +0900 Subject: [PATCH 12/36] Update Kokoro recipe --- recipes/kokoro/tacotron2-DDC/run.sh | 27 ++++++++++--------- .../kokoro/tacotron2-DDC/tacotron2-DDC.json | 6 ++--- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/recipes/kokoro/tacotron2-DDC/run.sh b/recipes/kokoro/tacotron2-DDC/run.sh index eaa05b60..cd2aaff5 100644 --- a/recipes/kokoro/tacotron2-DDC/run.sh +++ b/recipes/kokoro/tacotron2-DDC/run.sh @@ -1,22 +1,23 @@ #!/bin/bash # take the scripts's parent's directory to prefix all the output paths. RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +CORPUS=kokoro-speech-v1_1-tiny echo $RUN_DIR -# download LJSpeech dataset -wget http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2 -# extract -tar -xjf LJSpeech-1.1.tar.bz2 +if [ \! -d $RUN_DIR/$CORPUS ] ; then + echo "$RUN_DIR/$CORPUS doesn't exist." + echo "Follow the instruction of https://github.com/kaiidams/Kokoro-Speech-Dataset to make the corpus." + exit 1 +fi # create train-val splits -shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv -head -n 12000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv -tail -n 1100 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv -mv LJSpeech-1.1 $RUN_DIR/ -rm LJSpeech-1.1.tar.bz2 +shuf $RUN_DIR/$CORPUS/metadata.csv > $RUN_DIR/$CORPUS/metadata_shuf.csv +head -n 8000 $RUN_DIR/$CORPUS/metadata_shuf.csv > $RUN_DIR/$CORPUS/metadata_train.csv +tail -n 812 $RUN_DIR/$CORPUS/metadata_shuf.csv > $RUN_DIR/$CORPUS/metadata_val.csv # compute dataset mean and variance for normalization -python TTS/bin/compute_statistics.py $RUN_DIR/tacotron2-DDC.json $RUN_DIR/scale_stats.npy --data_path $RUN_DIR/LJSpeech-1.1/wavs/ +python TTS/bin/compute_statistics.py $RUN_DIR/tacotron2-DDC.json $RUN_DIR/scale_stats.npy --data_path $RUN_DIR/$CORPUS/wavs/ # training .... 
# change the GPU id if needed CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tacotron.py --config_path $RUN_DIR/tacotron2-DDC.json \ - --coqpit.output_path $RUN_DIR \ - --coqpit.datasets.0.path $RUN_DIR/LJSpeech-1.1/ \ - --coqpit.audio.stats_path $RUN_DIR/scale_stats.npy \ \ No newline at end of file + --coqpit.output_path $RUN_DIR \ + --coqpit.datasets.0.path $RUN_DIR/$CORPUS \ + --coqpit.audio.stats_path $RUN_DIR/scale_stats.npy \ + --coqpit.phoneme_cache_path $RUN_DIR/phoneme_cache \ \ No newline at end of file diff --git a/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json b/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json index 1aaec547..b3630055 100644 --- a/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json +++ b/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json @@ -2,7 +2,7 @@ "datasets": [ { "name": "kokoro", - "path": "./kokoro-speech-v1_1-tiny/", + "path": "DEFINE THIS", "meta_file_train": "metadata.csv", "meta_file_val": null } @@ -106,8 +106,8 @@ "max_seq_len": 153, "compute_input_seq_cache": false, "use_noise_augment": true, - "output_path": "./Models/Kokoro/", - "phoneme_cache_path": "./phoneme_cache/", + "output_path": "DEFINE THIS", + "phoneme_cache_path": "DEFINE THIS", "use_phonemes": true, "phoneme_language": "ja-jp", "characters": { From 2091e808c82647787b571f1b17e80378d203e830 Mon Sep 17 00:00:00 2001 From: Katsuya Iida Date: Sat, 29 May 2021 19:41:00 +0900 Subject: [PATCH 13/36] Fix path --- recipes/kokoro/tacotron2-DDC/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes/kokoro/tacotron2-DDC/run.sh b/recipes/kokoro/tacotron2-DDC/run.sh index cd2aaff5..86fda642 100644 --- a/recipes/kokoro/tacotron2-DDC/run.sh +++ b/recipes/kokoro/tacotron2-DDC/run.sh @@ -1,7 +1,7 @@ #!/bin/bash # take the scripts's parent's directory to prefix all the output paths. RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" -CORPUS=kokoro-speech-v1_1-tiny +CORPUS=kokoro-speech-v1_1-small echo $RUN_DIR if [ \! -d $RUN_DIR/$CORPUS ] ; then echo "$RUN_DIR/$CORPUS doesn't exist." From d9f1268f99f3a91078f9f4806e9283b615c51c03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 31 May 2021 15:47:07 +0200 Subject: [PATCH 14/36] init tb_logger None for rank > 0 processes --- TTS/utils/arguments.py | 1 + 1 file changed, 1 insertion(+) diff --git a/TTS/utils/arguments.py b/TTS/utils/arguments.py index 1b5a424b..5e6acd1d 100644 --- a/TTS/utils/arguments.py +++ b/TTS/utils/arguments.py @@ -152,6 +152,7 @@ def process_args(args): experiment_path = create_experiment_folder(config.output_path, config.run_name, args.debug) audio_path = os.path.join(experiment_path, "test_audios") # setup rank 0 process in distributed training + tb_logger = None if args.rank == 0: os.makedirs(audio_path, exist_ok=True) new_fields = {} From 975531b3f27510d3668218086923fde9f2a8562d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 31 May 2021 16:11:22 +0200 Subject: [PATCH 15/36] update `pylintrc` for torch and numpy functions --- .pylintrc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pylintrc b/.pylintrc index 0bc0be4b..1b3d64c2 100644 --- a/.pylintrc +++ b/.pylintrc @@ -253,7 +253,7 @@ contextmanager-decorators=contextlib.contextmanager # List of members which are set dynamically and missed by pylint inference # system, and so shouldn't trigger E1101 when accessed. Python regular # expressions are accepted. 
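# numpy and torch build many members at runtime (C extensions), so pylint's
# inference misses them and would raise false E1101 errors without this list.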
-generated-members= +generated-members=numpy.*,torch.* # Tells whether missing members accessed in mixin class should be ignored. A # mixin class is detected if its name ends with "mixin" (case insensitive). From bec85ac58d21536e8bbd395eac5f7b70a1618206 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 31 May 2021 16:37:15 +0200 Subject: [PATCH 16/36] make style --- TTS/bin/compute_embeddings.py | 4 +- TTS/bin/train_encoder.py | 9 +-- TTS/speaker_encoder/dataset.py | 49 +++++++------ TTS/speaker_encoder/losses.py | 10 ++- TTS/speaker_encoder/models/resnet.py | 37 ++++++---- TTS/speaker_encoder/speaker_encoder_config.py | 5 +- TTS/speaker_encoder/utils/generic_utils.py | 69 ++++++++++++------- tests/test_speaker_encoder.py | 5 ++ tests/test_speaker_encoder_train.py | 2 +- 9 files changed, 115 insertions(+), 75 deletions(-) diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py index 003da1e5..872fc875 100644 --- a/TTS/bin/compute_embeddings.py +++ b/TTS/bin/compute_embeddings.py @@ -5,11 +5,11 @@ import os import torch from tqdm import tqdm +from TTS.config import BaseDatasetConfig, load_config from TTS.speaker_encoder.utils.generic_utils import setup_model from TTS.tts.datasets.preprocess import load_meta_data from TTS.tts.utils.speakers import SpeakerManager from TTS.utils.audio import AudioProcessor -from TTS.config import load_config, BaseDatasetConfig parser = argparse.ArgumentParser( description='Compute embedding vectors for each wav file in a dataset. If "target_dataset" is defined, it generates "speakers.json" necessary for training a multi-speaker model.' @@ -100,7 +100,7 @@ for idx, wav_file in enumerate(tqdm(wav_files)): if speaker_mapping: # save speaker_mapping if target dataset is defined - if '.json' not in args.output_path: + if ".json" not in args.output_path: mapping_file_path = os.path.join(args.output_path, "speakers.json") else: mapping_file_path = args.output_path diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py index c9493535..48309dc9 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -10,10 +10,8 @@ import torch from torch.utils.data import DataLoader from TTS.speaker_encoder.dataset import SpeakerEncoderDataset - from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss from TTS.speaker_encoder.utils.generic_utils import save_best_model, setup_model - from TTS.speaker_encoder.utils.visual import plot_embeddings from TTS.tts.datasets.preprocess import load_meta_data from TTS.utils.arguments import init_training @@ -45,7 +43,7 @@ def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False storage_size=c.storage["storage_size"], sample_from_storage_p=c.storage["sample_from_storage_p"], verbose=verbose, - augmentation_config=c.audio_augmentation + augmentation_config=c.audio_augmentation, ) # sampler = DistributedSampler(dataset) if num_gpus > 1 else None @@ -170,19 +168,18 @@ def main(args): # pylint: disable=redefined-outer-name else: raise Exception("The %s not is a loss supported" % c.loss) - if args.restore_path: checkpoint = torch.load(args.restore_path) try: model.load_state_dict(checkpoint["model"]) - if 'criterion' in checkpoint: + if "criterion" in checkpoint: criterion.load_state_dict(checkpoint["criterion"]) except (KeyError, RuntimeError): print(" > Partial model initialization.") model_dict = model.state_dict() - model_dict = set_init_dict(model_dict, checkpoint['model'], c) + model_dict = set_init_dict(model_dict, 
checkpoint["model"], c) model.load_state_dict(model_dict) del model_dict for group in optimizer.param_groups: diff --git a/TTS/speaker_encoder/dataset.py b/TTS/speaker_encoder/dataset.py index cd95a4f5..6b2b0dd4 100644 --- a/TTS/speaker_encoder/dataset.py +++ b/TTS/speaker_encoder/dataset.py @@ -1,24 +1,25 @@ - import random import numpy as np import torch from torch.utils.data import Dataset + from TTS.speaker_encoder.utils.generic_utils import AugmentWAV, Storage + class SpeakerEncoderDataset(Dataset): def __init__( - self, - ap, - meta_data, - voice_len=1.6, - num_speakers_in_batch=64, - storage_size=1, - sample_from_storage_p=0.5, - num_utter_per_speaker=10, - skip_speakers=False, - verbose=False, - augmentation_config=None + self, + ap, + meta_data, + voice_len=1.6, + num_speakers_in_batch=64, + storage_size=1, + sample_from_storage_p=0.5, + num_utter_per_speaker=10, + skip_speakers=False, + verbose=False, + augmentation_config=None, ): """ Args: @@ -38,23 +39,25 @@ class SpeakerEncoderDataset(Dataset): self.verbose = verbose self.__parse_items() storage_max_size = storage_size * num_speakers_in_batch - self.storage = Storage(maxsize=storage_max_size, storage_batchs=storage_size, num_speakers_in_batch=num_speakers_in_batch) + self.storage = Storage( + maxsize=storage_max_size, storage_batchs=storage_size, num_speakers_in_batch=num_speakers_in_batch + ) self.sample_from_storage_p = float(sample_from_storage_p) speakers_aux = list(self.speakers) speakers_aux.sort() - self.speakerid_to_classid = {key : i for i, key in enumerate(speakers_aux)} + self.speakerid_to_classid = {key: i for i, key in enumerate(speakers_aux)} # Augmentation self.augmentator = None self.gaussian_augmentation_config = None if augmentation_config: - self.data_augmentation_p = augmentation_config['p'] - if self.data_augmentation_p and ('additive' in augmentation_config or 'rir' in augmentation_config): + self.data_augmentation_p = augmentation_config["p"] + if self.data_augmentation_p and ("additive" in augmentation_config or "rir" in augmentation_config): self.augmentator = AugmentWAV(ap, augmentation_config) - if 'gaussian' in augmentation_config.keys(): - self.gaussian_augmentation_config = augmentation_config['gaussian'] + if "gaussian" in augmentation_config.keys(): + self.gaussian_augmentation_config = augmentation_config["gaussian"] if self.verbose: print("\n > DataLoader initialization") @@ -231,9 +234,13 @@ class SpeakerEncoderDataset(Dataset): offset = random.randint(0, wav.shape[0] - self.seq_len) wav = wav[offset : offset + self.seq_len] # add random gaussian noise - if self.gaussian_augmentation_config and self.gaussian_augmentation_config['p']: - if random.random() < self.gaussian_augmentation_config['p']: - wav += np.random.normal(self.gaussian_augmentation_config['min_amplitude'], self.gaussian_augmentation_config['max_amplitude'], size=len(wav)) + if self.gaussian_augmentation_config and self.gaussian_augmentation_config["p"]: + if random.random() < self.gaussian_augmentation_config["p"]: + wav += np.random.normal( + self.gaussian_augmentation_config["min_amplitude"], + self.gaussian_augmentation_config["max_amplitude"], + size=len(wav), + ) mel = self.ap.melspectrogram(wav) feats_.append(torch.FloatTensor(mel)) diff --git a/TTS/speaker_encoder/losses.py b/TTS/speaker_encoder/losses.py index 9b573b6d..ac7e62bf 100644 --- a/TTS/speaker_encoder/losses.py +++ b/TTS/speaker_encoder/losses.py @@ -162,6 +162,7 @@ class AngleProtoLoss(nn.Module): L = self.criterion(cos_sim_matrix, label) return L + class 
SoftmaxLoss(nn.Module): """ Implementation of the Softmax loss as defined in https://arxiv.org/abs/2003.11982 @@ -169,13 +170,14 @@ class SoftmaxLoss(nn.Module): - embedding_dim (float): speaker embedding dim - n_speakers (float): number of speakers """ + def __init__(self, embedding_dim, n_speakers): super().__init__() self.criterion = torch.nn.CrossEntropyLoss() self.fc = nn.Linear(embedding_dim, n_speakers) - print('Initialised Softmax Loss') + print("Initialised Softmax Loss") def forward(self, x, label=None): # reshape for compatibility @@ -187,6 +189,7 @@ class SoftmaxLoss(nn.Module): return L + class SoftmaxAngleProtoLoss(nn.Module): """ Implementation of the Softmax AnglePrototypical loss as defined in https://arxiv.org/abs/2009.14153 @@ -196,13 +199,14 @@ class SoftmaxAngleProtoLoss(nn.Module): - init_w (float): defines the initial value of w - init_b (float): definies the initial value of b """ + def __init__(self, embedding_dim, n_speakers, init_w=10.0, init_b=-5.0): super().__init__() self.softmax = SoftmaxLoss(embedding_dim, n_speakers) self.angleproto = AngleProtoLoss(init_w, init_b) - print('Initialised SoftmaxAnglePrototypical Loss') + print("Initialised SoftmaxAnglePrototypical Loss") def forward(self, x, label=None): """ @@ -213,4 +217,4 @@ class SoftmaxAngleProtoLoss(nn.Module): Ls = self.softmax(x, label) - return Ls+Lp + return Ls + Lp diff --git a/TTS/speaker_encoder/models/resnet.py b/TTS/speaker_encoder/models/resnet.py index aa2171ed..ce86b01f 100644 --- a/TTS/speaker_encoder/models/resnet.py +++ b/TTS/speaker_encoder/models/resnet.py @@ -1,7 +1,8 @@ -import torch import numpy as np +import torch import torch.nn as nn + class SELayer(nn.Module): def __init__(self, channel, reduction=8): super(SELayer, self).__init__() @@ -10,7 +11,7 @@ class SELayer(nn.Module): nn.Linear(channel, channel // reduction), nn.ReLU(inplace=True), nn.Linear(channel // reduction, channel), - nn.Sigmoid() + nn.Sigmoid(), ) def forward(self, x): @@ -19,6 +20,7 @@ class SELayer(nn.Module): y = self.fc(y).view(b, c, 1, 1) return x * y + class SEBasicBlock(nn.Module): expansion = 1 @@ -51,12 +53,22 @@ class SEBasicBlock(nn.Module): out = self.relu(out) return out + class ResNetSpeakerEncoder(nn.Module): """Implementation of the model H/ASP without batch normalization in speaker embedding. 
This model was proposed in: https://arxiv.org/abs/2009.14153 Adapted from: https://github.com/clovaai/voxceleb_trainer """ + # pylint: disable=W0102 - def __init__(self, input_dim=64, proj_dim=512, layers=[3, 4, 6, 3], num_filters=[32, 64, 128, 256], encoder_type='ASP', log_input=False): + def __init__( + self, + input_dim=64, + proj_dim=512, + layers=[3, 4, 6, 3], + num_filters=[32, 64, 128, 256], + encoder_type="ASP", + log_input=False, + ): super(ResNetSpeakerEncoder, self).__init__() self.encoder_type = encoder_type @@ -74,7 +86,7 @@ class ResNetSpeakerEncoder(nn.Module): self.instancenorm = nn.InstanceNorm1d(input_dim) - outmap_size = int(self.input_dim/8) + outmap_size = int(self.input_dim / 8) self.attention = nn.Sequential( nn.Conv1d(num_filters[3] * outmap_size, 128, kernel_size=1), @@ -82,14 +94,14 @@ class ResNetSpeakerEncoder(nn.Module): nn.BatchNorm1d(128), nn.Conv1d(128, num_filters[3] * outmap_size, kernel_size=1), nn.Softmax(dim=2), - ) + ) if self.encoder_type == "SAP": out_dim = num_filters[3] * outmap_size elif self.encoder_type == "ASP": out_dim = num_filters[3] * outmap_size * 2 else: - raise ValueError('Undefined encoder') + raise ValueError("Undefined encoder") self.fc = nn.Linear(out_dim, proj_dim) @@ -98,7 +110,7 @@ class ResNetSpeakerEncoder(nn.Module): def _init_layers(self): for m in self.modules(): if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") elif isinstance(m, nn.BatchNorm2d): nn.init.constant_(m.weight, 1) nn.init.constant_(m.bias, 0) @@ -107,8 +119,7 @@ class ResNetSpeakerEncoder(nn.Module): downsample = None if stride != 1 or self.inplanes != planes * block.expansion: downsample = nn.Sequential( - nn.Conv2d(self.inplanes, planes * block.expansion, - kernel_size=1, stride=stride, bias=False), + nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False), nn.BatchNorm2d(planes * block.expansion), ) @@ -131,7 +142,7 @@ class ResNetSpeakerEncoder(nn.Module): with torch.no_grad(): with torch.cuda.amp.autocast(enabled=False): if self.log_input: - x = (x+1e-6).log() + x = (x + 1e-6).log() x = self.instancenorm(x).unsqueeze(1) x = self.conv1(x) @@ -151,7 +162,7 @@ class ResNetSpeakerEncoder(nn.Module): x = torch.sum(x * w, dim=2) elif self.encoder_type == "ASP": mu = torch.sum(x * w, dim=2) - sg = torch.sqrt((torch.sum((x**2) * w, dim=2) - mu ** 2).clamp(min=1e-5)) + sg = torch.sqrt((torch.sum((x ** 2) * w, dim=2) - mu ** 2).clamp(min=1e-5)) x = torch.cat((mu, sg), 1) x = x.view(x.size()[0], -1) @@ -172,12 +183,12 @@ class ResNetSpeakerEncoder(nn.Module): if max_len < num_frames: num_frames = max_len - offsets = np.linspace(0, max_len-num_frames, num=num_eval) + offsets = np.linspace(0, max_len - num_frames, num=num_eval) frames_batch = [] for offset in offsets: offset = int(offset) - end_offset = int(offset+num_frames) + end_offset = int(offset + num_frames) frames = x[:, offset:end_offset] frames_batch.append(frames) diff --git a/TTS/speaker_encoder/speaker_encoder_config.py b/TTS/speaker_encoder/speaker_encoder_config.py index 31149822..e830a0f5 100644 --- a/TTS/speaker_encoder/speaker_encoder_config.py +++ b/TTS/speaker_encoder/speaker_encoder_config.py @@ -25,10 +25,7 @@ class SpeakerEncoderConfig(BaseTrainingConfig): } ) - audio_augmentation : dict = field( - default_factory=lambda: { - } - ) + audio_augmentation: dict = field(default_factory=lambda: {}) storage: dict = field( 
default_factory=lambda: { diff --git a/TTS/speaker_encoder/utils/generic_utils.py b/TTS/speaker_encoder/utils/generic_utils.py index 3299f75a..fb61e48e 100644 --- a/TTS/speaker_encoder/utils/generic_utils.py +++ b/TTS/speaker_encoder/utils/generic_utils.py @@ -1,18 +1,18 @@ -import re +import datetime +import glob import os +import random +import re +from multiprocessing import Manager import numpy as np import torch -import glob -import random -import datetime - from scipy import signal -from multiprocessing import Manager from TTS.speaker_encoder.models.lstm import LSTMSpeakerEncoder from TTS.speaker_encoder.models.resnet import ResNetSpeakerEncoder + class Storage(object): def __init__(self, maxsize, storage_batchs, num_speakers_in_batch, num_threads=8): # use multiprocessing for threading safe @@ -53,19 +53,19 @@ class Storage(object): return self.storage[random.randint(0, storage_size)] def get_random_sample_fast(self): - '''Call this method only when storage is full''' + """Call this method only when storage is full""" return self.storage[random.randint(0, self.safe_storage_size)] -class AugmentWAV(object): +class AugmentWAV(object): def __init__(self, ap, augmentation_config): self.ap = ap self.use_additive_noise = False - if 'additive' in augmentation_config.keys(): - self.additive_noise_config = augmentation_config['additive'] - additive_path = self.additive_noise_config['sounds_path'] + if "additive" in augmentation_config.keys(): + self.additive_noise_config = augmentation_config["additive"] + additive_path = self.additive_noise_config["sounds_path"] if additive_path: self.use_additive_noise = True # get noise types @@ -74,12 +74,12 @@ class AugmentWAV(object): if isinstance(self.additive_noise_config[key], dict): self.additive_noise_types.append(key) - additive_files = glob.glob(os.path.join(additive_path, '**/*.wav'), recursive=True) + additive_files = glob.glob(os.path.join(additive_path, "**/*.wav"), recursive=True) self.noise_list = {} for wav_file in additive_files: - noise_dir = wav_file.replace(additive_path, '').split(os.sep)[0] + noise_dir = wav_file.replace(additive_path, "").split(os.sep)[0] # ignore not listed directories if noise_dir not in self.additive_noise_types: continue @@ -87,14 +87,16 @@ class AugmentWAV(object): self.noise_list[noise_dir] = [] self.noise_list[noise_dir].append(wav_file) - print(f" | > Using Additive Noise Augmentation: with {len(additive_files)} audios instances from {self.additive_noise_types}") + print( + f" | > Using Additive Noise Augmentation: with {len(additive_files)} audios instances from {self.additive_noise_types}" + ) self.use_rir = False - if 'rir' in augmentation_config.keys(): - self.rir_config = augmentation_config['rir'] - if self.rir_config['rir_path']: - self.rir_files = glob.glob(os.path.join(self.rir_config['rir_path'], '**/*.wav'), recursive=True) + if "rir" in augmentation_config.keys(): + self.rir_config = augmentation_config["rir"] + if self.rir_config["rir_path"]: + self.rir_files = glob.glob(os.path.join(self.rir_config["rir_path"], "**/*.wav"), recursive=True) self.use_rir = True print(f" | > Using RIR Noise Augmentation: with {len(self.rir_files)} audios instances") @@ -111,9 +113,15 @@ class AugmentWAV(object): def additive_noise(self, noise_type, audio): - clean_db = 10 * np.log10(np.mean(audio**2) + 1e-4) + clean_db = 10 * np.log10(np.mean(audio ** 2) + 1e-4) - noise_list = random.sample(self.noise_list[noise_type], random.randint(self.additive_noise_config[noise_type]['min_num_noises'], 
self.additive_noise_config[noise_type]['max_num_noises'])) + noise_list = random.sample( + self.noise_list[noise_type], + random.randint( + self.additive_noise_config[noise_type]["min_num_noises"], + self.additive_noise_config[noise_type]["max_num_noises"], + ), + ) audio_len = audio.shape[0] noises_wav = None @@ -123,7 +131,10 @@ class AugmentWAV(object): if noiseaudio.shape[0] < audio_len: continue - noise_snr = random.uniform(self.additive_noise_config[noise_type]['min_snr_in_db'], self.additive_noise_config[noise_type]['max_num_noises']) + noise_snr = random.uniform( + self.additive_noise_config[noise_type]["min_snr_in_db"], + self.additive_noise_config[noise_type]["max_num_noises"], + ) noise_db = 10 * np.log10(np.mean(noiseaudio ** 2) + 1e-4) noise_wav = np.sqrt(10 ** ((clean_db - noise_db - noise_snr) / 10)) * noiseaudio @@ -144,7 +155,7 @@ class AugmentWAV(object): rir_file = random.choice(self.rir_files) rir = self.ap.load_wav(rir_file, sr=self.ap.sample_rate) rir = rir / np.sqrt(np.sum(rir ** 2)) - return signal.convolve(audio, rir, mode=self.rir_config['conv_mode'])[:audio_len] + return signal.convolve(audio, rir, mode=self.rir_config["conv_mode"])[:audio_len] def apply_one(self, audio): noise_type = random.choice(self.global_noise_list) @@ -153,17 +164,25 @@ class AugmentWAV(object): return self.additive_noise(noise_type, audio) + def to_camel(text): text = text.capitalize() return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text) + def setup_model(c): - if c.model_params['model_name'].lower() == 'lstm': - model = LSTMSpeakerEncoder(c.model_params["input_dim"], c.model_params["proj_dim"], c.model_params["lstm_dim"], c.model_params["num_lstm_layers"]) - elif c.model_params['model_name'].lower() == 'resnet': + if c.model_params["model_name"].lower() == "lstm": + model = LSTMSpeakerEncoder( + c.model_params["input_dim"], + c.model_params["proj_dim"], + c.model_params["lstm_dim"], + c.model_params["num_lstm_layers"], + ) + elif c.model_params["model_name"].lower() == "resnet": model = ResNetSpeakerEncoder(input_dim=c.model_params["input_dim"], proj_dim=c.model_params["proj_dim"]) return model + def save_checkpoint(model, optimizer, criterion, model_loss, out_path, current_step, epoch): checkpoint_path = "checkpoint_{}.pth.tar".format(current_step) checkpoint_path = os.path.join(out_path, checkpoint_path) diff --git a/tests/test_speaker_encoder.py b/tests/test_speaker_encoder.py index f56a9577..0bb07f37 100644 --- a/tests/test_speaker_encoder.py +++ b/tests/test_speaker_encoder.py @@ -6,6 +6,7 @@ from tests import get_tests_input_path from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss from TTS.speaker_encoder.models.lstm import LSTMSpeakerEncoder from TTS.speaker_encoder.models.resnet import ResNetSpeakerEncoder + file_path = get_tests_input_path() @@ -39,6 +40,7 @@ class LSTMSpeakerEncoderTests(unittest.TestCase): assert output.shape[1] == 256 assert len(output.shape) == 2 + class ResNetSpeakerEncoderTests(unittest.TestCase): # pylint: disable=R0201 def test_in_out(self): @@ -65,6 +67,7 @@ class ResNetSpeakerEncoderTests(unittest.TestCase): assert output.shape[1] == 256 assert len(output.shape) == 2 + class GE2ELossTests(unittest.TestCase): # pylint: disable=R0201 def test_in_out(self): @@ -92,6 +95,7 @@ class GE2ELossTests(unittest.TestCase): output = loss.forward(dummy_input) assert output.item() < 0.005 + class AngleProtoLossTests(unittest.TestCase): # pylint: disable=R0201 def test_in_out(self): @@ -121,6 +125,7 @@ class 
AngleProtoLossTests(unittest.TestCase): output = loss.forward(dummy_input) assert output.item() < 0.005 + class SoftmaxAngleProtoLossTests(unittest.TestCase): # pylint: disable=R0201 def test_in_out(self): diff --git a/tests/test_speaker_encoder_train.py b/tests/test_speaker_encoder_train.py index e168a785..21b12074 100644 --- a/tests/test_speaker_encoder_train.py +++ b/tests/test_speaker_encoder_train.py @@ -46,7 +46,7 @@ run_cli(command_train) shutil.rmtree(continue_path) # test resnet speaker encoder -config.model_params['model_name'] = "resnet" +config.model_params["model_name"] = "resnet" config.save_json(config_path) # train the model for one epoch From 4726ae393d20313d2435d9cb59f3b0f04aa993eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 1 Jun 2021 09:14:16 +0200 Subject: [PATCH 17/36] pylint disable `not-callable` checks due to the warnings on torch layers --- .pylintrc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.pylintrc b/.pylintrc index 1b3d64c2..34c121eb 100644 --- a/.pylintrc +++ b/.pylintrc @@ -158,7 +158,8 @@ disable=missing-docstring, deprecated-sys-function, exception-escape, comprehension-escape, - duplicate-code + duplicate-code, + not-callable # Enable the message, report, category or checker with the given id(s). You can # either give multiple identifier separated by comma (,) or put this option From 406c4d057728222602baf05b8cb7ed824d09a04c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 1 Jun 2021 09:15:15 +0200 Subject: [PATCH 18/36] bump pylint version to 2.8.3 --- requirements.dev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.dev.txt b/requirements.dev.txt index 144a0ed6..afb5ebe6 100644 --- a/requirements.dev.txt +++ b/requirements.dev.txt @@ -2,4 +2,4 @@ black coverage isort nose -pylint==2.7.4 +pylint==2.8.3 From d0ab0382fc2edd9dcfc07aa6d4ffdf654adef451 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 1 Jun 2021 09:15:32 +0200 Subject: [PATCH 19/36] linter fixes --- TTS/bin/distribute.py | 2 +- TTS/utils/manage.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/TTS/bin/distribute.py b/TTS/bin/distribute.py index 0bd27275..ea43f88b 100644 --- a/TTS/bin/distribute.py +++ b/TTS/bin/distribute.py @@ -51,7 +51,7 @@ def main(): my_env["PYTHON_EGG_CACHE"] = "/tmp/tmp{}".format(i) command[-1] = "--rank={}".format(i) stdout = None if i == 0 else open(os.devnull, "w") - p = subprocess.Popen(["python3"] + command, stdout=stdout, env=my_env) + p = subprocess.Popen(["python3"] + command, stdout=stdout, env=my_env) # pylint: disable=consider-using-with processes.append(p) print(command) diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index 2e3caa81..cf7df7de 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -149,8 +149,8 @@ class ModelManager(object): def _download_zip_file(file_url, output): """Download the github releases""" r = requests.get(file_url) - z = zipfile.ZipFile(io.BytesIO(r.content)) - z.extractall(output) + with zipfile.ZipFile(io.BytesIO(r.content)) as z: + z.extractall(output) for file_path in z.namelist()[1:]: src_path = os.path.join(output, file_path) dst_path = os.path.join(output, os.path.basename(file_path)) From 5b89ef2c6e5895b168f8f150ddce345dcee6be91 Mon Sep 17 00:00:00 2001 From: Alexander Korolev Date: Tue, 1 Jun 2021 11:06:35 +0200 Subject: [PATCH 20/36] fix speaker-embeddings dimension during inference --- TTS/tts/models/tacotron2.py | 1 + 1 file changed, 1 insertion(+) 
diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index fded8f87..a5db64e9 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -277,6 +277,7 @@ class Tacotron2(TacotronAbstract): if self.num_speakers > 1: if not self.embeddings_per_sample: speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None] + speaker_embeddings = torch.unsqueeze(speaker_embeddings, 0).transpose(1, 2) encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) mel_outputs, alignments, stop_tokens = self.decoder.inference_truncated(encoder_outputs) From 1cc18d19729545c83e2a7482b949f896fd714ef4 Mon Sep 17 00:00:00 2001 From: Katsuya Iida Date: Tue, 1 Jun 2021 18:51:34 +0900 Subject: [PATCH 21/36] Move unittest of Japanese phonemizer. --- .../tts_tests/test_japanese_phonemizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename TTS/tts/utils/text/japanese/phonemizer_test.py => tests/tts_tests/test_japanese_phonemizer.py (89%) diff --git a/TTS/tts/utils/text/japanese/phonemizer_test.py b/tests/tts_tests/test_japanese_phonemizer.py similarity index 89% rename from TTS/tts/utils/text/japanese/phonemizer_test.py rename to tests/tts_tests/test_japanese_phonemizer.py index f07c0901..437042f0 100644 --- a/TTS/tts/utils/text/japanese/phonemizer_test.py +++ b/tests/tts_tests/test_japanese_phonemizer.py @@ -1,5 +1,5 @@ import unittest -from .phonemizer import japanese_text_to_phonemes +from TTS.tts.utils.text.japanese.phonemizer import japanese_text_to_phonemes _TEST_CASES = ''' どちらに行きますか?/dochiraniikimasuka? From c1eb9bdccacfb09356282557bae21885ecaa0dfa Mon Sep 17 00:00:00 2001 From: Alexander Korolev Date: Tue, 1 Jun 2021 15:15:26 +0200 Subject: [PATCH 22/36] fix speaker dim inference --- TTS/tts/models/tacotron2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index a5db64e9..525eb8b3 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -255,6 +255,7 @@ class Tacotron2(TacotronAbstract): if self.num_speakers > 1: if not self.embeddings_per_sample: speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None] + speaker_embeddings = torch.unsqueeze(speaker_embeddings, 0).transpose(1, 2) encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) decoder_outputs, alignments, stop_tokens = self.decoder.inference(encoder_outputs) From 6d8310d2a99de22e3537321acbf48f9b35b00b14 Mon Sep 17 00:00:00 2001 From: Katsuya Iida Date: Wed, 2 Jun 2021 07:48:28 +0900 Subject: [PATCH 23/36] Set the version to the same with the dev branch. 
--- TTS/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/_version.py b/TTS/_version.py index f4956698..311f216e 100644 --- a/TTS/_version.py +++ b/TTS/_version.py @@ -1 +1 @@ -__version__ = "0.0.14.1" +__version__ = "0.0.14" From 0c14278c306fa52408b487b10e75b6aa26525f91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 2 Jun 2021 11:40:26 +0200 Subject: [PATCH 24/36] reorg test files --- .../test_dataset_formatters.py} | 0 tests/{ => data_tests}/test_loader.py | 0 tests/{ => inference_tests}/test_synthesize.py | 0 tests/{ => inference_tests}/test_synthesizer.py | 0 tests/{test_audio.py => test_audio_processor.py} | 0 tests/{tts_tests => text_tests}/test_japanese_phonemizer.py | 0 tests/{ => text_tests}/test_symbols.py | 0 tests/{ => text_tests}/test_text_cleaners.py | 0 tests/{ => tts_tests}/test_feed_forward_layers.py | 0 tests/{ => tts_tests}/test_glow_tts.py | 0 tests/{ => tts_tests}/test_speedy_speech_layers.py | 0 tests/{ => tts_tests}/test_tacotron2_model.py | 0 tests/{ => tts_tests}/test_tacotron2_tf_model.py | 0 tests/{test_layers.py => tts_tests/test_tacotron_layers.py} | 0 tests/{ => tts_tests}/test_tacotron_model.py | 0 15 files changed, 0 insertions(+), 0 deletions(-) rename tests/{test_preprocessors.py => data_tests/test_dataset_formatters.py} (100%) rename tests/{ => data_tests}/test_loader.py (100%) rename tests/{ => inference_tests}/test_synthesize.py (100%) rename tests/{ => inference_tests}/test_synthesizer.py (100%) rename tests/{test_audio.py => test_audio_processor.py} (100%) rename tests/{tts_tests => text_tests}/test_japanese_phonemizer.py (100%) rename tests/{ => text_tests}/test_symbols.py (100%) rename tests/{ => text_tests}/test_text_cleaners.py (100%) rename tests/{ => tts_tests}/test_feed_forward_layers.py (100%) rename tests/{ => tts_tests}/test_glow_tts.py (100%) rename tests/{ => tts_tests}/test_speedy_speech_layers.py (100%) rename tests/{ => tts_tests}/test_tacotron2_model.py (100%) rename tests/{ => tts_tests}/test_tacotron2_tf_model.py (100%) rename tests/{test_layers.py => tts_tests/test_tacotron_layers.py} (100%) rename tests/{ => tts_tests}/test_tacotron_model.py (100%) diff --git a/tests/test_preprocessors.py b/tests/data_tests/test_dataset_formatters.py similarity index 100% rename from tests/test_preprocessors.py rename to tests/data_tests/test_dataset_formatters.py diff --git a/tests/test_loader.py b/tests/data_tests/test_loader.py similarity index 100% rename from tests/test_loader.py rename to tests/data_tests/test_loader.py diff --git a/tests/test_synthesize.py b/tests/inference_tests/test_synthesize.py similarity index 100% rename from tests/test_synthesize.py rename to tests/inference_tests/test_synthesize.py diff --git a/tests/test_synthesizer.py b/tests/inference_tests/test_synthesizer.py similarity index 100% rename from tests/test_synthesizer.py rename to tests/inference_tests/test_synthesizer.py diff --git a/tests/test_audio.py b/tests/test_audio_processor.py similarity index 100% rename from tests/test_audio.py rename to tests/test_audio_processor.py diff --git a/tests/tts_tests/test_japanese_phonemizer.py b/tests/text_tests/test_japanese_phonemizer.py similarity index 100% rename from tests/tts_tests/test_japanese_phonemizer.py rename to tests/text_tests/test_japanese_phonemizer.py diff --git a/tests/test_symbols.py b/tests/text_tests/test_symbols.py similarity index 100% rename from tests/test_symbols.py rename to tests/text_tests/test_symbols.py diff --git 
a/tests/test_text_cleaners.py b/tests/text_tests/test_text_cleaners.py similarity index 100% rename from tests/test_text_cleaners.py rename to tests/text_tests/test_text_cleaners.py diff --git a/tests/test_feed_forward_layers.py b/tests/tts_tests/test_feed_forward_layers.py similarity index 100% rename from tests/test_feed_forward_layers.py rename to tests/tts_tests/test_feed_forward_layers.py diff --git a/tests/test_glow_tts.py b/tests/tts_tests/test_glow_tts.py similarity index 100% rename from tests/test_glow_tts.py rename to tests/tts_tests/test_glow_tts.py diff --git a/tests/test_speedy_speech_layers.py b/tests/tts_tests/test_speedy_speech_layers.py similarity index 100% rename from tests/test_speedy_speech_layers.py rename to tests/tts_tests/test_speedy_speech_layers.py diff --git a/tests/test_tacotron2_model.py b/tests/tts_tests/test_tacotron2_model.py similarity index 100% rename from tests/test_tacotron2_model.py rename to tests/tts_tests/test_tacotron2_model.py diff --git a/tests/test_tacotron2_tf_model.py b/tests/tts_tests/test_tacotron2_tf_model.py similarity index 100% rename from tests/test_tacotron2_tf_model.py rename to tests/tts_tests/test_tacotron2_tf_model.py diff --git a/tests/test_layers.py b/tests/tts_tests/test_tacotron_layers.py similarity index 100% rename from tests/test_layers.py rename to tests/tts_tests/test_tacotron_layers.py diff --git a/tests/test_tacotron_model.py b/tests/tts_tests/test_tacotron_model.py similarity index 100% rename from tests/test_tacotron_model.py rename to tests/tts_tests/test_tacotron_model.py From 49c5e5d820e2413acb80c3b9004e5a5243d44f41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 2 Jun 2021 11:42:38 +0200 Subject: [PATCH 25/36] maket style japanese PR --- TTS/tts/datasets/preprocess.py | 2 +- TTS/tts/utils/text/japanese/phonemizer.py | 637 ++++++++++--------- tests/text_tests/test_japanese_phonemizer.py | 14 +- 3 files changed, 328 insertions(+), 325 deletions(-) diff --git a/TTS/tts/datasets/preprocess.py b/TTS/tts/datasets/preprocess.py index 271b1734..62cb9fef 100644 --- a/TTS/tts/datasets/preprocess.py +++ b/TTS/tts/datasets/preprocess.py @@ -434,7 +434,7 @@ def kokoro(root_path, meta_file): with open(txt_file, "r") as ttf: for line in ttf: cols = line.split("|") - wav_file = os.path.join(root_path, "wavs", cols[0] + '.wav') + wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav") text = cols[2].replace(" ", "") items.append([text, wav_file, speaker_name]) return items diff --git a/TTS/tts/utils/text/japanese/phonemizer.py b/TTS/tts/utils/text/japanese/phonemizer.py index f09d5b05..a4629a30 100644 --- a/TTS/tts/utils/text/japanese/phonemizer.py +++ b/TTS/tts/utils/text/japanese/phonemizer.py @@ -2,324 +2,321 @@ # compatible with Julius https://github.com/julius-speech/segmentation-kit import re + import MeCab _CONVRULES = [ # Conversion of 2 letters - 'アァ/ a a', - 'イィ/ i i', - 'イェ/ i e', - 'イャ/ y a', - 'ウゥ/ u:', - 'エェ/ e e', - 'オォ/ o:', - 'カァ/ k a:', - 'キィ/ k i:', - 'クゥ/ k u:', - 'クャ/ ky a', - 'クュ/ ky u', - 'クョ/ ky o', - 'ケェ/ k e:', - 'コォ/ k o:', - 'ガァ/ g a:', - 'ギィ/ g i:', - 'グゥ/ g u:', - 'グャ/ gy a', - 'グュ/ gy u', - 'グョ/ gy o', - 'ゲェ/ g e:', - 'ゴォ/ g o:', - 'サァ/ s a:', - 'シィ/ sh i:', - 'スゥ/ s u:', - 'スャ/ sh a', - 'スュ/ sh u', - 'スョ/ sh o', - 'セェ/ s e:', - 'ソォ/ s o:', - 'ザァ/ z a:', - 'ジィ/ j i:', - 'ズゥ/ z u:', - 'ズャ/ zy a', - 'ズュ/ zy u', - 'ズョ/ zy o', - 'ゼェ/ z e:', - 'ゾォ/ z o:', - 'タァ/ t a:', - 'チィ/ ch i:', - 'ツァ/ ts a', - 'ツィ/ ts i', - 'ツゥ/ ts u:', - 'ツャ/ ch a', - 'ツュ/ ch u', - 'ツョ/ ch o', - 'ツェ/ 
ts e', - 'ツォ/ ts o', - 'テェ/ t e:', - 'トォ/ t o:', - 'ダァ/ d a:', - 'ヂィ/ j i:', - 'ヅゥ/ d u:', - 'ヅャ/ zy a', - 'ヅュ/ zy u', - 'ヅョ/ zy o', - 'デェ/ d e:', - 'ドォ/ d o:', - 'ナァ/ n a:', - 'ニィ/ n i:', - 'ヌゥ/ n u:', - 'ヌャ/ ny a', - 'ヌュ/ ny u', - 'ヌョ/ ny o', - 'ネェ/ n e:', - 'ノォ/ n o:', - 'ハァ/ h a:', - 'ヒィ/ h i:', - 'フゥ/ f u:', - 'フャ/ hy a', - 'フュ/ hy u', - 'フョ/ hy o', - 'ヘェ/ h e:', - 'ホォ/ h o:', - 'バァ/ b a:', - 'ビィ/ b i:', - 'ブゥ/ b u:', - 'フャ/ hy a', - 'ブュ/ by u', - 'フョ/ hy o', - 'ベェ/ b e:', - 'ボォ/ b o:', - 'パァ/ p a:', - 'ピィ/ p i:', - 'プゥ/ p u:', - 'プャ/ py a', - 'プュ/ py u', - 'プョ/ py o', - 'ペェ/ p e:', - 'ポォ/ p o:', - 'マァ/ m a:', - 'ミィ/ m i:', - 'ムゥ/ m u:', - 'ムャ/ my a', - 'ムュ/ my u', - 'ムョ/ my o', - 'メェ/ m e:', - 'モォ/ m o:', - 'ヤァ/ y a:', - 'ユゥ/ y u:', - 'ユャ/ y a:', - 'ユュ/ y u:', - 'ユョ/ y o:', - 'ヨォ/ y o:', - 'ラァ/ r a:', - 'リィ/ r i:', - 'ルゥ/ r u:', - 'ルャ/ ry a', - 'ルュ/ ry u', - 'ルョ/ ry o', - 'レェ/ r e:', - 'ロォ/ r o:', - 'ワァ/ w a:', - 'ヲォ/ o:', - 'ディ/ d i', - 'デェ/ d e:', - 'デャ/ dy a', - 'デュ/ dy u', - 'デョ/ dy o', - 'ティ/ t i', - 'テェ/ t e:', - 'テャ/ ty a', - 'テュ/ ty u', - 'テョ/ ty o', - 'スィ/ s i', - 'ズァ/ z u a', - 'ズィ/ z i', - 'ズゥ/ z u', - 'ズャ/ zy a', - 'ズュ/ zy u', - 'ズョ/ zy o', - 'ズェ/ z e', - 'ズォ/ z o', - 'キャ/ ky a', - 'キュ/ ky u', - 'キョ/ ky o', - 'シャ/ sh a', - 'シュ/ sh u', - 'シェ/ sh e', - 'ショ/ sh o', - 'チャ/ ch a', - 'チュ/ ch u', - 'チェ/ ch e', - 'チョ/ ch o', - 'トゥ/ t u', - 'トャ/ ty a', - 'トュ/ ty u', - 'トョ/ ty o', - 'ドァ/ d o a', - 'ドゥ/ d u', - 'ドャ/ dy a', - 'ドュ/ dy u', - 'ドョ/ dy o', - 'ドォ/ d o:', - 'ニャ/ ny a', - 'ニュ/ ny u', - 'ニョ/ ny o', - 'ヒャ/ hy a', - 'ヒュ/ hy u', - 'ヒョ/ hy o', - 'ミャ/ my a', - 'ミュ/ my u', - 'ミョ/ my o', - 'リャ/ ry a', - 'リュ/ ry u', - 'リョ/ ry o', - 'ギャ/ gy a', - 'ギュ/ gy u', - 'ギョ/ gy o', - 'ヂェ/ j e', - 'ヂャ/ j a', - 'ヂュ/ j u', - 'ヂョ/ j o', - 'ジェ/ j e', - 'ジャ/ j a', - 'ジュ/ j u', - 'ジョ/ j o', - 'ビャ/ by a', - 'ビュ/ by u', - 'ビョ/ by o', - 'ピャ/ py a', - 'ピュ/ py u', - 'ピョ/ py o', - 'ウァ/ u a', - 'ウィ/ w i', - 'ウェ/ w e', - 'ウォ/ w o', - 'ファ/ f a', - 'フィ/ f i', - 'フゥ/ f u', - 'フャ/ hy a', - 'フュ/ hy u', - 'フョ/ hy o', - 'フェ/ f e', - 'フォ/ f o', - 'ヴァ/ b a', - 'ヴィ/ b i', - 'ヴェ/ b e', - 'ヴォ/ b o', - 'ヴュ/ by u', - + "アァ/ a a", + "イィ/ i i", + "イェ/ i e", + "イャ/ y a", + "ウゥ/ u:", + "エェ/ e e", + "オォ/ o:", + "カァ/ k a:", + "キィ/ k i:", + "クゥ/ k u:", + "クャ/ ky a", + "クュ/ ky u", + "クョ/ ky o", + "ケェ/ k e:", + "コォ/ k o:", + "ガァ/ g a:", + "ギィ/ g i:", + "グゥ/ g u:", + "グャ/ gy a", + "グュ/ gy u", + "グョ/ gy o", + "ゲェ/ g e:", + "ゴォ/ g o:", + "サァ/ s a:", + "シィ/ sh i:", + "スゥ/ s u:", + "スャ/ sh a", + "スュ/ sh u", + "スョ/ sh o", + "セェ/ s e:", + "ソォ/ s o:", + "ザァ/ z a:", + "ジィ/ j i:", + "ズゥ/ z u:", + "ズャ/ zy a", + "ズュ/ zy u", + "ズョ/ zy o", + "ゼェ/ z e:", + "ゾォ/ z o:", + "タァ/ t a:", + "チィ/ ch i:", + "ツァ/ ts a", + "ツィ/ ts i", + "ツゥ/ ts u:", + "ツャ/ ch a", + "ツュ/ ch u", + "ツョ/ ch o", + "ツェ/ ts e", + "ツォ/ ts o", + "テェ/ t e:", + "トォ/ t o:", + "ダァ/ d a:", + "ヂィ/ j i:", + "ヅゥ/ d u:", + "ヅャ/ zy a", + "ヅュ/ zy u", + "ヅョ/ zy o", + "デェ/ d e:", + "ドォ/ d o:", + "ナァ/ n a:", + "ニィ/ n i:", + "ヌゥ/ n u:", + "ヌャ/ ny a", + "ヌュ/ ny u", + "ヌョ/ ny o", + "ネェ/ n e:", + "ノォ/ n o:", + "ハァ/ h a:", + "ヒィ/ h i:", + "フゥ/ f u:", + "フャ/ hy a", + "フュ/ hy u", + "フョ/ hy o", + "ヘェ/ h e:", + "ホォ/ h o:", + "バァ/ b a:", + "ビィ/ b i:", + "ブゥ/ b u:", + "フャ/ hy a", + "ブュ/ by u", + "フョ/ hy o", + "ベェ/ b e:", + "ボォ/ b o:", + "パァ/ p a:", + "ピィ/ p i:", + "プゥ/ p u:", + "プャ/ py a", + "プュ/ py u", + "プョ/ py o", + "ペェ/ p e:", + "ポォ/ p o:", + "マァ/ m a:", + "ミィ/ m i:", + "ムゥ/ m u:", + "ムャ/ my a", + "ムュ/ my u", + "ムョ/ my o", + "メェ/ m e:", + "モォ/ m o:", + "ヤァ/ y a:", + "ユゥ/ y u:", + "ユャ/ y a:", + "ユュ/ y 
u:", + "ユョ/ y o:", + "ヨォ/ y o:", + "ラァ/ r a:", + "リィ/ r i:", + "ルゥ/ r u:", + "ルャ/ ry a", + "ルュ/ ry u", + "ルョ/ ry o", + "レェ/ r e:", + "ロォ/ r o:", + "ワァ/ w a:", + "ヲォ/ o:", + "ディ/ d i", + "デェ/ d e:", + "デャ/ dy a", + "デュ/ dy u", + "デョ/ dy o", + "ティ/ t i", + "テェ/ t e:", + "テャ/ ty a", + "テュ/ ty u", + "テョ/ ty o", + "スィ/ s i", + "ズァ/ z u a", + "ズィ/ z i", + "ズゥ/ z u", + "ズャ/ zy a", + "ズュ/ zy u", + "ズョ/ zy o", + "ズェ/ z e", + "ズォ/ z o", + "キャ/ ky a", + "キュ/ ky u", + "キョ/ ky o", + "シャ/ sh a", + "シュ/ sh u", + "シェ/ sh e", + "ショ/ sh o", + "チャ/ ch a", + "チュ/ ch u", + "チェ/ ch e", + "チョ/ ch o", + "トゥ/ t u", + "トャ/ ty a", + "トュ/ ty u", + "トョ/ ty o", + "ドァ/ d o a", + "ドゥ/ d u", + "ドャ/ dy a", + "ドュ/ dy u", + "ドョ/ dy o", + "ドォ/ d o:", + "ニャ/ ny a", + "ニュ/ ny u", + "ニョ/ ny o", + "ヒャ/ hy a", + "ヒュ/ hy u", + "ヒョ/ hy o", + "ミャ/ my a", + "ミュ/ my u", + "ミョ/ my o", + "リャ/ ry a", + "リュ/ ry u", + "リョ/ ry o", + "ギャ/ gy a", + "ギュ/ gy u", + "ギョ/ gy o", + "ヂェ/ j e", + "ヂャ/ j a", + "ヂュ/ j u", + "ヂョ/ j o", + "ジェ/ j e", + "ジャ/ j a", + "ジュ/ j u", + "ジョ/ j o", + "ビャ/ by a", + "ビュ/ by u", + "ビョ/ by o", + "ピャ/ py a", + "ピュ/ py u", + "ピョ/ py o", + "ウァ/ u a", + "ウィ/ w i", + "ウェ/ w e", + "ウォ/ w o", + "ファ/ f a", + "フィ/ f i", + "フゥ/ f u", + "フャ/ hy a", + "フュ/ hy u", + "フョ/ hy o", + "フェ/ f e", + "フォ/ f o", + "ヴァ/ b a", + "ヴィ/ b i", + "ヴェ/ b e", + "ヴォ/ b o", + "ヴュ/ by u", # Conversion of 1 letter - 'ア/ a', - 'イ/ i', - 'ウ/ u', - 'エ/ e', - 'オ/ o', - 'カ/ k a', - 'キ/ k i', - 'ク/ k u', - 'ケ/ k e', - 'コ/ k o', - 'サ/ s a', - 'シ/ sh i', - 'ス/ s u', - 'セ/ s e', - 'ソ/ s o', - 'タ/ t a', - 'チ/ ch i', - 'ツ/ ts u', - 'テ/ t e', - 'ト/ t o', - 'ナ/ n a', - 'ニ/ n i', - 'ヌ/ n u', - 'ネ/ n e', - 'ノ/ n o', - 'ハ/ h a', - 'ヒ/ h i', - 'フ/ f u', - 'ヘ/ h e', - 'ホ/ h o', - 'マ/ m a', - 'ミ/ m i', - 'ム/ m u', - 'メ/ m e', - 'モ/ m o', - 'ラ/ r a', - 'リ/ r i', - 'ル/ r u', - 'レ/ r e', - 'ロ/ r o', - 'ガ/ g a', - 'ギ/ g i', - 'グ/ g u', - 'ゲ/ g e', - 'ゴ/ g o', - 'ザ/ z a', - 'ジ/ j i', - 'ズ/ z u', - 'ゼ/ z e', - 'ゾ/ z o', - 'ダ/ d a', - 'ヂ/ j i', - 'ヅ/ z u', - 'デ/ d e', - 'ド/ d o', - 'バ/ b a', - 'ビ/ b i', - 'ブ/ b u', - 'ベ/ b e', - 'ボ/ b o', - 'パ/ p a', - 'ピ/ p i', - 'プ/ p u', - 'ペ/ p e', - 'ポ/ p o', - 'ヤ/ y a', - 'ユ/ y u', - 'ヨ/ y o', - 'ワ/ w a', - 'ヰ/ i', - 'ヱ/ e', - 'ヲ/ o', - 'ン/ N', - 'ッ/ q', - 'ヴ/ b u', - 'ー/:', - + "ア/ a", + "イ/ i", + "ウ/ u", + "エ/ e", + "オ/ o", + "カ/ k a", + "キ/ k i", + "ク/ k u", + "ケ/ k e", + "コ/ k o", + "サ/ s a", + "シ/ sh i", + "ス/ s u", + "セ/ s e", + "ソ/ s o", + "タ/ t a", + "チ/ ch i", + "ツ/ ts u", + "テ/ t e", + "ト/ t o", + "ナ/ n a", + "ニ/ n i", + "ヌ/ n u", + "ネ/ n e", + "ノ/ n o", + "ハ/ h a", + "ヒ/ h i", + "フ/ f u", + "ヘ/ h e", + "ホ/ h o", + "マ/ m a", + "ミ/ m i", + "ム/ m u", + "メ/ m e", + "モ/ m o", + "ラ/ r a", + "リ/ r i", + "ル/ r u", + "レ/ r e", + "ロ/ r o", + "ガ/ g a", + "ギ/ g i", + "グ/ g u", + "ゲ/ g e", + "ゴ/ g o", + "ザ/ z a", + "ジ/ j i", + "ズ/ z u", + "ゼ/ z e", + "ゾ/ z o", + "ダ/ d a", + "ヂ/ j i", + "ヅ/ z u", + "デ/ d e", + "ド/ d o", + "バ/ b a", + "ビ/ b i", + "ブ/ b u", + "ベ/ b e", + "ボ/ b o", + "パ/ p a", + "ピ/ p i", + "プ/ p u", + "ペ/ p e", + "ポ/ p o", + "ヤ/ y a", + "ユ/ y u", + "ヨ/ y o", + "ワ/ w a", + "ヰ/ i", + "ヱ/ e", + "ヲ/ o", + "ン/ N", + "ッ/ q", + "ヴ/ b u", + "ー/:", # Try converting broken text - 'ァ/ a', - 'ィ/ i', - 'ゥ/ u', - 'ェ/ e', - 'ォ/ o', - 'ヮ/ w a', - 'ォ/ o', - + "ァ/ a", + "ィ/ i", + "ゥ/ u", + "ェ/ e", + "ォ/ o", + "ヮ/ w a", + "ォ/ o", # Symbols - '、/ ,', - '。/ .', - '!/ !', - '?/ ?', - '・/ ,' + "、/ ,", + "。/ .", + "!/ !", + "?/ ?", + "・/ ,", ] -_COLON_RX = re.compile(':+') -_REJECT_RX = re.compile('[^ a-zA-Z:,.?]') +_COLON_RX = re.compile(":+") 
+_REJECT_RX = re.compile("[^ a-zA-Z:,.?]") + def _makerulemap(): - l = [tuple(x.split('/')) for x in _CONVRULES] - return tuple( - {k: v for k, v in l if len(k) == i} - for i in (1, 2) - ) + l = [tuple(x.split("/")) for x in _CONVRULES] + return tuple({k: v for k, v in l if len(k) == i} for i in (1, 2)) + _RULEMAP1, _RULEMAP2 = _makerulemap() + def kata2phoneme(text: str) -> str: - """Convert katakana text to phonemes. - """ + """Convert katakana text to phonemes.""" text = text.strip() - res = '' + res = "" while text: if len(text) >= 2: x = _RULEMAP2.get(text[:2]) @@ -332,30 +329,34 @@ def kata2phoneme(text: str) -> str: text = text[1:] res += x continue - res += ' ' + text[0] + res += " " + text[0] text = text[1:] - res = _COLON_RX.sub(':', res) + res = _COLON_RX.sub(":", res) return res[1:] -_KATAKANA = ''.join(chr(ch) for ch in range(ord('ァ'), ord('ン') + 1)) -_HIRAGANA = ''.join(chr(ch) for ch in range(ord('ぁ'), ord('ん') + 1)) + +_KATAKANA = "".join(chr(ch) for ch in range(ord("ァ"), ord("ン") + 1)) +_HIRAGANA = "".join(chr(ch) for ch in range(ord("ぁ"), ord("ん") + 1)) _HIRA2KATATRANS = str.maketrans(_HIRAGANA, _KATAKANA) + def hira2kata(text: str) -> str: text = text.translate(_HIRA2KATATRANS) - return text.replace('う゛', 'ヴ') + return text.replace("う゛", "ヴ") -_SYMBOL_TOKENS = set(list('・、。?!')) -_NO_YOMI_TOKENS = set(list('「」『』―()[][] …')) + +_SYMBOL_TOKENS = set(list("・、。?!")) +_NO_YOMI_TOKENS = set(list("「」『』―()[][] …")) _TAGGER = MeCab.Tagger() + def text2kata(text: str) -> str: parsed = _TAGGER.parse(text) res = [] - for line in parsed.split('\n'): - if line == 'EOS': + for line in parsed.split("\n"): + if line == "EOS": break - parts = line.split('\t') + parts = line.split("\t") word, yomi = parts[0], parts[1] if yomi: @@ -363,17 +364,17 @@ def text2kata(text: str) -> str: else: if word in _SYMBOL_TOKENS: res.append(word) - elif word in ('っ', 'ッ'): - res.append('ッ') + elif word in ("っ", "ッ"): + res.append("ッ") elif word in _NO_YOMI_TOKENS: pass else: res.append(word) - return hira2kata(''.join(res)) + return hira2kata("".join(res)) + def japanese_text_to_phonemes(text: str) -> str: - """Convert Japanese text to phonemes. - """ + """Convert Japanese text to phonemes.""" res = text2kata(text) res = kata2phoneme(res) - return res.replace(' ', '') + return res.replace(" ", "") diff --git a/tests/text_tests/test_japanese_phonemizer.py b/tests/text_tests/test_japanese_phonemizer.py index 437042f0..b3b1ece3 100644 --- a/tests/text_tests/test_japanese_phonemizer.py +++ b/tests/text_tests/test_japanese_phonemizer.py @@ -1,7 +1,8 @@ import unittest + from TTS.tts.utils.text.japanese.phonemizer import japanese_text_to_phonemes -_TEST_CASES = ''' +_TEST_CASES = """ どちらに行きますか?/dochiraniikimasuka? 今日は温泉に、行きます。/kyo:waoNseNni,ikimasu. 「A」から「Z」までです。/AkaraZmadedesu. @@ -9,14 +10,15 @@ _TEST_CASES = ''' クジラは哺乳類です。/kujirawahonyu:ruidesu. ヴィディオを見ます。/bidioomimasu. ky o: w a o N s e N n i , i k i m a s u ./kyo:waoNseNni,ikimasu. 
-''' +""" + class TestText(unittest.TestCase): - def test_japanese_text_to_phonemes(self): - for line in _TEST_CASES.strip().split('\n'): - text, phone = line.split('/') + for line in _TEST_CASES.strip().split("\n"): + text, phone = line.split("/") self.assertEqual(japanese_text_to_phonemes(text), phone) -if __name__ == '__main__': + +if __name__ == "__main__": unittest.main() From 4baa59d73214d4871b4aa2262e28d0ba3f149d5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 2 Jun 2021 11:42:56 +0200 Subject: [PATCH 26/36] comment `requirements.txt` for japanese deps --- requirements.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index ab828503..fde48978 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,5 +19,6 @@ numba==0.52 umap-learn==0.4.6 anyascii coqpit -mecab-python3 -unidic-lite +# japanese g2p deps +mecab-python3==1.0.3 +unidic-lite==1.0.8 From db48c69f0f71139ba5cc6f2fe59c2165b6873fbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 2 Jun 2021 11:43:27 +0200 Subject: [PATCH 27/36] reduce fullband melgan model size for testing --- tests/vocoder_tests/test_fullband_melgan_train.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/vocoder_tests/test_fullband_melgan_train.py b/tests/vocoder_tests/test_fullband_melgan_train.py index d9bc51ac..2b286b91 100644 --- a/tests/vocoder_tests/test_fullband_melgan_train.py +++ b/tests/vocoder_tests/test_fullband_melgan_train.py @@ -20,6 +20,7 @@ config = FullbandMelganConfig( eval_split_size=1, print_step=1, print_eval=True, + discriminator_model_params={"base_channels": 16, "max_channels": 256, "downsample_factors": [4, 4, 4]}, data_path="tests/data/ljspeech", output_path=output_path, ) From 401fbd8978862346f35e8b0d0206a2c8c8abd75b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 2 Jun 2021 11:48:17 +0200 Subject: [PATCH 28/36] bump up to v0.0.15 --- TTS/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/_version.py b/TTS/_version.py index 311f216e..6561790f 100644 --- a/TTS/_version.py +++ b/TTS/_version.py @@ -1 +1 @@ -__version__ = "0.0.14" +__version__ = "0.0.15" From bd434636a9774e285db046569707c5846ba8ba2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 2 Jun 2021 15:54:37 +0200 Subject: [PATCH 29/36] new japanese model placeholder in `.models.json` --- TTS/.models.json | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/TTS/.models.json b/TTS/.models.json index b926f120..aed546f2 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -149,6 +149,17 @@ "needs_phonemizer": true } } + }, + "jp":{ + "kokoro":{ + "tacotron2-DDC":{ + "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.15/tts_models--jp--kokoro--tacotron2-DDC.zip", + "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan", + "author": "@kaiidams", + "commit": "401fbd89", + "needs_phonemizer": false + } + } } }, "vocoder_models":{ From e66753bd0dc2219c4bac42cb74cb264121296a0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 3 Jun 2021 18:04:28 +0200 Subject: [PATCH 30/36] fixup! 
new japanese model placeholder in `.models.json` --- TTS/.models.json | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/TTS/.models.json b/TTS/.models.json index aed546f2..310dc5f0 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -150,11 +150,12 @@ } } }, - "jp":{ + "ja":{ "kokoro":{ "tacotron2-DDC":{ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.15/tts_models--jp--kokoro--tacotron2-DDC.zip", - "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan", + "default_vocoder": "vocoder_models/universal/libri-tts/wavegrad", + "description": "Tacotron2 with Double Decoder Consistency trained with Kokoro Speech Dataset.", "author": "@kaiidams", "commit": "401fbd89", "needs_phonemizer": false From ba9bcf7c6bdd39d8fbbeef5d34e8c02313569a80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 4 Jun 2021 10:10:51 +0200 Subject: [PATCH 31/36] auto upload to pypi on release --- .github/workflows/pypi-release.yml | 38 +++++++++++++ TTS/VERSION | 1 + TTS/__init__.py | 8 ++- TTS/_version.py | 1 - setup.py | 91 +++++++++++++++--------------- 5 files changed, 90 insertions(+), 49 deletions(-) create mode 100644 .github/workflows/pypi-release.yml create mode 100644 TTS/VERSION delete mode 100644 TTS/_version.py diff --git a/.github/workflows/pypi-release.yml b/.github/workflows/pypi-release.yml new file mode 100644 index 00000000..d31e71cf --- /dev/null +++ b/.github/workflows/pypi-release.yml @@ -0,0 +1,38 @@ +name: Publish Python 🐍 distributions 📦 to PyPI +on: + release: + types: [published] +defaults: + run: + shell: + bash +jobs: + build-package: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v2 + - name: Verify tag matches version + run: | + set -ex + version=$(cat TTS/VERSION) + tag="${GITHUB_REF/refs\/tags\/}" + if [[ "v$version" != "$tag" ]]; then + exit 1 + fi + - uses: actions/setup-python@v2 + with: + python-version: 3.8 + - run: | + python -m pip install -U pip setuptools twine toml + python -c 'import toml; c = toml.load("pyproject.toml"); print("\n".join(c["build-system"]["requires"]))' | pip install -r /dev/stdin + - run: | + python setup.py sdist + - name: Setup PyPI config + run: | + cat << EOF > ~/.pypirc + [pypi] + username=__token__ + password=${{ secrets.PYPI_TOKEN }} + EOF + - run: | + twine upload --repository pypi dist/*.tar.gz diff --git a/TTS/VERSION b/TTS/VERSION new file mode 100644 index 00000000..13511bd9 --- /dev/null +++ b/TTS/VERSION @@ -0,0 +1 @@ +0.0.14.1-alpha.2 diff --git a/TTS/__init__.py b/TTS/__init__.py index 8dee4bf8..da35faf8 100644 --- a/TTS/__init__.py +++ b/TTS/__init__.py @@ -1 +1,7 @@ -from ._version import __version__ +import os + + +with open(os.path.join(os.path.dirname(__file__), "VERSION")) as f: + version = f.read().strip() + +__version__ = version diff --git a/TTS/_version.py b/TTS/_version.py deleted file mode 100644 index 6561790f..00000000 --- a/TTS/_version.py +++ /dev/null @@ -1 +0,0 @@ -__version__ = "0.0.15" diff --git a/setup.py b/setup.py index a68b09e0..7cfb6519 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,6 @@ import os import subprocess import sys from distutils.version import LooseVersion -from TTS._version import __version__ import numpy import setuptools.command.build_py @@ -12,82 +11,85 @@ import setuptools.command.develop from Cython.Build import cythonize from setuptools import Extension, find_packages, setup + if LooseVersion(sys.version) < LooseVersion("3.6") or LooseVersion(sys.version) > LooseVersion("3.9"): - raise RuntimeError( 
- "TTS requires python >= 3.6 and <3.9 " - "but your Python version is {}".format(sys.version) - ) + raise RuntimeError("TTS requires python >= 3.6 and <3.9 " "but your Python version is {}".format(sys.version)) -version = __version__ cwd = os.path.dirname(os.path.abspath(__file__)) +cwd = os.path.dirname(os.path.abspath(__file__)) +with open(os.path.join(cwd, "TTS", "VERSION")) as fin: + version = fin.read().strip() + + class build_py(setuptools.command.build_py.build_py): # pylint: disable=too-many-ancestors def run(self): - self.create_version_file() setuptools.command.build_py.build_py.run(self) - @staticmethod - def create_version_file(): - print('-- Building version ' + version) - version_path = os.path.join(cwd, 'version.py') - with open(version_path, 'w') as f: - f.write("__version__ = '{}'\n".format(version)) class develop(setuptools.command.develop.develop): def run(self): - build_py.create_version_file() setuptools.command.develop.develop.run(self) # The documentation for this feature is in server/README.md -package_data = ['TTS/server/templates/*'] +package_data = ["TTS/server/templates/*"] def pip_install(package_name): - subprocess.call([sys.executable, '-m', 'pip', 'install', package_name]) + subprocess.call([sys.executable, "-m", "pip", "install", package_name]) -requirements = open(os.path.join(cwd, 'requirements.txt'), 'r').readlines() -with open(os.path.join(cwd, 'requirements.notebooks.txt'), 'r') as f: +requirements = open(os.path.join(cwd, "requirements.txt"), "r").readlines() +with open(os.path.join(cwd, "requirements.notebooks.txt"), "r") as f: requirements_notebooks = f.readlines() -with open(os.path.join(cwd, 'requirements.dev.txt'), 'r') as f: +with open(os.path.join(cwd, "requirements.dev.txt"), "r") as f: requirements_dev = f.readlines() -with open(os.path.join(cwd, 'requirements.tf.txt'), 'r') as f: +with open(os.path.join(cwd, "requirements.tf.txt"), "r") as f: requirements_tf = f.readlines() requirements_all = requirements_dev + requirements_notebooks + requirements_tf -with open('README.md', "r", encoding="utf-8") as readme_file: +with open("README.md", "r", encoding="utf-8") as readme_file: README = readme_file.read() -exts = [Extension(name='TTS.tts.layers.glow_tts.monotonic_align.core', - sources=["TTS/tts/layers/glow_tts/monotonic_align/core.pyx"])] +exts = [ + Extension( + name="TTS.tts.layers.glow_tts.monotonic_align.core", + sources=["TTS/tts/layers/glow_tts/monotonic_align/core.pyx"], + ) +] setup( - name='TTS', + name="TTS", version=version, - url='https://github.com/coqui-ai/TTS', - author='Eren Gölge', - author_email='egolge@coqui.ai', - description='Deep learning for Text to Speech by Coqui.', + url="https://github.com/coqui-ai/TTS", + author="Eren Gölge", + author_email="egolge@coqui.ai", + description="Deep learning for Text to Speech by Coqui.", long_description=README, long_description_content_type="text/markdown", - license='MPL-2.0', + license="MPL-2.0", # cython include_dirs=numpy.get_include(), ext_modules=cythonize(exts, language_level=3), # ext_modules=find_cython_extensions(), # package include_package_data=True, - packages=find_packages(include=['TTS*']), + packages=find_packages(include=["TTS*"]), + package_data={ + "TTS": [ + "VERSION", + ] + }, project_urls={ - 'Documentation': 'https://github.com/coqui-ai/TTS/wiki', - 'Tracker': 'https://github.com/coqui-ai/TTS/issues', - 'Repository': 'https://github.com/coqui-ai/TTS', - 'Discussions': 'https://github.com/coqui-ai/TTS/discussions', + "Documentation": 
"https://github.com/coqui-ai/TTS/wiki", + "Tracker": "https://github.com/coqui-ai/TTS/issues", + "Repository": "https://github.com/coqui-ai/TTS", + "Discussions": "https://github.com/coqui-ai/TTS/discussions", }, cmdclass={ - 'build_py': build_py, - 'develop': develop, + "build_py": build_py, + "develop": develop, # 'build_ext': build_ext }, install_requires=requirements, @@ -97,30 +99,25 @@ setup( "notebooks": requirements_notebooks, "tf": requirements_tf, }, - python_requires='>=3.6.0, <3.9', - entry_points={ - 'console_scripts': [ - 'tts=TTS.bin.synthesize:main', - 'tts-server = TTS.server.server:main' - ] - }, + python_requires=">=3.6.0, <3.9", + entry_points={"console_scripts": ["tts=TTS.bin.synthesize:main", "tts-server = TTS.server.server:main"]}, classifiers=[ "Programming Language :: Python", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", - 'Development Status :: 3 - Alpha', + "Development Status :: 3 - Alpha", "Intended Audience :: Science/Research", "Intended Audience :: Developers", "Operating System :: POSIX :: Linux", - 'License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)', + "License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)", "Topic :: Software Development", "Topic :: Software Development :: Libraries :: Python Modules", "Topic :: Multimedia :: Sound/Audio :: Speech", "Topic :: Multimedia :: Sound/Audio", "Topic :: Multimedia", - "Topic :: Scientific/Engineering :: Artificial Intelligence" + "Topic :: Scientific/Engineering :: Artificial Intelligence", ], - zip_safe=False + zip_safe=False, ) From 203ab855c316198b084bc7a50fdbccc31e021769 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 4 Jun 2021 13:52:54 +0200 Subject: [PATCH 32/36] bump up to v0.0.15 --- TTS/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/VERSION b/TTS/VERSION index 13511bd9..ceddfb28 100644 --- a/TTS/VERSION +++ b/TTS/VERSION @@ -1 +1 @@ -0.0.14.1-alpha.2 +0.0.15 From b8b79a5e5a1f175680a63539cd4235f18cb3ead8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 4 Jun 2021 14:02:53 +0200 Subject: [PATCH 33/36] fix `use_cuda` bug in `server.py` --- TTS/server/server.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/TTS/server/server.py b/TTS/server/server.py index 15a6b292..dc025b32 100644 --- a/TTS/server/server.py +++ b/TTS/server/server.py @@ -99,7 +99,9 @@ if args.vocoder_path is not None: vocoder_config_path = args.vocoder_config_path # load models -synthesizer = Synthesizer(model_path, config_path, speakers_file_path, vocoder_path, vocoder_config_path, args.use_cuda) +synthesizer = Synthesizer( + model_path, config_path, speakers_file_path, vocoder_path, vocoder_config_path, use_cuda=args.use_cuda +) use_multi_speaker = synthesizer.speaker_manager is not None # TODO: set this from SpeakerManager From ed6e109aecad4a34576ac5288ca4cf57cf0e7141 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 8 Jun 2021 09:17:05 +0200 Subject: [PATCH 34/36] add missing VERSION to manifest --- MANIFEST.in | 1 + 1 file changed, 1 insertion(+) diff --git a/MANIFEST.in b/MANIFEST.in index 664295c7..861cb5a7 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,6 +1,7 @@ include README.md include LICENSE.txt include requirements.*.txt +include TTS/VERSION recursive-include TTS *.json recursive-include TTS *.html recursive-include TTS *.png From 
From b0aa18934870cb0120703346c766325af81135bc Mon Sep 17 00:00:00 2001
From: Adam Froghyar
Date: Mon, 14 Jun 2021 10:44:00 +0200
Subject: [PATCH 35/36] Forcing do_trim_silence to False in the extract TTS
 script

---
 TTS/bin/extract_tts_spectrograms.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py
index ace7464a..4eb79d76 100755
--- a/TTS/bin/extract_tts_spectrograms.py
+++ b/TTS/bin/extract_tts_spectrograms.py
@@ -299,4 +299,5 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     c = load_config(args.config_path)
+    c.audio['do_trim_silence'] = False  # IMPORTANT!!!!!!!!!!!!!!! disable to align mel
     main(args)

From d85ee901d57b4a08301ef569d3c48dd032508ff7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Tue, 15 Jun 2021 10:53:53 +0200
Subject: [PATCH 36/36] Fix #571

---
 TTS/bin/extract_tts_spectrograms.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py
index ace7464a..2be9d760 100755
--- a/TTS/bin/extract_tts_spectrograms.py
+++ b/TTS/bin/extract_tts_spectrograms.py
@@ -299,4 +299,5 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     c = load_config(args.config_path)
+    c.audio['do_trim_silence'] = False
     main(args)
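Both of the last two patches pin do_trim_silence to False for the same reason: a mel spectrogram has roughly one frame per hop_length samples, so trimming leading or trailing silence changes the audio length and the extracted mels stop lining up with the untrimmed wavs used downstream (e.g. for vocoder fine-tuning). A back-of-the-envelope check, assuming the usual LJSpeech settings (22050 Hz, hop_length=256); the exact frame count depends on the STFT's padding mode:

    import math

    def mel_frame_count(num_samples, hop_length=256):
        # One frame per hop; ceil is a common approximation, the exact
        # count depends on the STFT's centering/padding behaviour.
        return math.ceil(num_samples / hop_length)

    full = mel_frame_count(5 * 22050)            # 5.0 s clip    -> 431 frames
    trimmed = mel_frame_count(int(4.5 * 22050))  # 0.5 s trimmed -> 388 frames
    print(full - trimmed)  # 43 frames of drift from half a second of silence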