Japanese Tacotron 2 model

This commit is contained in:
Katsuya Iida 2021-05-22 17:12:19 +09:00
parent 5482a0f62d
commit 0536aa6d0f
7 changed files with 597 additions and 0 deletions

View File

@@ -0,0 +1,173 @@
{
"model": "Tacotron2",
"run_name": "kokoro-ddc",
"run_description": "tacotron2 with DDC and differential spectral loss.",
// AUDIO PARAMETERS
"audio":{
// stft parameters
"fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame.
"win_length": 1024, // stft window length in ms.
"hop_length": 256, // stft window hop-lengh in ms.
"frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used.
"frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used.
// Audio processing parameters
"sample_rate": 22050, // DATASET-RELATED: wav sample-rate.
"preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
// Silence trimming
"do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (true), TWEB (false), Nancy (true)
"trim_db": 60, // threshold for timming silence. Set this according to your dataset.
// Griffin-Lim
"power": 1.5, // value to sharpen wav signals after GL algorithm.
"griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
// MelSpectrogram parameters
"num_mels": 80, // size of the mel spec frame.
"mel_fmin": 50.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
"mel_fmax": 7600.0, // maximum freq level for mel-spec. Tune for dataset!!
"spec_gain": 1,
// Normalization parameters
"signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
"min_level_db": -100, // lower bound for normalization
"symmetric_norm": true, // move normalization to range [-1, 1]
"max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
"clip_norm": true, // clip normalized values into the range.
"stats_path": "./scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
},
// VOCABULARY PARAMETERS
// if custom character set is not defined,
// default set in symbols.py is used
"characters":{
"pad": "_",
"eos": "~",
"bos": "^",
"characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
"punctuations": "!'(),-.:;? ",
"phonemes": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
},
// DISTRIBUTED TRAINING
"distributed":{
"backend": "nccl",
"url": "tcp:\/\/localhost:54321"
},
"reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
// TRAINING
"batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
"eval_batch_size":16,
"r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
"gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed.
"mixed_precision": true, // level of optimization with NVIDIA's apex feature for automatic mixed FP16/FP32 precision (AMP), NOTE: currently only O1 is supported, and use "O1" to activate.
// LOSS SETTINGS
"loss_masking": true, // enable / disable loss masking against the sequence padding.
"decoder_loss_alpha": 0.5, // original decoder loss weight. If > 0, it is enabled
"postnet_loss_alpha": 0.25, // original postnet loss weight. If > 0, it is enabled
"postnet_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled
"decoder_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled
"decoder_ssim_alpha": 0.5, // decoder ssim loss weight. If > 0, it is enabled
"postnet_ssim_alpha": 0.25, // postnet ssim loss weight. If > 0, it is enabled
"ga_alpha": 5.0, // weight for guided attention loss. If > 0, guided attention is enabled.
"stopnet_pos_weight": 15.0, // pos class weight for stopnet loss since there are way more negative samples than positive samples.
// VALIDATION
"run_eval": true,
"test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time.
"test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
// OPTIMIZER
"noam_schedule": false, // use noam warmup and lr schedule.
"grad_clip": 1.0, // upper limit for gradients for clipping.
"epochs": 1000, // total number of epochs to train.
"lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
"wd": 0.000001, // Weight decay weight.
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
"seq_len_norm": false, // Normalize eash sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths.
// TACOTRON PRENET
"memory_size": -1, // ONLY TACOTRON - size of the memory queue used fro storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame.
"prenet_type": "original", // "original" or "bn".
"prenet_dropout": true, // enable/disable dropout at prenet.
// TACOTRON ATTENTION
"attention_type": "original", // 'original' , 'graves', 'dynamic_convolution'
"attention_heads": 4, // number of attention heads (only for 'graves')
"attention_norm": "sigmoid", // softmax or sigmoid.
"windowing": false, // Enables attention windowing. Used only in eval mode.
"use_forward_attn": false, // if it uses forward attention. In general, it aligns faster.
"forward_attn_mask": false, // Additional masking forcing monotonicity only in eval mode.
"transition_agent": false, // enable/disable transition agent of forward attention.
"location_attn": true, // enable_disable location sensitive attention. It is enabled for TACOTRON by default.
"bidirectional_decoder": false, // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset.
"double_decoder_consistency": true, // use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/
"ddc_r": 7, // reduction rate for coarse decoder.
// STOPNET
"stopnet": true, // Train stopnet predicting the end of synthesis.
"separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER.
// TENSORBOARD and LOGGING
"print_step": 25, // Number of steps to log training on console.
"tb_plot_step": 100, // Number of steps to plot TB training figures.
"print_eval": false, // If True, it prints intermediate loss values in evalulation.
"save_step": 10000, // Number of training steps expected to save traninpg stats and checkpoints.
"checkpoint": true, // If true, it saves checkpoints per "save_step"
"keep_all_best": false, // If true, keeps all best_models after keep_after steps
"keep_after": 10000, // Global step after which to keep best models if keep_all_best is true
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
// DATA LOADING
"text_cleaner": "basic_cleaners",
"enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
"num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values.
"num_val_loader_workers": 4, // number of evaluation data loader processes.
"batch_group_size": 4, //Number of batches to shuffle after bucketing.
"min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training
"max_seq_len": 153, // DATASET-RELATED: maximum text length
"compute_input_seq_cache": false, // if true, text sequences are computed before starting training. If phonemes are enabled, they are also computed at this stage.
"use_noise_augment": true,
// PATHS
"output_path": "./Models/Kokoro/",
// PHONEMES
"phoneme_cache_path": "./phoneme_cache/", // phoneme computation is slow, therefore, it caches results in the given folder.
"use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation.
"phoneme_language": "ja-jp", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
// MULTI-SPEAKER and GST
"use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning.
"use_gst": false, // use global style tokens
"use_external_speaker_embedding_file": false, // if true, forces the model to use external embedding per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558
"external_speaker_embedding_file": "../../speakers-vctk-en.json", // if not null and use_external_speaker_embedding_file is true, it is used to load a specific embedding file and thus uses these embeddings instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558
"gst": { // gst parameter if gst is enabled
"gst_style_input": null, // Condition the style input either on a
// -> wave file [path to wave] or
// -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15}
// with the dictionary being len(dict) <= len(gst_style_tokens).
"gst_embedding_dim": 512,
"gst_num_heads": 4,
"gst_style_tokens": 10,
"gst_use_speaker_embedding": false
},
// DATASETS
"datasets": // List of datasets. They all merged and they get different speaker_ids.
[
{
"name": "kokoro",
"path": "./kokoro-speech-v1_1-small/",
"meta_file_train": "metadata.csv", // for vtck if list, ignore speakers id in list for train, its useful for test cloning with new speakers
"meta_file_val": null
}
]
}
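
A hedged sketch (not the trainer's actual API) of how a "gradual_training" schedule like the one above is interpreted: each entry is [first_step, r, batch_size], and the last entry whose first_step has been reached determines the current values.

    def lookup_gradual_training(schedule, global_step):
        # entries are sorted by first_step; the last reached entry wins
        r, batch_size = schedule[0][1], schedule[0][2]
        for first_step, new_r, new_bs in schedule:
            if global_step >= first_step:
                r, batch_size = new_r, new_bs
        return r, batch_size

    schedule = [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]]
    lookup_gradual_training(schedule, 60000)  # -> (3, 32), i.e. r=3, batch_size=32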

View File

@@ -424,3 +424,17 @@ def baker(root_path: str, meta_file: str) -> List[List[str]]:
        wav_path = os.path.join(root_path, "clips_22", wav_name)
        items.append([text, wav_path, speaker_name])
    return items


def kokoro(root_path, meta_file):
    """Japanese single-speaker dataset from https://github.com/kaiidams/Kokoro-Speech-Dataset"""
    txt_file = os.path.join(root_path, meta_file)
    items = []
    speaker_name = "kokoro"
    with open(txt_file, "r") as ttf:
        for line in ttf:
            cols = line.split("|")
            wav_file = os.path.join(root_path, "wavs", cols[0] + '.wav')
            text = cols[2].replace(" ", "")
            items.append([text, wav_file, speaker_name])
    return items
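# Hedged illustration (not part of this commit). The metadata column layout
# is assumed from the code above: cols[0] is the clip id and cols[2] is
# space-separated text whose spaces are stripped. A hypothetical row such as
#   meian_0000|今日は温泉に行きます|今日 は 温泉 に 行き ます
# would yield roughly the item:
#   ['今日は温泉に行きます', os.path.join(root_path, 'wavs', 'meian_0000.wav'), 'kokoro']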

View File

@@ -0,0 +1 @@
from .text import japanese_text2phone

View File

@@ -0,0 +1,380 @@
# Convert Japanese text to phonemes which is
# compatible with Julius https://github.com/julius-speech/segmentation-kit
import re
import MeCab
from typing import List, Tuple
_CONVRULES = [
# Conversion of 2 letters
'アァ/ a a',
'イィ/ i i',
'イェ/ i e',
'イャ/ y a',
'ウゥ/ u:',
'エェ/ e e',
'オォ/ o:',
'カァ/ k a:',
'キィ/ k i:',
'クゥ/ k u:',
'クャ/ ky a',
'クュ/ ky u',
'クョ/ ky o',
'ケェ/ k e:',
'コォ/ k o:',
'ガァ/ g a:',
'ギィ/ g i:',
'グゥ/ g u:',
'グャ/ gy a',
'グュ/ gy u',
'グョ/ gy o',
'ゲェ/ g e:',
'ゴォ/ g o:',
'サァ/ s a:',
'シィ/ sh i:',
'スゥ/ s u:',
'スャ/ sh a',
'スュ/ sh u',
'スョ/ sh o',
'セェ/ s e:',
'ソォ/ s o:',
'ザァ/ z a:',
'ジィ/ j i:',
'ズゥ/ z u:',
'ズャ/ zy a',
'ズュ/ zy u',
'ズョ/ zy o',
'ゼェ/ z e:',
'ゾォ/ z o:',
'タァ/ t a:',
'チィ/ ch i:',
'ツァ/ ts a',
'ツィ/ ts i',
'ツゥ/ ts u:',
'ツャ/ ch a',
'ツュ/ ch u',
'ツョ/ ch o',
'ツェ/ ts e',
'ツォ/ ts o',
'テェ/ t e:',
'トォ/ t o:',
'ダァ/ d a:',
'ヂィ/ j i:',
'ヅゥ/ d u:',
'ヅャ/ zy a',
'ヅュ/ zy u',
'ヅョ/ zy o',
'デェ/ d e:',
'ドォ/ d o:',
'ナァ/ n a:',
'ニィ/ n i:',
'ヌゥ/ n u:',
'ヌャ/ ny a',
'ヌュ/ ny u',
'ヌョ/ ny o',
'ネェ/ n e:',
'ノォ/ n o:',
'ハァ/ h a:',
'ヒィ/ h i:',
'フゥ/ f u:',
'フャ/ hy a',
'フュ/ hy u',
'フョ/ hy o',
'ヘェ/ h e:',
'ホォ/ h o:',
'バァ/ b a:',
'ビィ/ b i:',
'ブゥ/ b u:',
'フャ/ hy a',
'ブュ/ by u',
'フョ/ hy o',
'ベェ/ b e:',
'ボォ/ b o:',
'パァ/ p a:',
'ピィ/ p i:',
'プゥ/ p u:',
'プャ/ py a',
'プュ/ py u',
'プョ/ py o',
'ペェ/ p e:',
'ポォ/ p o:',
'マァ/ m a:',
'ミィ/ m i:',
'ムゥ/ m u:',
'ムャ/ my a',
'ムュ/ my u',
'ムョ/ my o',
'メェ/ m e:',
'モォ/ m o:',
'ヤァ/ y a:',
'ユゥ/ y u:',
'ユャ/ y a:',
'ユュ/ y u:',
'ユョ/ y o:',
'ヨォ/ y o:',
'ラァ/ r a:',
'リィ/ r i:',
'ルゥ/ r u:',
'ルャ/ ry a',
'ルュ/ ry u',
'ルョ/ ry o',
'レェ/ r e:',
'ロォ/ r o:',
'ワァ/ w a:',
'ヲォ/ o:',
'ディ/ d i',
'デェ/ d e:',
'デャ/ dy a',
'デュ/ dy u',
'デョ/ dy o',
'ティ/ t i',
'テェ/ t e:',
'テャ/ ty a',
'テュ/ ty u',
'テョ/ ty o',
'スィ/ s i',
'ズァ/ z u a',
'ズィ/ z i',
'ズゥ/ z u',
'ズャ/ zy a',
'ズュ/ zy u',
'ズョ/ zy o',
'ズェ/ z e',
'ズォ/ z o',
'キャ/ ky a',
'キュ/ ky u',
'キョ/ ky o',
'シャ/ sh a',
'シュ/ sh u',
'シェ/ sh e',
'ショ/ sh o',
'チャ/ ch a',
'チュ/ ch u',
'チェ/ ch e',
'チョ/ ch o',
'トゥ/ t u',
'トャ/ ty a',
'トュ/ ty u',
'トョ/ ty o',
'ドァ/ d o a',
'ドゥ/ d u',
'ドャ/ dy a',
'ドュ/ dy u',
'ドョ/ dy o',
'ドォ/ d o:',
'ニャ/ ny a',
'ニュ/ ny u',
'ニョ/ ny o',
'ヒャ/ hy a',
'ヒュ/ hy u',
'ヒョ/ hy o',
'ミャ/ my a',
'ミュ/ my u',
'ミョ/ my o',
'リャ/ ry a',
'リュ/ ry u',
'リョ/ ry o',
'ギャ/ gy a',
'ギュ/ gy u',
'ギョ/ gy o',
'ヂェ/ j e',
'ヂャ/ j a',
'ヂュ/ j u',
'ヂョ/ j o',
'ジェ/ j e',
'ジャ/ j a',
'ジュ/ j u',
'ジョ/ j o',
'ビャ/ by a',
'ビュ/ by u',
'ビョ/ by o',
'ピャ/ py a',
'ピュ/ py u',
'ピョ/ py o',
'ウァ/ u a',
'ウィ/ w i',
'ウェ/ w e',
'ウォ/ w o',
'ファ/ f a',
'フィ/ f i',
'フゥ/ f u',
'フャ/ hy a',
'フュ/ hy u',
'フョ/ hy o',
'フェ/ f e',
'フォ/ f o',
'ヴァ/ b a',
'ヴィ/ b i',
'ヴェ/ b e',
'ヴォ/ b o',
'ヴュ/ by u',
# Conversion of 1 letter
'ア/ a',
'イ/ i',
'ウ/ u',
'エ/ e',
'オ/ o',
'カ/ k a',
'キ/ k i',
'ク/ k u',
'ケ/ k e',
'コ/ k o',
'サ/ s a',
'シ/ sh i',
'ス/ s u',
'セ/ s e',
'ソ/ s o',
'タ/ t a',
'チ/ ch i',
'ツ/ ts u',
'テ/ t e',
'ト/ t o',
'ナ/ n a',
'ニ/ n i',
'ヌ/ n u',
'ネ/ n e',
'ノ/ n o',
'ハ/ h a',
'ヒ/ h i',
'フ/ f u',
'ヘ/ h e',
'ホ/ h o',
'マ/ m a',
'ミ/ m i',
'ム/ m u',
'メ/ m e',
'モ/ m o',
'ラ/ r a',
'リ/ r i',
'ル/ r u',
'レ/ r e',
'ロ/ r o',
'ガ/ g a',
'ギ/ g i',
'グ/ g u',
'ゲ/ g e',
'ゴ/ g o',
'ザ/ z a',
'ジ/ j i',
'ズ/ z u',
'ゼ/ z e',
'ゾ/ z o',
'ダ/ d a',
'ヂ/ j i',
'ヅ/ z u',
'デ/ d e',
'ド/ d o',
'バ/ b a',
'ビ/ b i',
'ブ/ b u',
'ベ/ b e',
'ボ/ b o',
'パ/ p a',
'ピ/ p i',
'プ/ p u',
'ペ/ p e',
'ポ/ p o',
'ヤ/ y a',
'ユ/ y u',
'ヨ/ y o',
'ワ/ w a',
'ヰ/ i',
'ヱ/ e',
'ヲ/ o',
'ン/ N',
'ッ/ q',
'ヴ/ b u',
'ー/:',
# Try converting broken text
'ァ/ a',
'ィ/ i',
'ゥ/ u',
'ェ/ e',
'ォ/ o',
'ヮ/ w a',
'ォ/ o',
# Symbols
'、/ ,',
'。/ .',
'！/ !',
'？/ ?',
'・/ ,'
]
_COLON_RX = re.compile(':+')
_REJECT_RX = re.compile('[^ a-zA-Z:,.?]')
def _makerulemap():
    l = [tuple(x.split('/')) for x in _CONVRULES]
    return tuple(
        {k: v for k, v in l if len(k) == i}
        for i in (1, 2)
    )


_RULEMAP1, _RULEMAP2 = _makerulemap()
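# Illustration (not part of the original commit): after the split above,
# _RULEMAP1 keys single kana and _RULEMAP2 keys kana pairs, with values
# keeping the leading space of the rule strings, e.g.
#   _RULEMAP1['ア'] == ' a'        (from 'ア/ a')
#   _RULEMAP2['キャ'] == ' ky a'   (from 'キャ/ ky a')
# kata2phoneme() below strips the leading space of the final result.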
def kata2phoneme(text: str) -> str:
    """Convert katakana text to phonemes."""
    text = text.strip()
    res = ''
    while text:
        if len(text) >= 2:
            x = _RULEMAP2.get(text[:2])
            if x is not None:
                text = text[2:]
                res += x
                continue
        x = _RULEMAP1.get(text[0])
        if x is not None:
            text = text[1:]
            res += x
            continue
        res += ' ' + text[0]
        text = text[1:]
    res = _COLON_RX.sub(':', res)
    return res[1:]
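# Illustration (not part of the original commit): the loop is a greedy
# longest-match, trying two-kana rules before single-kana ones; the 'ー'
# rule emits ':' and _COLON_RX collapses runs of ':' into one, e.g.
#   kata2phoneme('キョウ')     -> 'ky o u'
#   kata2phoneme('トーキョー') -> 't o: ky o:'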
_KATAKANA = ''.join(chr(ch) for ch in range(ord('ァ'), ord('ン') + 1))
_HIRAGANA = ''.join(chr(ch) for ch in range(ord('ぁ'), ord('ん') + 1))
_HIRA2KATATRANS = str.maketrans(_HIRAGANA, _KATAKANA)
def hira2kata(text: str) -> str:
    text = text.translate(_HIRA2KATATRANS)
    return text.replace('う゛', 'ヴ')
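# Illustration (not part of the original commit): only the hiragana block
# is translated; katakana, kanji and other scripts pass through unchanged,
# e.g. hira2kata('どちらに行きますか') -> 'ドチラニ行キマスカ'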
_SYMBOL_TOKENS = set(list('・、。？！'))
_NO_YOMI_TOKENS = set(list('「」『』―（）［］[] …'))
_TAGGER = MeCab.Tagger()
def text2kata(text: str) -> str:
    parsed = _TAGGER.parse(text)
    res = []
    for line in parsed.split('\n'):
        if line == 'EOS':
            break
        parts = line.split('\t')
        word, yomi = parts[0], parts[1]
        if yomi:
            res.append(yomi)
        else:
            if word in _SYMBOL_TOKENS:
                res.append(word)
            elif word == 'っ' or word == 'ッ':
                res.append('ッ')
            elif word in _NO_YOMI_TOKENS:
                pass
            else:
                res.append(word)
    return hira2kata(''.join(res))
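# Hedged illustration (not part of the original commit): the output depends
# on the installed MeCab dictionary (unidic-lite is pinned in
# requirements.txt), and parts[1] is assumed to hold the katakana
# pronunciation field. Per the test case added below, one would expect roughly
#   text2kata('今日は温泉に、行きます。') -> 'キョーワオンセンニ、イキマス。'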
def japanese_text2phone(text: str) -> str:
    """Convert Japanese text to phonemes."""
    res = text2kata(text)
    res = kata2phoneme(res)
    return res.replace(' ', '')
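# End-to-end examples, taken from the test cases added below:
#   japanese_text2phone('どちらに行きますか？') -> 'dochiraniikimasuka?'
#   japanese_text2phone('そうですね！')        -> 'so:desune!'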

View File

@@ -0,0 +1,22 @@
import unittest

from . import japanese_text2phone

_TEST_CASES = '''
どちらに行きますか？/dochiraniikimasuka?
今日は温泉に、行きます。/kyo:waoNseNni,ikimasu.
AからZまでです。/AkaraZmadedesu.
そうですね！/so:desune!
クジラは哺乳類です。/kujirawahonyu:ruidesu.
ヴィディオを見ます。/bidioomimasu.
ky o: w a o N s e N n i , i k i m a s u ./kyo:waoNseNni,ikimasu.
'''


class TestText(unittest.TestCase):
    def test_text2phone(self):
        for line in _TEST_CASES.strip().split('\n'):
            text, phone = line.split('/')
            self.assertEqual(japanese_text2phone(text), phone)


if __name__ == '__main__':
    unittest.main()
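
Assuming the package layout implied by the relative import above (file paths are not shown in this view), the suite can be run as a module, e.g. "python -m unittest TTS.tts.utils.japanese.test"; the exact module name here is a guess.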

View File

@@ -39,6 +39,11 @@ def text2phone(text, language):
    if language == "zh-CN":
        ph = chinese_text_to_phonemes(text)
        return ph
    elif language == "ja-jp":
        from TTS.tts.utils.japanese import japanese_text2phone

        ph = japanese_text2phone(text)
        return ph
    raise ValueError(f" [!] Language {language} is not supported for phonemization.")
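# Hedged usage sketch (not part of the diff): with this branch in place,
# Japanese input is routed through japanese_text2phone, so per the new
# tests one gets, e.g.
#   text2phone('そうですね！', 'ja-jp') -> 'so:desune!'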

View File

@@ -19,3 +19,5 @@
umap-learn==0.4.6
unidecode==0.4.20
coqpit
mecab-python3
unidic-lite