Merge pull request #502 from kaiidams/kaiidams/kokoro

Japanese Tacotron 2 model
2021-06-02 10:20:08 +02:00 · 2021-06-02 10:20:08 +02:00 · 73b4083c6c
parent ea31215087 6d8310d2a9
commit 73b4083c6c
9 changed files with 572 additions and 1 deletions
--- a/TTS/_version.py
+++ b/TTS/_version.py
@ -1 +1 @@
-__version__ = "0.0.14.1"
+__version__ = "0.0.14"
--- a/TTS/tts/datasets/preprocess.py
+++ b/TTS/tts/datasets/preprocess.py
@ -424,3 +424,17 @@ def baker(root_path: str, meta_file: str) -> List[List[str]]:
            wav_path = os.path.join(root_path, "clips_22", wav_name)
            items.append([text, wav_path, speaker_name])
    return items
 def kokoro(root_path, meta_file):
    """Load metadata of the Japanese single-speaker Kokoro dataset.

    Dataset: https://github.com/kaiidams/Kokoro-Speech-Dataset

    Args:
        root_path: Dataset root directory; wav files are looked up in its
            ``wavs`` sub-directory.
        meta_file: Name of the ``|``-separated metadata file, relative to
            ``root_path``. The first column is the clip id and the third
            column is the text to use.

    Returns:
        A list of ``[text, wav_path, speaker_name]`` items, one per line of
        the metadata file. Spaces are removed from the text.
    """
    txt_file = os.path.join(root_path, meta_file)
    items = []
    speaker_name = "kokoro"
    # The dataset is Japanese: decode explicitly as UTF-8 instead of relying
    # on the platform default encoding (which is not UTF-8 everywhere).
    with open(txt_file, "r", encoding="utf-8") as ttf:
        for line in ttf:
            cols = line.split("|")
            wav_file = os.path.join(root_path, "wavs", cols[0] + '.wav')
            text = cols[2].replace(" ", "")
            items.append([text, wav_file, speaker_name])
    return items
--- a/TTS/tts/utils/text/init.py
+++ b/TTS/tts/utils/text/init.py
@ -6,6 +6,7 @@ from packaging import version
 from TTS.tts.utils.text import cleaners
 from TTS.tts.utils.text.chinese_mandarin.phonemizer import chinese_text_to_phonemes
 from TTS.tts.utils.text.japanese.phonemizer import japanese_text_to_phonemes
 from TTS.tts.utils.text.symbols import _bos, _eos, _punctuations, make_symbols, phonemes, symbols
 # pylint: disable=unnecessary-comprehension
@ -39,6 +40,11 @@ def text2phone(text, language):
    if language == "zh-CN":
        ph = chinese_text_to_phonemes(text)
        return ph
    if language == "ja-jp":
        ph = japanese_text_to_phonemes(text)
        return ph
    raise ValueError(f" [!] Language {language} is not supported for phonemization.")
--- a/TTS/tts/utils/text/japanese/init.py
+++ b/TTS/tts/utils/text/japanese/init.py
--- a/TTS/tts/utils/text/japanese/phonemizer.py
+++ b/TTS/tts/utils/text/japanese/phonemizer.py
@ -0,0 +1,379 @@
 # Convert Japanese text to phonemes that are
 # compatible with Julius https://github.com/julius-speech/segmentation-kit
 import re
 import MeCab
 _CONVRULES = [
    # Conversion of 2 letters
    'アァ/ a a',
    'イィ/ i i',
    'イェ/ i e',
    'イャ/ y a',
    'ウゥ/ u:',
    'エェ/ e e',
    'オォ/ o:',
    'カァ/ k a:',
    'キィ/ k i:',
    'クゥ/ k u:',
    'クャ/ ky a',
    'クュ/ ky u',
    'クョ/ ky o',
    'ケェ/ k e:',
    'コォ/ k o:',
    'ガァ/ g a:',
    'ギィ/ g i:',
    'グゥ/ g u:',
    'グャ/ gy a',
    'グュ/ gy u',
    'グョ/ gy o',
    'ゲェ/ g e:',
    'ゴォ/ g o:',
    'サァ/ s a:',
    'シィ/ sh i:',
    'スゥ/ s u:',
    'スャ/ sh a',
    'スュ/ sh u',
    'スョ/ sh o',
    'セェ/ s e:',
    'ソォ/ s o:',
    'ザァ/ z a:',
    'ジィ/ j i:',
    'ズゥ/ z u:',
    'ズャ/ zy a',
    'ズュ/ zy u',
    'ズョ/ zy o',
    'ゼェ/ z e:',
    'ゾォ/ z o:',
    'タァ/ t a:',
    'チィ/ ch i:',
    'ツァ/ ts a',
    'ツィ/ ts i',
    'ツゥ/ ts u:',
    'ツャ/ ch a',
    'ツュ/ ch u',
    'ツョ/ ch o',
    'ツェ/ ts e',
    'ツォ/ ts o',
    'テェ/ t e:',
    'トォ/ t o:',
    'ダァ/ d a:',
    'ヂィ/ j i:',
    'ヅゥ/ d u:',
    'ヅャ/ zy a',
    'ヅュ/ zy u',
    'ヅョ/ zy o',
    'デェ/ d e:',
    'ドォ/ d o:',
    'ナァ/ n a:',
    'ニィ/ n i:',
    'ヌゥ/ n u:',
    'ヌャ/ ny a',
    'ヌュ/ ny u',
    'ヌョ/ ny o',
    'ネェ/ n e:',
    'ノォ/ n o:',
    'ハァ/ h a:',
    'ヒィ/ h i:',
    'フゥ/ f u:',
    'フャ/ hy a',
    'フュ/ hy u',
    'フョ/ hy o',
    'ヘェ/ h e:',
    'ホォ/ h o:',
    'バァ/ b a:',
    'ビィ/ b i:',
    'ブゥ/ b u:',
    'フャ/ hy a',
    'ブュ/ by u',
    'フョ/ hy o',
    'ベェ/ b e:',
    'ボォ/ b o:',
    'パァ/ p a:',
    'ピィ/ p i:',
    'プゥ/ p u:',
    'プャ/ py a',
    'プュ/ py u',
    'プョ/ py o',
    'ペェ/ p e:',
    'ポォ/ p o:',
    'マァ/ m a:',
    'ミィ/ m i:',
    'ムゥ/ m u:',
    'ムャ/ my a',
    'ムュ/ my u',
    'ムョ/ my o',
    'メェ/ m e:',
    'モォ/ m o:',
    'ヤァ/ y a:',
    'ユゥ/ y u:',
    'ユャ/ y a:',
    'ユュ/ y u:',
    'ユョ/ y o:',
    'ヨォ/ y o:',
    'ラァ/ r a:',
    'リィ/ r i:',
    'ルゥ/ r u:',
    'ルャ/ ry a',
    'ルュ/ ry u',
    'ルョ/ ry o',
    'レェ/ r e:',
    'ロォ/ r o:',
    'ワァ/ w a:',
    'ヲォ/ o:',
    'ディ/ d i',
    'デェ/ d e:',
    'デャ/ dy a',
    'デュ/ dy u',
    'デョ/ dy o',
    'ティ/ t i',
    'テェ/ t e:',
    'テャ/ ty a',
    'テュ/ ty u',
    'テョ/ ty o',
    'スィ/ s i',
    'ズァ/ z u a',
    'ズィ/ z i',
    'ズゥ/ z u',
    'ズャ/ zy a',
    'ズュ/ zy u',
    'ズョ/ zy o',
    'ズェ/ z e',
    'ズォ/ z o',
    'キャ/ ky a',
    'キュ/ ky u',
    'キョ/ ky o',
    'シャ/ sh a',
    'シュ/ sh u',
    'シェ/ sh e',
    'ショ/ sh o',
    'チャ/ ch a',
    'チュ/ ch u',
    'チェ/ ch e',
    'チョ/ ch o',
    'トゥ/ t u',
    'トャ/ ty a',
    'トュ/ ty u',
    'トョ/ ty o',
    'ドァ/ d o a',
    'ドゥ/ d u',
    'ドャ/ dy a',
    'ドュ/ dy u',
    'ドョ/ dy o',
    'ドォ/ d o:',
    'ニャ/ ny a',
    'ニュ/ ny u',
    'ニョ/ ny o',
    'ヒャ/ hy a',
    'ヒュ/ hy u',
    'ヒョ/ hy o',
    'ミャ/ my a',
    'ミュ/ my u',
    'ミョ/ my o',
    'リャ/ ry a',
    'リュ/ ry u',
    'リョ/ ry o',
    'ギャ/ gy a',
    'ギュ/ gy u',
    'ギョ/ gy o',
    'ヂェ/ j e',
    'ヂャ/ j a',
    'ヂュ/ j u',
    'ヂョ/ j o',
    'ジェ/ j e',
    'ジャ/ j a',
    'ジュ/ j u',
    'ジョ/ j o',
    'ビャ/ by a',
    'ビュ/ by u',
    'ビョ/ by o',
    'ピャ/ py a',
    'ピュ/ py u',
    'ピョ/ py o',
    'ウァ/ u a',
    'ウィ/ w i',
    'ウェ/ w e',
    'ウォ/ w o',
    'ファ/ f a',
    'フィ/ f i',
    'フゥ/ f u',
    'フャ/ hy a',
    'フュ/ hy u',
    'フョ/ hy o',
    'フェ/ f e',
    'フォ/ f o',
    'ヴァ/ b a',
    'ヴィ/ b i',
    'ヴェ/ b e',
    'ヴォ/ b o',
    'ヴュ/ by u',
    # Conversion of 1 letter
    'ア/ a',
    'イ/ i',
    'ウ/ u',
    'エ/ e',
    'オ/ o',
    'カ/ k a',
    'キ/ k i',
    'ク/ k u',
    'ケ/ k e',
    'コ/ k o',
    'サ/ s a',
    'シ/ sh i',
    'ス/ s u',
    'セ/ s e',
    'ソ/ s o',
    'タ/ t a',
    'チ/ ch i',
    'ツ/ ts u',
    'テ/ t e',
    'ト/ t o',
    'ナ/ n a',
    'ニ/ n i',
    'ヌ/ n u',
    'ネ/ n e',
    'ノ/ n o',
    'ハ/ h a',
    'ヒ/ h i',
    'フ/ f u',
    'ヘ/ h e',
    'ホ/ h o',
    'マ/ m a',
    'ミ/ m i',
    'ム/ m u',
    'メ/ m e',
    'モ/ m o',
    'ラ/ r a',
    'リ/ r i',
    'ル/ r u',
    'レ/ r e',
    'ロ/ r o',
    'ガ/ g a',
    'ギ/ g i',
    'グ/ g u',
    'ゲ/ g e',
    'ゴ/ g o',
    'ザ/ z a',
    'ジ/ j i',
    'ズ/ z u',
    'ゼ/ z e',
    'ゾ/ z o',
    'ダ/ d a',
    'ヂ/ j i',
    'ヅ/ z u',
    'デ/ d e',
    'ド/ d o',
    'バ/ b a',
    'ビ/ b i',
    'ブ/ b u',
    'ベ/ b e',
    'ボ/ b o',
    'パ/ p a',
    'ピ/ p i',
    'プ/ p u',
    'ペ/ p e',
    'ポ/ p o',
    'ヤ/ y a',
    'ユ/ y u',
    'ヨ/ y o',
    'ワ/ w a',
    'ヰ/ i',
    'ヱ/ e',
    'ヲ/ o',
    'ン/ N',
    'ッ/ q',
    'ヴ/ b u',
    'ー/:',
    # Try converting broken text
    'ァ/ a',
    'ィ/ i',
    'ゥ/ u',
    'ェ/ e',
    'ォ/ o',
    'ヮ/ w a',
    'ォ/ o',
    # Symbols
    '、/ ,',
    '。/ .',
    '！/ !',
    '？/ ?',
    '・/ ,'
 ]
 # Matches a run of one or more ':'; kata2phoneme uses it to collapse
 # repeated long-vowel marks into a single ':'.
 _COLON_RX = re.compile(':+')
 # Matches any character other than a space, an ASCII letter, ':', ',', '.' or '?'.
 # NOTE(review): not referenced in the visible part of this module - confirm it is still needed.
 _REJECT_RX = re.compile('[^ a-zA-Z:,.?]')
 def _makerulemap():
    """Build the lookup tables from _CONVRULES.

    Each rule is split at '/' into a key and a phoneme string. Returns a
    2-tuple of dicts: the first holds the rules whose key is one character
    long, the second those whose key is two characters long. Rules with any
    other key length are left out.
    """
    by_length = {1: {}, 2: {}}
    for rule in _CONVRULES:
        # Exactly one '/' is expected per rule; anything else fails to unpack.
        kana, phoneme = rule.split('/')
        table = by_length.get(len(kana))
        if table is not None:
            table[kana] = phoneme
    return by_length[1], by_length[2]
 # Lookup tables built once at import time: _RULEMAP1 is keyed by
 # one-character sequences, _RULEMAP2 by two-character sequences.
 _RULEMAP1, _RULEMAP2 = _makerulemap()
 def kata2phoneme(text: str) -> str:
    """Convert katakana text to phonemes.

    The stripped input is scanned left to right. At each position a
    two-character rule is tried first, then a one-character rule; a
    character with no rule is copied to the output preceded by a space.
    Runs of ':' are collapsed into one, and the first character of the
    result (normally the leading separator space) is dropped.
    """
    text = text.strip()
    pieces = []
    pos = 0
    total = len(text)
    while pos < total:
        # At the last character this slice is one character long, which can
        # never be a key of the two-character table.
        double = _RULEMAP2.get(text[pos:pos + 2])
        if double is not None:
            pieces.append(double)
            pos += 2
            continue
        single = _RULEMAP1.get(text[pos])
        if single is None:
            single = ' ' + text[pos]
        pieces.append(single)
        pos += 1
    return _COLON_RX.sub(':', ''.join(pieces))[1:]
 # Parallel character tables: the n-th hiragana in _HIRAGANA corresponds to
 # the n-th katakana in _KATAKANA.
 _KATAKANA = ''.join(chr(ch) for ch in range(ord('ァ'), ord('ン') + 1))
 _HIRAGANA = ''.join(chr(ch) for ch in range(ord('ぁ'), ord('ん') + 1))
 _HIRA2KATATRANS = str.maketrans(_HIRAGANA, _KATAKANA)
 def hira2kata(text: str) -> str:
    """Convert hiragana in ``text`` to katakana.

    Characters outside the ぁ-ん range are left unchanged, except that
    'う' or 'ウ' followed by the voiced sound mark '゛' is merged into the
    single katakana 'ヴ'.
    """
    text = text.translate(_HIRA2KATATRANS)
    # 'う' has already been translated to 'ウ' at this point, so the voiced
    # mark must be matched against the katakana form.
    return text.replace('ウ゛', 'ヴ')
 # Punctuation tokens that text2kata copies to its output unchanged when
 # the tagger gives no reading for them.
 _SYMBOL_TOKENS = set(list('・、。？！'))
 # Tokens that text2kata drops from its output when they have no reading.
 _NO_YOMI_TOKENS = set(list('「」『』―（）［］[]　…'))
 # Shared MeCab tagger, constructed once when the module is imported.
 _TAGGER = MeCab.Tagger()
 def text2kata(text: str) -> str:
    """Convert Japanese text to katakana using the MeCab tagger.

    The tagger output is read line by line up to the 'EOS' line. For each
    token the second tab-separated field is used as its reading when it is
    non-empty. Tokens without a reading are handled by surface form: a small
    tsu becomes 'ッ', tokens in _NO_YOMI_TOKENS are dropped (unless they are
    also in _SYMBOL_TOKENS), and anything else is kept as-is. The joined
    result is passed through hira2kata.
    """
    # NOTE(review): assumes the tagger's dictionary emits the reading as the
    # second tab-separated field - confirm for the installed dictionary.
    pieces = []
    for row in _TAGGER.parse(text).split('\n'):
        if row == 'EOS':
            break
        fields = row.split('\t')
        surface, reading = fields[0], fields[1]
        if reading:
            pieces.append(reading)
            continue
        if surface in ('っ', 'ッ'):
            pieces.append('ッ')
        elif surface in _SYMBOL_TOKENS or surface not in _NO_YOMI_TOKENS:
            pieces.append(surface)
    return hira2kata(''.join(pieces))
 def japanese_text_to_phonemes(text: str) -> str:
    """Convert Japanese text to a phoneme string without spaces.

    The text is first turned into katakana with text2kata, then into
    space-separated phonemes with kata2phoneme; finally every space is
    removed from the result.
    """
    katakana = text2kata(text)
    spaced = kata2phoneme(katakana)
    return ''.join(spaced.split(' '))
--- a/recipes/kokoro/tacotron2-DDC/run.sh
+++ b/recipes/kokoro/tacotron2-DDC/run.sh
@ -0,0 +1,23 @@
 #!/bin/bash
 # Use the directory containing this script as the prefix for all output paths.
 RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
 CORPUS=kokoro-speech-v1_1-small
 # All expansions below are quoted so the recipe also works when the
 # checkout path contains spaces or glob characters.
 echo "$RUN_DIR"
 if [ ! -d "$RUN_DIR/$CORPUS" ] ; then
    echo "$RUN_DIR/$CORPUS doesn't exist."
    echo "Follow the instruction of https://github.com/kaiidams/Kokoro-Speech-Dataset to make the corpus."
    exit 1
 fi
 # create train-val splits
 # NOTE(review): the 8000/812 split presumably matches the line count of the
 # small corpus; with fewer than 8812 lines the two sets would overlap - confirm.
 shuf "$RUN_DIR/$CORPUS/metadata.csv" > "$RUN_DIR/$CORPUS/metadata_shuf.csv"
 head -n 8000 "$RUN_DIR/$CORPUS/metadata_shuf.csv" > "$RUN_DIR/$CORPUS/metadata_train.csv"
 tail -n 812 "$RUN_DIR/$CORPUS/metadata_shuf.csv" > "$RUN_DIR/$CORPUS/metadata_val.csv"
 # compute dataset mean and variance for normalization
 python TTS/bin/compute_statistics.py "$RUN_DIR/tacotron2-DDC.json" "$RUN_DIR/scale_stats.npy" --data_path "$RUN_DIR/$CORPUS/wavs/"
 # training ....
 # change the GPU id if needed
 CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tacotron.py --config_path "$RUN_DIR/tacotron2-DDC.json" \
                                                          --coqpit.output_path "$RUN_DIR" \
                                                          --coqpit.datasets.0.path "$RUN_DIR/$CORPUS" \
                                                          --coqpit.audio.stats_path "$RUN_DIR/scale_stats.npy" \
                                                          --coqpit.phoneme_cache_path "$RUN_DIR/phoneme_cache"
--- a/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json
+++ b/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json
@ -0,0 +1,125 @@
 {
    "datasets": [
        {
            "name": "kokoro",
            "path": "DEFINE THIS",
            "meta_file_train": "metadata.csv",
            "meta_file_val": null
        }
    ],
    "audio": {
        "fft_size": 1024,
        "win_length": 1024,
        "hop_length": 256,
        "frame_length_ms": null,
        "frame_shift_ms": null,
        "sample_rate": 22050,
        "preemphasis": 0.0,
        "ref_level_db": 20,
        "do_trim_silence": true,
        "trim_db": 60,
        "power": 1.5,
        "griffin_lim_iters": 60,
        "num_mels": 80,
        "mel_fmin": 50.0,
        "mel_fmax": 7600.0,
        "spec_gain": 1,
        "signal_norm": true,
        "min_level_db": -100,
        "symmetric_norm": true,
        "max_norm": 4.0,
        "clip_norm": true,
        "stats_path": "scale_stats.npy"
    },
    "gst":{
        "gst_style_input": null,
        "gst_embedding_dim": 512,
        "gst_num_heads": 4,
        "gst_style_tokens": 10,
        "gst_use_speaker_embedding": false
 	},
    "model": "Tacotron2",
    "run_name": "kokoro-ddc",
    "run_description": "tacotron2 with DDC and differential spectral loss.",
    "batch_size": 32,
    "eval_batch_size": 16,
    "mixed_precision": true,
    "distributed": {
        "backend": "nccl",
        "url": "tcp:\/\/localhost:54321"
    },
    "reinit_layers": [],
    "loss_masking": true,
    "decoder_loss_alpha": 0.5,
    "postnet_loss_alpha": 0.25,
    "postnet_diff_spec_alpha": 0.25,
    "decoder_diff_spec_alpha": 0.25,
    "decoder_ssim_alpha": 0.5,
    "postnet_ssim_alpha": 0.25,
    "ga_alpha": 5.0,
    "stopnet_pos_weight": 15.0,
    "run_eval": true,
    "test_delay_epochs": 10,
    "test_sentences_file": null,
    "noam_schedule": false,
    "grad_clip": 1.0,
    "epochs": 1000,
    "lr": 0.0001,
    "wd": 0.000001,
    "warmup_steps": 4000,
    "seq_len_norm": false,
    "memory_size": -1,
    "prenet_type": "original",
    "prenet_dropout": true,
    "attention_type": "original",
    "windowing": false,
    "use_forward_attn": false,
    "forward_attn_mask": false,
    "transition_agent": false,
    "location_attn": true,
    "bidirectional_decoder": false,
    "double_decoder_consistency": true,
    "ddc_r": 7,
    "attention_heads": 4,
    "attention_norm": "sigmoid",
    "r": 7,
    "gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]],
    "stopnet": true,
    "separate_stopnet": true,
    "print_step": 25,
    "tb_plot_step": 100,
    "print_eval": false,
    "save_step": 10000,
    "checkpoint": true,
    "keep_all_best": false,
    "keep_after": 10000,
    "tb_model_param_stats": false,
    "text_cleaner": "basic_cleaners",
    "enable_eos_bos_chars": false,
    "num_loader_workers": 4,
    "num_val_loader_workers": 4,
    "batch_group_size": 4,
    "min_seq_len": 6,
    "max_seq_len": 153,
    "compute_input_seq_cache": false,
    "use_noise_augment": true,
    "output_path": "DEFINE THIS",
    "phoneme_cache_path": "DEFINE THIS",
    "use_phonemes": true,
    "phoneme_language": "ja-jp",
    "characters": {
        "pad": "_",
        "eos": "~",
        "bos": "^",
        "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
        "punctuations": "!'(),-.:;? ",
        "phonemes": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
    },
    "use_speaker_embedding": false,
    "use_gst": false,       			
    "use_external_speaker_embedding_file": false,
    "external_speaker_embedding_file": "../../speakers-vctk-en.json"
 }
--- a/requirements.txt
+++ b/requirements.txt
@ -19,3 +19,5 @@ numba==0.52
 umap-learn==0.4.6
 anyascii
 coqpit
 mecab-python3
 unidic-lite
--- a/tests/tts_tests/test_japanese_phonemizer.py
+++ b/tests/tts_tests/test_japanese_phonemizer.py
@ -0,0 +1,22 @@
 import unittest
 from TTS.tts.utils.text.japanese.phonemizer import japanese_text_to_phonemes
 # One test case per line, written as 'input text/expected phoneme string'.
 # The test splits each line at '/', so neither part may contain a '/'.
 _TEST_CASES = '''
 どちらに行きますか？/dochiraniikimasuka?
 今日は温泉に、行きます。/kyo:waoNseNni,ikimasu.
 「A」から「Z」までです。/AkaraZmadedesu.
 そうですね！/so:desune!
 クジラは哺乳類です。/kujirawahonyu:ruidesu.
 ヴィディオを見ます。/bidioomimasu.
 ky o: w a o N s e N n i , i k i m a s u ./kyo:waoNseNni,ikimasu.
 '''
 class TestText(unittest.TestCase):
    """Tests for the Japanese phonemizer."""

    def test_japanese_text_to_phonemes(self):
        """Check every 'text/phonemes' pair listed in _TEST_CASES."""
        for line in _TEST_CASES.strip().split('\n'):
            text, phone = line.split('/')
            # subTest reports each failing case separately instead of
            # stopping at the first mismatch.
            with self.subTest(text=text):
                self.assertEqual(japanese_text_to_phonemes(text), phone)
 # Run the tests when this file is executed directly.
 if __name__ == '__main__':
    unittest.main()
`@ -1 +1 @@`
	`__version__ = "0.0.14.1"`	`__version__ = "0.0.14"`