From 5482a0f62d3da9821b4689f4e9268580904a081f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 19 May 2021 14:00:44 +0200 Subject: [PATCH 01/36] type def for gradual_training --- TTS/tts/configs/tacotron_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/tts/configs/tacotron_config.py b/TTS/tts/configs/tacotron_config.py index d3a54269..a567cd88 100644 --- a/TTS/tts/configs/tacotron_config.py +++ b/TTS/tts/configs/tacotron_config.py @@ -122,7 +122,7 @@ class TacotronConfig(BaseTTSConfig): gst_style_input: str = None # model specific params r: int = 2 - gradual_training: List[List] = None + gradual_training: List[List[int]] = None memory_size: int = -1 prenet_type: str = "original" prenet_dropout: bool = True From b8f50d3d86f77aeba453ff6deabaf447215a5331 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 21 May 2021 00:30:39 +0200 Subject: [PATCH 02/36] replace unidecode with anyascii --- TTS/tts/utils/text/cleaners.py | 16 ++-------------- requirements.txt | 2 +- tests/tts_tests/test_glow_tts_train.py | 2 +- tests/tts_tests/test_speedy_speech_train.py | 2 +- tests/vocoder_tests/test_melgan_train.py | 1 + 5 files changed, 6 insertions(+), 17 deletions(-) diff --git a/TTS/tts/utils/text/cleaners.py b/TTS/tts/utils/text/cleaners.py index 2eddcdb8..3d2caa97 100644 --- a/TTS/tts/utils/text/cleaners.py +++ b/TTS/tts/utils/text/cleaners.py @@ -1,18 +1,6 @@ -""" -Cleaners are transformations that run over the input text at both training and eval time. - -Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" -hyperparameter. Some cleaners are English-specific. You'll typically want to use: - 1. "english_cleaners" for English text - 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using - the Unidecode library (https://pypi.python.org/pypi/Unidecode) - 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update - the symbols in symbols.py to match your data). 
-""" - import re -from unidecode import unidecode +from anyascii import anyascii from TTS.tts.utils.text.chinese_mandarin.numbers import replace_numbers_to_characters_in_text @@ -47,7 +35,7 @@ def collapse_whitespace(text): def convert_to_ascii(text): - return unidecode(text) + return anyascii(text) def remove_aux_symbols(text): diff --git a/requirements.txt b/requirements.txt index c6ce7672..b376eb1b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,5 +17,5 @@ torch>=1.7 tqdm numba==0.52 umap-learn==0.4.6 -unidecode==0.4.20 +anyascii coqpit diff --git a/tests/tts_tests/test_glow_tts_train.py b/tests/tts_tests/test_glow_tts_train.py index 00c7e852..2e675d13 100644 --- a/tests/tts_tests/test_glow_tts_train.py +++ b/tests/tts_tests/test_glow_tts_train.py @@ -17,7 +17,7 @@ config = GlowTTSConfig( text_cleaner="english_cleaners", use_phonemes=True, phoneme_language="zh-CN", - phoneme_cache_path='tests/data/ljspeech/phoneme_cache/', + phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", run_eval=True, test_delay_epochs=-1, epochs=1, diff --git a/tests/tts_tests/test_speedy_speech_train.py b/tests/tts_tests/test_speedy_speech_train.py index cc2845c2..3f508117 100644 --- a/tests/tts_tests/test_speedy_speech_train.py +++ b/tests/tts_tests/test_speedy_speech_train.py @@ -17,7 +17,7 @@ config = SpeedySpeechConfig( text_cleaner="english_cleaners", use_phonemes=True, phoneme_language="zh-CN", - phoneme_cache_path='tests/data/ljspeech/phoneme_cache/', + phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", run_eval=True, test_delay_epochs=-1, epochs=1, diff --git a/tests/vocoder_tests/test_melgan_train.py b/tests/vocoder_tests/test_melgan_train.py index de48ca24..3ff65b5a 100644 --- a/tests/vocoder_tests/test_melgan_train.py +++ b/tests/vocoder_tests/test_melgan_train.py @@ -19,6 +19,7 @@ config = MelganConfig( seq_len=2048, eval_split_size=1, print_step=1, + discriminator_model_params={"base_channels": 16, "max_channels": 256, "downsample_factors": [4, 4, 4]}, print_eval=True, data_path="tests/data/ljspeech", output_path=output_path, From 0536aa6d0f41b125dd96811a8a5b04fac70d6652 Mon Sep 17 00:00:00 2001 From: Katsuya Iida Date: Sat, 22 May 2021 17:12:19 +0900 Subject: [PATCH 03/36] Japanese Tacotron 2 model --- TTS/tts/configs/kokoro_tacotron2.json | 173 ++++++++++++ TTS/tts/datasets/preprocess.py | 14 + TTS/tts/utils/japanese/__init__.py | 1 + TTS/tts/utils/japanese/text.py | 380 ++++++++++++++++++++++++++ TTS/tts/utils/japanese/text_test.py | 22 ++ TTS/tts/utils/text/__init__.py | 5 + requirements.txt | 2 + 7 files changed, 597 insertions(+) create mode 100644 TTS/tts/configs/kokoro_tacotron2.json create mode 100644 TTS/tts/utils/japanese/__init__.py create mode 100644 TTS/tts/utils/japanese/text.py create mode 100644 TTS/tts/utils/japanese/text_test.py diff --git a/TTS/tts/configs/kokoro_tacotron2.json b/TTS/tts/configs/kokoro_tacotron2.json new file mode 100644 index 00000000..f5d41194 --- /dev/null +++ b/TTS/tts/configs/kokoro_tacotron2.json @@ -0,0 +1,173 @@ +{ + "model": "Tacotron2", + "run_name": "kokoro-ddc", + "run_description": "tacotron2 with DDC and differential spectral loss.", + + // AUDIO PARAMETERS + "audio":{ + // stft parameters + "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. + "win_length": 1024, // stft window length in ms. + "hop_length": 256, // stft window hop-lengh in ms. + "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. + "frame_shift_ms": null, // stft window hop-lengh in ms. 
If null, 'hop_length' is used. + + // Audio processing parameters + "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. + "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. + "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. + + // Silence trimming + "do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (true), TWEB (false), Nancy (true) + "trim_db": 60, // threshold for timming silence. Set this according to your dataset. + + // Griffin-Lim + "power": 1.5, // value to sharpen wav signals after GL algorithm. + "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. + + // MelSpectrogram parameters + "num_mels": 80, // size of the mel spec frame. + "mel_fmin": 50.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 7600.0, // maximum freq level for mel-spec. Tune for dataset!! + "spec_gain": 1, + + // Normalization parameters + "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. + "min_level_db": -100, // lower bound for normalization + "symmetric_norm": true, // move normalization to range [-1, 1] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "stats_path": "./scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored + }, + + // VOCABULARY PARAMETERS + // if custom character set is not defined, + // default set in symbols.py is used + "characters":{ + "pad": "_", + "eos": "~", + "bos": "^", + "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ", + "punctuations": "!'(),-.:;? ", + "phonemes": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" + }, + + // DISTRIBUTED TRAINING + "distributed":{ + "backend": "nccl", + "url": "tcp:\/\/localhost:54321" + }, + + "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. + + // TRAINING + "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + "eval_batch_size":16, + "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. + "gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed. + "mixed_precision": true, // level of optimization with NVIDIA's apex feature for automatic mixed FP16/FP32 precision (AMP), NOTE: currently only O1 is supported, and use "O1" to activate. + + // LOSS SETTINGS + "loss_masking": true, // enable / disable loss masking against the sequence padding. + "decoder_loss_alpha": 0.5, // original decoder loss weight. If > 0, it is enabled + "postnet_loss_alpha": 0.25, // original postnet loss weight. If > 0, it is enabled + "postnet_diff_spec_alpha": 0.25, // differential spectral loss weight. 
If > 0, it is enabled + "decoder_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled + "decoder_ssim_alpha": 0.5, // decoder ssim loss weight. If > 0, it is enabled + "postnet_ssim_alpha": 0.25, // postnet ssim loss weight. If > 0, it is enabled + "ga_alpha": 5.0, // weight for guided attention loss. If > 0, guided attention is enabled. + "stopnet_pos_weight": 15.0, // pos class weight for stopnet loss since there are way more negative samples than positive samples. + + + // VALIDATION + "run_eval": true, + "test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time. + "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences. + + // OPTIMIZER + "noam_schedule": false, // use noam warmup and lr schedule. + "grad_clip": 1.0, // upper limit for gradients for clipping. + "epochs": 1000, // total number of epochs to train. + "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. + "wd": 0.000001, // Weight decay weight. + "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" + "seq_len_norm": false, // Normalize eash sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths. + + // TACOTRON PRENET + "memory_size": -1, // ONLY TACOTRON - size of the memory queue used fro storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame. + "prenet_type": "original", // "original" or "bn". + "prenet_dropout": true, // enable/disable dropout at prenet. + + // TACOTRON ATTENTION + "attention_type": "original", // 'original' , 'graves', 'dynamic_convolution' + "attention_heads": 4, // number of attention heads (only for 'graves') + "attention_norm": "sigmoid", // softmax or sigmoid. + "windowing": false, // Enables attention windowing. Used only in eval mode. + "use_forward_attn": false, // if it uses forward attention. In general, it aligns faster. + "forward_attn_mask": false, // Additional masking forcing monotonicity only in eval mode. + "transition_agent": false, // enable/disable transition agent of forward attention. + "location_attn": true, // enable_disable location sensitive attention. It is enabled for TACOTRON by default. + "bidirectional_decoder": false, // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset. + "double_decoder_consistency": true, // use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/ + "ddc_r": 7, // reduction rate for coarse decoder. + + // STOPNET + "stopnet": true, // Train stopnet predicting the end of synthesis. + "separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER. + + // TENSORBOARD and LOGGING + "print_step": 25, // Number of steps to log training on console. + "tb_plot_step": 100, // Number of steps to plot TB training figures. + "print_eval": false, // If True, it prints intermediate loss values in evalulation. + "save_step": 10000, // Number of training steps expected to save traninpg stats and checkpoints. 
+ "checkpoint": true, // If true, it saves checkpoints per "save_step" + "keep_all_best": false, // If true, keeps all best_models after keep_after steps + "keep_after": 10000, // Global step after which to keep best models if keep_all_best is true + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + + // DATA LOADING + "text_cleaner": "basic_cleaners", + "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. + "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_val_loader_workers": 4, // number of evaluation data loader processes. + "batch_group_size": 4, //Number of batches to shuffle after bucketing. + "min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training + "max_seq_len": 153, // DATASET-RELATED: maximum text length + "compute_input_seq_cache": false, // if true, text sequences are computed before starting training. If phonemes are enabled, they are also computed at this stage. + "use_noise_augment": true, + + // PATHS + "output_path": "./Models/Kokoro/", + + // PHONEMES + "phoneme_cache_path": "./phoneme_cache/", // phoneme computation is slow, therefore, it caches results in the given folder. + "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation. + "phoneme_language": "ja-jp", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages + + // MULTI-SPEAKER and GST + "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. + "use_gst": false, // use global style tokens + "use_external_speaker_embedding_file": false, // if true, forces the model to use external embedding per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 + "external_speaker_embedding_file": "../../speakers-vctk-en.json", // if not null and use_external_speaker_embedding_file is true, it is used to load a specific embedding file and thus uses these embeddings instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 + "gst": { // gst parameter if gst is enabled + "gst_style_input": null, // Condition the style input either on a + // -> wave file [path to wave] or + // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15} + // with the dictionary being len(dict) <= len(gst_style_tokens). + "gst_embedding_dim": 512, + "gst_num_heads": 4, + "gst_style_tokens": 10, + "gst_use_speaker_embedding": false + }, + + // DATASETS + "datasets": // List of datasets. They all merged and they get different speaker_ids. 
+ [ + { + "name": "kokoro", + "path": "./kokoro-speech-v1_1-small/", + "meta_file_train": "metadata.csv", // for vtck if list, ignore speakers id in list for train, its useful for test cloning with new speakers + "meta_file_val": null + } + ] +} diff --git a/TTS/tts/datasets/preprocess.py b/TTS/tts/datasets/preprocess.py index 72ab160e..271b1734 100644 --- a/TTS/tts/datasets/preprocess.py +++ b/TTS/tts/datasets/preprocess.py @@ -424,3 +424,17 @@ def baker(root_path: str, meta_file: str) -> List[List[str]]: wav_path = os.path.join(root_path, "clips_22", wav_name) items.append([text, wav_path, speaker_name]) return items + + +def kokoro(root_path, meta_file): + """Japanese single-speaker dataset from https://github.com/kaiidams/Kokoro-Speech-Dataset""" + txt_file = os.path.join(root_path, meta_file) + items = [] + speaker_name = "kokoro" + with open(txt_file, "r") as ttf: + for line in ttf: + cols = line.split("|") + wav_file = os.path.join(root_path, "wavs", cols[0] + '.wav') + text = cols[2].replace(" ", "") + items.append([text, wav_file, speaker_name]) + return items diff --git a/TTS/tts/utils/japanese/__init__.py b/TTS/tts/utils/japanese/__init__.py new file mode 100644 index 00000000..0ce7a99d --- /dev/null +++ b/TTS/tts/utils/japanese/__init__.py @@ -0,0 +1 @@ +from .text import japanese_text2phone \ No newline at end of file diff --git a/TTS/tts/utils/japanese/text.py b/TTS/tts/utils/japanese/text.py new file mode 100644 index 00000000..4c8936ac --- /dev/null +++ b/TTS/tts/utils/japanese/text.py @@ -0,0 +1,380 @@ +# Convert Japanese text to phonemes which is +# compatible with Julius https://github.com/julius-speech/segmentation-kit + +import re +import MeCab +from typing import List, Tuple + +_CONVRULES = [ + # Conversion of 2 letters + 'アァ/ a a', + 'イィ/ i i', + 'イェ/ i e', + 'イャ/ y a', + 'ウゥ/ u:', + 'エェ/ e e', + 'オォ/ o:', + 'カァ/ k a:', + 'キィ/ k i:', + 'クゥ/ k u:', + 'クャ/ ky a', + 'クュ/ ky u', + 'クョ/ ky o', + 'ケェ/ k e:', + 'コォ/ k o:', + 'ガァ/ g a:', + 'ギィ/ g i:', + 'グゥ/ g u:', + 'グャ/ gy a', + 'グュ/ gy u', + 'グョ/ gy o', + 'ゲェ/ g e:', + 'ゴォ/ g o:', + 'サァ/ s a:', + 'シィ/ sh i:', + 'スゥ/ s u:', + 'スャ/ sh a', + 'スュ/ sh u', + 'スョ/ sh o', + 'セェ/ s e:', + 'ソォ/ s o:', + 'ザァ/ z a:', + 'ジィ/ j i:', + 'ズゥ/ z u:', + 'ズャ/ zy a', + 'ズュ/ zy u', + 'ズョ/ zy o', + 'ゼェ/ z e:', + 'ゾォ/ z o:', + 'タァ/ t a:', + 'チィ/ ch i:', + 'ツァ/ ts a', + 'ツィ/ ts i', + 'ツゥ/ ts u:', + 'ツャ/ ch a', + 'ツュ/ ch u', + 'ツョ/ ch o', + 'ツェ/ ts e', + 'ツォ/ ts o', + 'テェ/ t e:', + 'トォ/ t o:', + 'ダァ/ d a:', + 'ヂィ/ j i:', + 'ヅゥ/ d u:', + 'ヅャ/ zy a', + 'ヅュ/ zy u', + 'ヅョ/ zy o', + 'デェ/ d e:', + 'ドォ/ d o:', + 'ナァ/ n a:', + 'ニィ/ n i:', + 'ヌゥ/ n u:', + 'ヌャ/ ny a', + 'ヌュ/ ny u', + 'ヌョ/ ny o', + 'ネェ/ n e:', + 'ノォ/ n o:', + 'ハァ/ h a:', + 'ヒィ/ h i:', + 'フゥ/ f u:', + 'フャ/ hy a', + 'フュ/ hy u', + 'フョ/ hy o', + 'ヘェ/ h e:', + 'ホォ/ h o:', + 'バァ/ b a:', + 'ビィ/ b i:', + 'ブゥ/ b u:', + 'フャ/ hy a', + 'ブュ/ by u', + 'フョ/ hy o', + 'ベェ/ b e:', + 'ボォ/ b o:', + 'パァ/ p a:', + 'ピィ/ p i:', + 'プゥ/ p u:', + 'プャ/ py a', + 'プュ/ py u', + 'プョ/ py o', + 'ペェ/ p e:', + 'ポォ/ p o:', + 'マァ/ m a:', + 'ミィ/ m i:', + 'ムゥ/ m u:', + 'ムャ/ my a', + 'ムュ/ my u', + 'ムョ/ my o', + 'メェ/ m e:', + 'モォ/ m o:', + 'ヤァ/ y a:', + 'ユゥ/ y u:', + 'ユャ/ y a:', + 'ユュ/ y u:', + 'ユョ/ y o:', + 'ヨォ/ y o:', + 'ラァ/ r a:', + 'リィ/ r i:', + 'ルゥ/ r u:', + 'ルャ/ ry a', + 'ルュ/ ry u', + 'ルョ/ ry o', + 'レェ/ r e:', + 'ロォ/ r o:', + 'ワァ/ w a:', + 'ヲォ/ o:', + 'ディ/ d i', + 'デェ/ d e:', + 'デャ/ dy a', + 'デュ/ dy u', + 'デョ/ dy o', + 'ティ/ t i', + 'テェ/ t e:', + 'テャ/ ty a', + 'テュ/ ty u', + 'テョ/ ty o', + 'スィ/ s i', + 'ズァ/ z u a', + 'ズィ/ z i', + 
'ズゥ/ z u', + 'ズャ/ zy a', + 'ズュ/ zy u', + 'ズョ/ zy o', + 'ズェ/ z e', + 'ズォ/ z o', + 'キャ/ ky a', + 'キュ/ ky u', + 'キョ/ ky o', + 'シャ/ sh a', + 'シュ/ sh u', + 'シェ/ sh e', + 'ショ/ sh o', + 'チャ/ ch a', + 'チュ/ ch u', + 'チェ/ ch e', + 'チョ/ ch o', + 'トゥ/ t u', + 'トャ/ ty a', + 'トュ/ ty u', + 'トョ/ ty o', + 'ドァ/ d o a', + 'ドゥ/ d u', + 'ドャ/ dy a', + 'ドュ/ dy u', + 'ドョ/ dy o', + 'ドォ/ d o:', + 'ニャ/ ny a', + 'ニュ/ ny u', + 'ニョ/ ny o', + 'ヒャ/ hy a', + 'ヒュ/ hy u', + 'ヒョ/ hy o', + 'ミャ/ my a', + 'ミュ/ my u', + 'ミョ/ my o', + 'リャ/ ry a', + 'リュ/ ry u', + 'リョ/ ry o', + 'ギャ/ gy a', + 'ギュ/ gy u', + 'ギョ/ gy o', + 'ヂェ/ j e', + 'ヂャ/ j a', + 'ヂュ/ j u', + 'ヂョ/ j o', + 'ジェ/ j e', + 'ジャ/ j a', + 'ジュ/ j u', + 'ジョ/ j o', + 'ビャ/ by a', + 'ビュ/ by u', + 'ビョ/ by o', + 'ピャ/ py a', + 'ピュ/ py u', + 'ピョ/ py o', + 'ウァ/ u a', + 'ウィ/ w i', + 'ウェ/ w e', + 'ウォ/ w o', + 'ファ/ f a', + 'フィ/ f i', + 'フゥ/ f u', + 'フャ/ hy a', + 'フュ/ hy u', + 'フョ/ hy o', + 'フェ/ f e', + 'フォ/ f o', + 'ヴァ/ b a', + 'ヴィ/ b i', + 'ヴェ/ b e', + 'ヴォ/ b o', + 'ヴュ/ by u', + + # Conversion of 1 letter + 'ア/ a', + 'イ/ i', + 'ウ/ u', + 'エ/ e', + 'オ/ o', + 'カ/ k a', + 'キ/ k i', + 'ク/ k u', + 'ケ/ k e', + 'コ/ k o', + 'サ/ s a', + 'シ/ sh i', + 'ス/ s u', + 'セ/ s e', + 'ソ/ s o', + 'タ/ t a', + 'チ/ ch i', + 'ツ/ ts u', + 'テ/ t e', + 'ト/ t o', + 'ナ/ n a', + 'ニ/ n i', + 'ヌ/ n u', + 'ネ/ n e', + 'ノ/ n o', + 'ハ/ h a', + 'ヒ/ h i', + 'フ/ f u', + 'ヘ/ h e', + 'ホ/ h o', + 'マ/ m a', + 'ミ/ m i', + 'ム/ m u', + 'メ/ m e', + 'モ/ m o', + 'ラ/ r a', + 'リ/ r i', + 'ル/ r u', + 'レ/ r e', + 'ロ/ r o', + 'ガ/ g a', + 'ギ/ g i', + 'グ/ g u', + 'ゲ/ g e', + 'ゴ/ g o', + 'ザ/ z a', + 'ジ/ j i', + 'ズ/ z u', + 'ゼ/ z e', + 'ゾ/ z o', + 'ダ/ d a', + 'ヂ/ j i', + 'ヅ/ z u', + 'デ/ d e', + 'ド/ d o', + 'バ/ b a', + 'ビ/ b i', + 'ブ/ b u', + 'ベ/ b e', + 'ボ/ b o', + 'パ/ p a', + 'ピ/ p i', + 'プ/ p u', + 'ペ/ p e', + 'ポ/ p o', + 'ヤ/ y a', + 'ユ/ y u', + 'ヨ/ y o', + 'ワ/ w a', + 'ヰ/ i', + 'ヱ/ e', + 'ヲ/ o', + 'ン/ N', + 'ッ/ q', + 'ヴ/ b u', + 'ー/:', + + # Try converting broken text + 'ァ/ a', + 'ィ/ i', + 'ゥ/ u', + 'ェ/ e', + 'ォ/ o', + 'ヮ/ w a', + 'ォ/ o', + + # Symbols + '、/ ,', + '。/ .', + '!/ !', + '?/ ?', + '・/ ,' +] + +_COLON_RX = re.compile(':+') +_REJECT_RX = re.compile('[^ a-zA-Z:,.?]') + +def _makerulemap(): + l = [tuple(x.split('/')) for x in _CONVRULES] + return tuple( + {k: v for k, v in l if len(k) == i} + for i in (1, 2) + ) + +_RULEMAP1, _RULEMAP2 = _makerulemap() + +def kata2phoneme(text: str) -> str: + """Convert katakana text to phonemes. 
+ """ + text = text.strip() + res = '' + while text: + if len(text) >= 2: + x = _RULEMAP2.get(text[:2]) + if x is not None: + text = text[2:] + res += x + continue + x = _RULEMAP1.get(text[0]) + if x is not None: + text = text[1:] + res += x + continue + res += ' ' + text[0] + text = text[1:] + res = _COLON_RX.sub(':', res) + return res[1:] + +_KATAKANA = ''.join(chr(ch) for ch in range(ord('ァ'), ord('ン') + 1)) +_HIRAGANA = ''.join(chr(ch) for ch in range(ord('ぁ'), ord('ん') + 1)) +_HIRA2KATATRANS = str.maketrans(_HIRAGANA, _KATAKANA) + +def hira2kata(text: str) -> str: + text = text.translate(_HIRA2KATATRANS) + return text.replace('う゛', 'ヴ') + +_SYMBOL_TOKENS = set(list('・、。?!')) +_NO_YOMI_TOKENS = set(list('「」『』―()[][] …')) +_TAGGER = MeCab.Tagger() + +def text2kata(text: str) -> str: + parsed = _TAGGER.parse(text) + res = [] + for line in parsed.split('\n'): + if line == 'EOS': + break + parts = line.split('\t') + + word, yomi = parts[0], parts[1] + if yomi: + res.append(yomi) + else: + if word in _SYMBOL_TOKENS: + res.append(word) + elif word == 'っ' or word == 'ッ': + res.append('ッ') + elif word in _NO_YOMI_TOKENS: + pass + else: + res.append(word) + return hira2kata(''.join(res)) + +def japanese_text2phone(text: str) -> str: + """Convert Japanese text to phonemes. + """ + res = text2kata(text) + res = kata2phoneme(res) + return res.replace(' ', '') \ No newline at end of file diff --git a/TTS/tts/utils/japanese/text_test.py b/TTS/tts/utils/japanese/text_test.py new file mode 100644 index 00000000..7a04925a --- /dev/null +++ b/TTS/tts/utils/japanese/text_test.py @@ -0,0 +1,22 @@ +import unittest +from . import japanese_text2phone + +_TEST_CASES = ''' +どちらに行きますか?/dochiraniikimasuka? +今日は温泉に、行きます。/kyo:waoNseNni,ikimasu. +「A」から「Z」までです。/AkaraZmadedesu. +そうですね!/so:desune! +クジラは哺乳類です。/kujirawahonyu:ruidesu. +ヴィディオを見ます。/bidioomimasu. +ky o: w a o N s e N n i , i k i m a s u ./kyo:waoNseNni,ikimasu. +''' + +class TestText(unittest.TestCase): + + def test_text2phone(self): + for line in _TEST_CASES.strip().split('\n'): + text, phone = line.split('/') + self.assertEqual(japanese_text2phone(text), phone) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py index 9367e6e2..9b63e7f1 100644 --- a/TTS/tts/utils/text/__init__.py +++ b/TTS/tts/utils/text/__init__.py @@ -39,6 +39,11 @@ def text2phone(text, language): if language == "zh-CN": ph = chinese_text_to_phonemes(text) return ph + elif language == "ja-jp": + from TTS.tts.utils.japanese import japanese_text2phone + ph = japanese_text2phone(text) + return ph + raise ValueError(f" [!] 
Language {language} is not supported for phonemization.") diff --git a/requirements.txt b/requirements.txt index c6ce7672..7f45f9e0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,3 +19,5 @@ numba==0.52 umap-learn==0.4.6 unidecode==0.4.20 coqpit +mecab-python3 +unidic-lite From f921a05bdb6ce6fc950c290b8a3aec613a7f70fe Mon Sep 17 00:00:00 2001 From: Katsuya Iida Date: Wed, 26 May 2021 19:02:16 +0900 Subject: [PATCH 04/36] Fixed lint errors --- TTS/tts/utils/japanese/__init__.py | 2 +- TTS/tts/utils/japanese/text.py | 5 ++--- TTS/tts/utils/japanese/text_test.py | 2 +- TTS/tts/utils/text/__init__.py | 3 ++- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/TTS/tts/utils/japanese/__init__.py b/TTS/tts/utils/japanese/__init__.py index 0ce7a99d..30d963e8 100644 --- a/TTS/tts/utils/japanese/__init__.py +++ b/TTS/tts/utils/japanese/__init__.py @@ -1 +1 @@ -from .text import japanese_text2phone \ No newline at end of file +from .text import japanese_text2phone diff --git a/TTS/tts/utils/japanese/text.py b/TTS/tts/utils/japanese/text.py index 4c8936ac..3a705352 100644 --- a/TTS/tts/utils/japanese/text.py +++ b/TTS/tts/utils/japanese/text.py @@ -3,7 +3,6 @@ import re import MeCab -from typing import List, Tuple _CONVRULES = [ # Conversion of 2 letters @@ -364,7 +363,7 @@ def text2kata(text: str) -> str: else: if word in _SYMBOL_TOKENS: res.append(word) - elif word == 'っ' or word == 'ッ': + elif word in ('っ', 'ッ'): res.append('ッ') elif word in _NO_YOMI_TOKENS: pass @@ -377,4 +376,4 @@ def japanese_text2phone(text: str) -> str: """ res = text2kata(text) res = kata2phoneme(res) - return res.replace(' ', '') \ No newline at end of file + return res.replace(' ', '') diff --git a/TTS/tts/utils/japanese/text_test.py b/TTS/tts/utils/japanese/text_test.py index 7a04925a..d3ade826 100644 --- a/TTS/tts/utils/japanese/text_test.py +++ b/TTS/tts/utils/japanese/text_test.py @@ -19,4 +19,4 @@ class TestText(unittest.TestCase): self.assertEqual(japanese_text2phone(text), phone) if __name__ == '__main__': - unittest.main() \ No newline at end of file + unittest.main() diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py index 9b63e7f1..d7423102 100644 --- a/TTS/tts/utils/text/__init__.py +++ b/TTS/tts/utils/text/__init__.py @@ -39,7 +39,8 @@ def text2phone(text, language): if language == "zh-CN": ph = chinese_text_to_phonemes(text) return ph - elif language == "ja-jp": + + if language == "ja-jp": from TTS.tts.utils.japanese import japanese_text2phone ph = japanese_text2phone(text) return ph From c6f22aaa67f98aebbf8900c9244b4814e80bac86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 27 May 2021 11:46:33 +0200 Subject: [PATCH 05/36] fix #509 --- TTS/tts/configs/shared_configs.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/TTS/tts/configs/shared_configs.py b/TTS/tts/configs/shared_configs.py index 6c710ca2..4690e76f 100644 --- a/TTS/tts/configs/shared_configs.py +++ b/TTS/tts/configs/shared_configs.py @@ -80,12 +80,12 @@ class CharactersConfig(Coqpit): ): """Check config fields""" c = asdict(self) - check_argument("pad", c, "characters", restricted=True) - check_argument("eos", c, "characters", restricted=True) - check_argument("bos", c, "characters", restricted=True) - check_argument("characters", c, "characters", restricted=True) + check_argument("pad", c, prerequest="characters", restricted=True) + check_argument("eos", c, prerequest="characters", restricted=True) + check_argument("bos", c, 
prerequest="characters", restricted=True) + check_argument("characters", c, prerequest="characters", restricted=True) check_argument("phonemes", c, restricted=True) - check_argument("punctuations", c, "characters", restricted=True) + check_argument("punctuations", c, prerequest="characters", restricted=True) @dataclass From e08c58db3b23a832b68d5aa605ff5d0a308e61cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 27 May 2021 13:11:01 +0200 Subject: [PATCH 06/36] bump up version to v0.14.1 --- TTS/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/_version.py b/TTS/_version.py index 311f216e..f4956698 100644 --- a/TTS/_version.py +++ b/TTS/_version.py @@ -1 +1 @@ -__version__ = "0.0.14" +__version__ = "0.0.14.1" From 925c08cf95386c936e7e7f979f6b536b2440ec5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 21 May 2021 00:30:39 +0200 Subject: [PATCH 07/36] replace unidecode with anyascii --- TTS/tts/utils/text/cleaners.py | 16 ++-------------- requirements.txt | 2 +- tests/tts_tests/test_glow_tts_train.py | 2 +- tests/tts_tests/test_speedy_speech_train.py | 2 +- tests/vocoder_tests/test_melgan_train.py | 1 + 5 files changed, 6 insertions(+), 17 deletions(-) diff --git a/TTS/tts/utils/text/cleaners.py b/TTS/tts/utils/text/cleaners.py index 2eddcdb8..3d2caa97 100644 --- a/TTS/tts/utils/text/cleaners.py +++ b/TTS/tts/utils/text/cleaners.py @@ -1,18 +1,6 @@ -""" -Cleaners are transformations that run over the input text at both training and eval time. - -Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" -hyperparameter. Some cleaners are English-specific. You'll typically want to use: - 1. "english_cleaners" for English text - 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using - the Unidecode library (https://pypi.python.org/pypi/Unidecode) - 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update - the symbols in symbols.py to match your data). 
-""" - import re -from unidecode import unidecode +from anyascii import anyascii from TTS.tts.utils.text.chinese_mandarin.numbers import replace_numbers_to_characters_in_text @@ -47,7 +35,7 @@ def collapse_whitespace(text): def convert_to_ascii(text): - return unidecode(text) + return anyascii(text) def remove_aux_symbols(text): diff --git a/requirements.txt b/requirements.txt index c6ce7672..b376eb1b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,5 +17,5 @@ torch>=1.7 tqdm numba==0.52 umap-learn==0.4.6 -unidecode==0.4.20 +anyascii coqpit diff --git a/tests/tts_tests/test_glow_tts_train.py b/tests/tts_tests/test_glow_tts_train.py index 00c7e852..2e675d13 100644 --- a/tests/tts_tests/test_glow_tts_train.py +++ b/tests/tts_tests/test_glow_tts_train.py @@ -17,7 +17,7 @@ config = GlowTTSConfig( text_cleaner="english_cleaners", use_phonemes=True, phoneme_language="zh-CN", - phoneme_cache_path='tests/data/ljspeech/phoneme_cache/', + phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", run_eval=True, test_delay_epochs=-1, epochs=1, diff --git a/tests/tts_tests/test_speedy_speech_train.py b/tests/tts_tests/test_speedy_speech_train.py index cc2845c2..3f508117 100644 --- a/tests/tts_tests/test_speedy_speech_train.py +++ b/tests/tts_tests/test_speedy_speech_train.py @@ -17,7 +17,7 @@ config = SpeedySpeechConfig( text_cleaner="english_cleaners", use_phonemes=True, phoneme_language="zh-CN", - phoneme_cache_path='tests/data/ljspeech/phoneme_cache/', + phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", run_eval=True, test_delay_epochs=-1, epochs=1, diff --git a/tests/vocoder_tests/test_melgan_train.py b/tests/vocoder_tests/test_melgan_train.py index de48ca24..3ff65b5a 100644 --- a/tests/vocoder_tests/test_melgan_train.py +++ b/tests/vocoder_tests/test_melgan_train.py @@ -19,6 +19,7 @@ config = MelganConfig( seq_len=2048, eval_split_size=1, print_step=1, + discriminator_model_params={"base_channels": 16, "max_channels": 256, "downsample_factors": [4, 4, 4]}, print_eval=True, data_path="tests/data/ljspeech", output_path=output_path, From c4987e9d4e503628df5661c0945b2047ea046b1f Mon Sep 17 00:00:00 2001 From: Katsuya Iida Date: Fri, 28 May 2021 00:22:57 +0900 Subject: [PATCH 08/36] Move import at the head of the file. 
--- TTS/tts/utils/text/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py index d7423102..f6b46783 100644 --- a/TTS/tts/utils/text/__init__.py +++ b/TTS/tts/utils/text/__init__.py @@ -6,6 +6,7 @@ from packaging import version from TTS.tts.utils.text import cleaners from TTS.tts.utils.text.chinese_mandarin.phonemizer import chinese_text_to_phonemes +from TTS.tts.utils.japanese import japanese_text2phone from TTS.tts.utils.text.symbols import _bos, _eos, _punctuations, make_symbols, phonemes, symbols # pylint: disable=unnecessary-comprehension @@ -41,7 +42,6 @@ def text2phone(text, language): return ph if language == "ja-jp": - from TTS.tts.utils.japanese import japanese_text2phone ph = japanese_text2phone(text) return ph From d0c9c1ca5c28d37845ea7a19d399851c5bfd5429 Mon Sep 17 00:00:00 2001 From: Katsuya Iida Date: Sat, 29 May 2021 09:21:47 +0900 Subject: [PATCH 09/36] Move TTS/tts/utils/japanese --- TTS/tts/utils/japanese/__init__.py | 1 - TTS/tts/utils/text/__init__.py | 4 ++-- TTS/tts/utils/text/japanese/__init__.py | 0 .../utils/{japanese/text.py => text/japanese/phonemizer.py} | 2 +- .../text_test.py => text/japanese/phonemizer_test.py} | 6 +++--- 5 files changed, 6 insertions(+), 7 deletions(-) delete mode 100644 TTS/tts/utils/japanese/__init__.py create mode 100644 TTS/tts/utils/text/japanese/__init__.py rename TTS/tts/utils/{japanese/text.py => text/japanese/phonemizer.py} (99%) rename TTS/tts/utils/{japanese/text_test.py => text/japanese/phonemizer_test.py} (77%) diff --git a/TTS/tts/utils/japanese/__init__.py b/TTS/tts/utils/japanese/__init__.py deleted file mode 100644 index 30d963e8..00000000 --- a/TTS/tts/utils/japanese/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .text import japanese_text2phone diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py index f6b46783..f9f44167 100644 --- a/TTS/tts/utils/text/__init__.py +++ b/TTS/tts/utils/text/__init__.py @@ -6,7 +6,7 @@ from packaging import version from TTS.tts.utils.text import cleaners from TTS.tts.utils.text.chinese_mandarin.phonemizer import chinese_text_to_phonemes -from TTS.tts.utils.japanese import japanese_text2phone +from TTS.tts.utils.text.japanese.phonemizer import japanese_text_to_phonemes from TTS.tts.utils.text.symbols import _bos, _eos, _punctuations, make_symbols, phonemes, symbols # pylint: disable=unnecessary-comprehension @@ -42,7 +42,7 @@ def text2phone(text, language): return ph if language == "ja-jp": - ph = japanese_text2phone(text) + ph = japanese_text_to_phonemes(text) return ph raise ValueError(f" [!] Language {language} is not supported for phonemization.") diff --git a/TTS/tts/utils/text/japanese/__init__.py b/TTS/tts/utils/text/japanese/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/TTS/tts/utils/japanese/text.py b/TTS/tts/utils/text/japanese/phonemizer.py similarity index 99% rename from TTS/tts/utils/japanese/text.py rename to TTS/tts/utils/text/japanese/phonemizer.py index 3a705352..f09d5b05 100644 --- a/TTS/tts/utils/japanese/text.py +++ b/TTS/tts/utils/text/japanese/phonemizer.py @@ -371,7 +371,7 @@ def text2kata(text: str) -> str: res.append(word) return hira2kata(''.join(res)) -def japanese_text2phone(text: str) -> str: +def japanese_text_to_phonemes(text: str) -> str: """Convert Japanese text to phonemes. 
""" res = text2kata(text) diff --git a/TTS/tts/utils/japanese/text_test.py b/TTS/tts/utils/text/japanese/phonemizer_test.py similarity index 77% rename from TTS/tts/utils/japanese/text_test.py rename to TTS/tts/utils/text/japanese/phonemizer_test.py index d3ade826..f07c0901 100644 --- a/TTS/tts/utils/japanese/text_test.py +++ b/TTS/tts/utils/text/japanese/phonemizer_test.py @@ -1,5 +1,5 @@ import unittest -from . import japanese_text2phone +from .phonemizer import japanese_text_to_phonemes _TEST_CASES = ''' どちらに行きますか?/dochiraniikimasuka? @@ -13,10 +13,10 @@ ky o: w a o N s e N n i , i k i m a s u ./kyo:waoNseNni,ikimasu. class TestText(unittest.TestCase): - def test_text2phone(self): + def test_japanese_text_to_phonemes(self): for line in _TEST_CASES.strip().split('\n'): text, phone = line.split('/') - self.assertEqual(japanese_text2phone(text), phone) + self.assertEqual(japanese_text_to_phonemes(text), phone) if __name__ == '__main__': unittest.main() From 29d61741ecdc9c377cf3ff3bda622233304e7127 Mon Sep 17 00:00:00 2001 From: Katsuya Iida Date: Sat, 29 May 2021 19:03:23 +0900 Subject: [PATCH 10/36] Copied recipe --- recipes/kokoro/tacotron2-DDC/run.sh | 22 +++++ .../kokoro/tacotron2-DDC/tacotron2-DDC.json | 91 +++++++++++++++++++ 2 files changed, 113 insertions(+) create mode 100644 recipes/kokoro/tacotron2-DDC/run.sh create mode 100644 recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json diff --git a/recipes/kokoro/tacotron2-DDC/run.sh b/recipes/kokoro/tacotron2-DDC/run.sh new file mode 100644 index 00000000..eaa05b60 --- /dev/null +++ b/recipes/kokoro/tacotron2-DDC/run.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# take the scripts's parent's directory to prefix all the output paths. +RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +echo $RUN_DIR +# download LJSpeech dataset +wget http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2 +# extract +tar -xjf LJSpeech-1.1.tar.bz2 +# create train-val splits +shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv +head -n 12000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv +tail -n 1100 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv +mv LJSpeech-1.1 $RUN_DIR/ +rm LJSpeech-1.1.tar.bz2 +# compute dataset mean and variance for normalization +python TTS/bin/compute_statistics.py $RUN_DIR/tacotron2-DDC.json $RUN_DIR/scale_stats.npy --data_path $RUN_DIR/LJSpeech-1.1/wavs/ +# training .... 
+# change the GPU id if needed +CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tacotron.py --config_path $RUN_DIR/tacotron2-DDC.json \ + --coqpit.output_path $RUN_DIR \ + --coqpit.datasets.0.path $RUN_DIR/LJSpeech-1.1/ \ + --coqpit.audio.stats_path $RUN_DIR/scale_stats.npy \ \ No newline at end of file diff --git a/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json b/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json new file mode 100644 index 00000000..9cdbbd3b --- /dev/null +++ b/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json @@ -0,0 +1,91 @@ +{ + "datasets": [ + { + "name": "ljspeech", + "path": "DEFINE THIS", + "meta_file_train": "metadata.csv", + "meta_file_val": null + } + ], + "audio": { + "fft_size": 1024, + "win_length": 1024, + "hop_length": 256, + "frame_length_ms": null, + "frame_shift_ms": null, + "sample_rate": 22050, + "preemphasis": 0.0, + "ref_level_db": 20, + "do_trim_silence": true, + "trim_db": 60, + "power": 1.5, + "griffin_lim_iters": 60, + "num_mels": 80, + "mel_fmin": 50.0, + "mel_fmax": 7600.0, + "spec_gain": 1, + "signal_norm": true, + "min_level_db": -100, + "symmetric_norm": true, + "max_norm": 4.0, + "clip_norm": true, + "stats_path": "scale_stats.npy" + }, + "gst":{ + "gst_embedding_dim": 256, + "gst_num_heads": 4, + "gst_num_style_tokens": 10 + }, + "model": "Tacotron2", + "run_name": "ljspeech-ddc", + "run_description": "tacotron2 with double decoder consistency.", + "batch_size": 64, + "eval_batch_size": 16, + "mixed_precision": true, + "loss_masking": true, + "decoder_loss_alpha": 0.25, + "postnet_loss_alpha": 0.25, + "postnet_diff_spec_alpha": 0.25, + "decoder_diff_spec_alpha": 0.25, + "decoder_ssim_alpha": 0.25, + "postnet_ssim_alpha": 0.25, + "ga_alpha": 5.0, + "stopnet_pos_weight": 15.0, + "run_eval": true, + "test_delay_epochs": 10, + "test_sentences_file": null, + "noam_schedule": true, + "grad_clip": 0.05, + "epochs": 1000, + "lr": 0.001, + "wd": 1e-06, + "warmup_steps": 4000, + "memory_size": -1, + "prenet_type": "original", + "prenet_dropout": true, + "attention_type": "original", + "location_attn": true, + "double_decoder_consistency": true, + "ddc_r": 6, + "attention_norm": "sigmoid", + "r": 6, + "gradual_training": [[0, 6, 64], [10000, 4, 32], [50000, 3, 32], [100000, 2, 32]], + "stopnet": true, + "separate_stopnet": true, + "print_step": 25, + "tb_plot_step": 100, + "print_eval": false, + "save_step": 10000, + "checkpoint": true, + "text_cleaner": "phoneme_cleaners", + "num_loader_workers": 4, + "num_val_loader_workers": 4, + "batch_group_size": 4, + "min_seq_len": 6, + "max_seq_len": 180, + "compute_input_seq_cache": true, + "output_path": "DEFINE THIS", + "phoneme_cache_path": "DEFINE THIS", + "use_phonemes": false, + "phoneme_language": "en-us" +} \ No newline at end of file From c4a5a73f186c40dc80c043edf4300198781769d6 Mon Sep 17 00:00:00 2001 From: Katsuya Iida Date: Sat, 29 May 2021 19:17:27 +0900 Subject: [PATCH 11/36] update Kokoro config --- TTS/tts/configs/kokoro_tacotron2.json | 173 ------------------ .../kokoro/tacotron2-DDC/tacotron2-DDC.json | 82 ++++++--- 2 files changed, 58 insertions(+), 197 deletions(-) delete mode 100644 TTS/tts/configs/kokoro_tacotron2.json diff --git a/TTS/tts/configs/kokoro_tacotron2.json b/TTS/tts/configs/kokoro_tacotron2.json deleted file mode 100644 index f5d41194..00000000 --- a/TTS/tts/configs/kokoro_tacotron2.json +++ /dev/null @@ -1,173 +0,0 @@ -{ - "model": "Tacotron2", - "run_name": "kokoro-ddc", - "run_description": "tacotron2 with DDC and differential spectral loss.", - - // AUDIO PARAMETERS - 
"audio":{ - // stft parameters - "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. - "win_length": 1024, // stft window length in ms. - "hop_length": 256, // stft window hop-lengh in ms. - "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. - "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. - - // Audio processing parameters - "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. - "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. - "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. - - // Silence trimming - "do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (true), TWEB (false), Nancy (true) - "trim_db": 60, // threshold for timming silence. Set this according to your dataset. - - // Griffin-Lim - "power": 1.5, // value to sharpen wav signals after GL algorithm. - "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. - - // MelSpectrogram parameters - "num_mels": 80, // size of the mel spec frame. - "mel_fmin": 50.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! - "mel_fmax": 7600.0, // maximum freq level for mel-spec. Tune for dataset!! - "spec_gain": 1, - - // Normalization parameters - "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. - "min_level_db": -100, // lower bound for normalization - "symmetric_norm": true, // move normalization to range [-1, 1] - "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] - "clip_norm": true, // clip normalized values into the range. - "stats_path": "./scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored - }, - - // VOCABULARY PARAMETERS - // if custom character set is not defined, - // default set in symbols.py is used - "characters":{ - "pad": "_", - "eos": "~", - "bos": "^", - "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ", - "punctuations": "!'(),-.:;? ", - "phonemes": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" - }, - - // DISTRIBUTED TRAINING - "distributed":{ - "backend": "nccl", - "url": "tcp:\/\/localhost:54321" - }, - - "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. - - // TRAINING - "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. - "eval_batch_size":16, - "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. - "gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed. - "mixed_precision": true, // level of optimization with NVIDIA's apex feature for automatic mixed FP16/FP32 precision (AMP), NOTE: currently only O1 is supported, and use "O1" to activate. 
- - // LOSS SETTINGS - "loss_masking": true, // enable / disable loss masking against the sequence padding. - "decoder_loss_alpha": 0.5, // original decoder loss weight. If > 0, it is enabled - "postnet_loss_alpha": 0.25, // original postnet loss weight. If > 0, it is enabled - "postnet_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled - "decoder_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled - "decoder_ssim_alpha": 0.5, // decoder ssim loss weight. If > 0, it is enabled - "postnet_ssim_alpha": 0.25, // postnet ssim loss weight. If > 0, it is enabled - "ga_alpha": 5.0, // weight for guided attention loss. If > 0, guided attention is enabled. - "stopnet_pos_weight": 15.0, // pos class weight for stopnet loss since there are way more negative samples than positive samples. - - - // VALIDATION - "run_eval": true, - "test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time. - "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences. - - // OPTIMIZER - "noam_schedule": false, // use noam warmup and lr schedule. - "grad_clip": 1.0, // upper limit for gradients for clipping. - "epochs": 1000, // total number of epochs to train. - "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. - "wd": 0.000001, // Weight decay weight. - "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" - "seq_len_norm": false, // Normalize eash sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths. - - // TACOTRON PRENET - "memory_size": -1, // ONLY TACOTRON - size of the memory queue used fro storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame. - "prenet_type": "original", // "original" or "bn". - "prenet_dropout": true, // enable/disable dropout at prenet. - - // TACOTRON ATTENTION - "attention_type": "original", // 'original' , 'graves', 'dynamic_convolution' - "attention_heads": 4, // number of attention heads (only for 'graves') - "attention_norm": "sigmoid", // softmax or sigmoid. - "windowing": false, // Enables attention windowing. Used only in eval mode. - "use_forward_attn": false, // if it uses forward attention. In general, it aligns faster. - "forward_attn_mask": false, // Additional masking forcing monotonicity only in eval mode. - "transition_agent": false, // enable/disable transition agent of forward attention. - "location_attn": true, // enable_disable location sensitive attention. It is enabled for TACOTRON by default. - "bidirectional_decoder": false, // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset. - "double_decoder_consistency": true, // use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/ - "ddc_r": 7, // reduction rate for coarse decoder. - - // STOPNET - "stopnet": true, // Train stopnet predicting the end of synthesis. - "separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER. - - // TENSORBOARD and LOGGING - "print_step": 25, // Number of steps to log training on console. 
- "tb_plot_step": 100, // Number of steps to plot TB training figures. - "print_eval": false, // If True, it prints intermediate loss values in evalulation. - "save_step": 10000, // Number of training steps expected to save traninpg stats and checkpoints. - "checkpoint": true, // If true, it saves checkpoints per "save_step" - "keep_all_best": false, // If true, keeps all best_models after keep_after steps - "keep_after": 10000, // Global step after which to keep best models if keep_all_best is true - "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. - - // DATA LOADING - "text_cleaner": "basic_cleaners", - "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. - "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 4, // number of evaluation data loader processes. - "batch_group_size": 4, //Number of batches to shuffle after bucketing. - "min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training - "max_seq_len": 153, // DATASET-RELATED: maximum text length - "compute_input_seq_cache": false, // if true, text sequences are computed before starting training. If phonemes are enabled, they are also computed at this stage. - "use_noise_augment": true, - - // PATHS - "output_path": "./Models/Kokoro/", - - // PHONEMES - "phoneme_cache_path": "./phoneme_cache/", // phoneme computation is slow, therefore, it caches results in the given folder. - "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation. - "phoneme_language": "ja-jp", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages - - // MULTI-SPEAKER and GST - "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. - "use_gst": false, // use global style tokens - "use_external_speaker_embedding_file": false, // if true, forces the model to use external embedding per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 - "external_speaker_embedding_file": "../../speakers-vctk-en.json", // if not null and use_external_speaker_embedding_file is true, it is used to load a specific embedding file and thus uses these embeddings instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 - "gst": { // gst parameter if gst is enabled - "gst_style_input": null, // Condition the style input either on a - // -> wave file [path to wave] or - // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15} - // with the dictionary being len(dict) <= len(gst_style_tokens). - "gst_embedding_dim": 512, - "gst_num_heads": 4, - "gst_style_tokens": 10, - "gst_use_speaker_embedding": false - }, - - // DATASETS - "datasets": // List of datasets. They all merged and they get different speaker_ids. 
- [ - { - "name": "kokoro", - "path": "./kokoro-speech-v1_1-small/", - "meta_file_train": "metadata.csv", // for vtck if list, ignore speakers id in list for train, its useful for test cloning with new speakers - "meta_file_val": null - } - ] -} diff --git a/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json b/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json index 9cdbbd3b..1aaec547 100644 --- a/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json +++ b/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json @@ -1,8 +1,8 @@ { "datasets": [ { - "name": "ljspeech", - "path": "DEFINE THIS", + "name": "kokoro", + "path": "./kokoro-speech-v1_1-tiny/", "meta_file_train": "metadata.csv", "meta_file_val": null } @@ -32,44 +32,61 @@ "stats_path": "scale_stats.npy" }, "gst":{ - "gst_embedding_dim": 256, + "gst_style_input": null, + + + + "gst_embedding_dim": 512, "gst_num_heads": 4, - "gst_num_style_tokens": 10 - }, + "gst_style_tokens": 10, + "gst_use_speaker_embedding": false + }, "model": "Tacotron2", - "run_name": "ljspeech-ddc", - "run_description": "tacotron2 with double decoder consistency.", - "batch_size": 64, + "run_name": "kokoro-ddc", + "run_description": "tacotron2 with DDC and differential spectral loss.", + "batch_size": 32, "eval_batch_size": 16, "mixed_precision": true, + "distributed": { + "backend": "nccl", + "url": "tcp:\/\/localhost:54321" + }, + "reinit_layers": [], "loss_masking": true, - "decoder_loss_alpha": 0.25, + "decoder_loss_alpha": 0.5, "postnet_loss_alpha": 0.25, "postnet_diff_spec_alpha": 0.25, "decoder_diff_spec_alpha": 0.25, - "decoder_ssim_alpha": 0.25, + "decoder_ssim_alpha": 0.5, "postnet_ssim_alpha": 0.25, "ga_alpha": 5.0, "stopnet_pos_weight": 15.0, "run_eval": true, "test_delay_epochs": 10, "test_sentences_file": null, - "noam_schedule": true, - "grad_clip": 0.05, + "noam_schedule": false, + "grad_clip": 1.0, "epochs": 1000, - "lr": 0.001, - "wd": 1e-06, + "lr": 0.0001, + "wd": 0.000001, "warmup_steps": 4000, + "seq_len_norm": false, "memory_size": -1, "prenet_type": "original", "prenet_dropout": true, "attention_type": "original", + "windowing": false, + "use_forward_attn": false, + "forward_attn_mask": false, + "transition_agent": false, "location_attn": true, + "bidirectional_decoder": false, "double_decoder_consistency": true, - "ddc_r": 6, + "ddc_r": 7, + "attention_heads": 4, "attention_norm": "sigmoid", - "r": 6, - "gradual_training": [[0, 6, 64], [10000, 4, 32], [50000, 3, 32], [100000, 2, 32]], + "r": 7, + "gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], "stopnet": true, "separate_stopnet": true, "print_step": 25, @@ -77,15 +94,32 @@ "print_eval": false, "save_step": 10000, "checkpoint": true, - "text_cleaner": "phoneme_cleaners", + "keep_all_best": false, + "keep_after": 10000, + "tb_model_param_stats": false, + "text_cleaner": "basic_cleaners", + "enable_eos_bos_chars": false, "num_loader_workers": 4, "num_val_loader_workers": 4, "batch_group_size": 4, "min_seq_len": 6, - "max_seq_len": 180, - "compute_input_seq_cache": true, - "output_path": "DEFINE THIS", - "phoneme_cache_path": "DEFINE THIS", - "use_phonemes": false, - "phoneme_language": "en-us" + "max_seq_len": 153, + "compute_input_seq_cache": false, + "use_noise_augment": true, + "output_path": "./Models/Kokoro/", + "phoneme_cache_path": "./phoneme_cache/", + "use_phonemes": true, + "phoneme_language": "ja-jp", + "characters": { + "pad": "_", + "eos": "~", + "bos": "^", + "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? 
", + "punctuations": "!'(),-.:;? ", + "phonemes": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" + }, + "use_speaker_embedding": false, + "use_gst": false, + "use_external_speaker_embedding_file": false, + "external_speaker_embedding_file": "../../speakers-vctk-en.json" } \ No newline at end of file From 88f3255962073d84d1c7d559b956a0330a6fd11d Mon Sep 17 00:00:00 2001 From: Katsuya Iida Date: Sat, 29 May 2021 19:39:51 +0900 Subject: [PATCH 12/36] Update Kokoro recipe --- recipes/kokoro/tacotron2-DDC/run.sh | 27 ++++++++++--------- .../kokoro/tacotron2-DDC/tacotron2-DDC.json | 6 ++--- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/recipes/kokoro/tacotron2-DDC/run.sh b/recipes/kokoro/tacotron2-DDC/run.sh index eaa05b60..cd2aaff5 100644 --- a/recipes/kokoro/tacotron2-DDC/run.sh +++ b/recipes/kokoro/tacotron2-DDC/run.sh @@ -1,22 +1,23 @@ #!/bin/bash # take the scripts's parent's directory to prefix all the output paths. RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +CORPUS=kokoro-speech-v1_1-tiny echo $RUN_DIR -# download LJSpeech dataset -wget http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2 -# extract -tar -xjf LJSpeech-1.1.tar.bz2 +if [ \! -d $RUN_DIR/$CORPUS ] ; then + echo "$RUN_DIR/$CORPUS doesn't exist." + echo "Follow the instruction of https://github.com/kaiidams/Kokoro-Speech-Dataset to make the corpus." + exit 1 +fi # create train-val splits -shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv -head -n 12000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv -tail -n 1100 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv -mv LJSpeech-1.1 $RUN_DIR/ -rm LJSpeech-1.1.tar.bz2 +shuf $RUN_DIR/$CORPUS/metadata.csv > $RUN_DIR/$CORPUS/metadata_shuf.csv +head -n 8000 $RUN_DIR/$CORPUS/metadata_shuf.csv > $RUN_DIR/$CORPUS/metadata_train.csv +tail -n 812 $RUN_DIR/$CORPUS/metadata_shuf.csv > $RUN_DIR/$CORPUS/metadata_val.csv # compute dataset mean and variance for normalization -python TTS/bin/compute_statistics.py $RUN_DIR/tacotron2-DDC.json $RUN_DIR/scale_stats.npy --data_path $RUN_DIR/LJSpeech-1.1/wavs/ +python TTS/bin/compute_statistics.py $RUN_DIR/tacotron2-DDC.json $RUN_DIR/scale_stats.npy --data_path $RUN_DIR/$CORPUS/wavs/ # training .... 
# change the GPU id if needed CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tacotron.py --config_path $RUN_DIR/tacotron2-DDC.json \ - --coqpit.output_path $RUN_DIR \ - --coqpit.datasets.0.path $RUN_DIR/LJSpeech-1.1/ \ - --coqpit.audio.stats_path $RUN_DIR/scale_stats.npy \ \ No newline at end of file + --coqpit.output_path $RUN_DIR \ + --coqpit.datasets.0.path $RUN_DIR/$CORPUS \ + --coqpit.audio.stats_path $RUN_DIR/scale_stats.npy \ + --coqpit.phoneme_cache_path $RUN_DIR/phoneme_cache \ \ No newline at end of file diff --git a/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json b/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json index 1aaec547..b3630055 100644 --- a/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json +++ b/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json @@ -2,7 +2,7 @@ "datasets": [ { "name": "kokoro", - "path": "./kokoro-speech-v1_1-tiny/", + "path": "DEFINE THIS", "meta_file_train": "metadata.csv", "meta_file_val": null } @@ -106,8 +106,8 @@ "max_seq_len": 153, "compute_input_seq_cache": false, "use_noise_augment": true, - "output_path": "./Models/Kokoro/", - "phoneme_cache_path": "./phoneme_cache/", + "output_path": "DEFINE THIS", + "phoneme_cache_path": "DEFINE THIS", "use_phonemes": true, "phoneme_language": "ja-jp", "characters": { From 2091e808c82647787b571f1b17e80378d203e830 Mon Sep 17 00:00:00 2001 From: Katsuya Iida Date: Sat, 29 May 2021 19:41:00 +0900 Subject: [PATCH 13/36] Fix path --- recipes/kokoro/tacotron2-DDC/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes/kokoro/tacotron2-DDC/run.sh b/recipes/kokoro/tacotron2-DDC/run.sh index cd2aaff5..86fda642 100644 --- a/recipes/kokoro/tacotron2-DDC/run.sh +++ b/recipes/kokoro/tacotron2-DDC/run.sh @@ -1,7 +1,7 @@ #!/bin/bash # take the scripts's parent's directory to prefix all the output paths. RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" -CORPUS=kokoro-speech-v1_1-tiny +CORPUS=kokoro-speech-v1_1-small echo $RUN_DIR if [ \! -d $RUN_DIR/$CORPUS ] ; then echo "$RUN_DIR/$CORPUS doesn't exist." From d9f1268f99f3a91078f9f4806e9283b615c51c03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 31 May 2021 15:47:07 +0200 Subject: [PATCH 14/36] init tb_logger None for rank > 0 processes --- TTS/utils/arguments.py | 1 + 1 file changed, 1 insertion(+) diff --git a/TTS/utils/arguments.py b/TTS/utils/arguments.py index 1b5a424b..5e6acd1d 100644 --- a/TTS/utils/arguments.py +++ b/TTS/utils/arguments.py @@ -152,6 +152,7 @@ def process_args(args): experiment_path = create_experiment_folder(config.output_path, config.run_name, args.debug) audio_path = os.path.join(experiment_path, "test_audios") # setup rank 0 process in distributed training + tb_logger = None if args.rank == 0: os.makedirs(audio_path, exist_ok=True) new_fields = {} From 975531b3f27510d3668218086923fde9f2a8562d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 31 May 2021 16:11:22 +0200 Subject: [PATCH 15/36] update `pylintrc` for torch and numpy functions --- .pylintrc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pylintrc b/.pylintrc index 0bc0be4b..1b3d64c2 100644 --- a/.pylintrc +++ b/.pylintrc @@ -253,7 +253,7 @@ contextmanager-decorators=contextlib.contextmanager # List of members which are set dynamically and missed by pylint inference # system, and so shouldn't trigger E1101 when accessed. Python regular # expressions are accepted. 
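# numpy and torch build many members at runtime (C extensions), so pylint's
# inference misses them and would raise false E1101 errors without this list.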
-generated-members= +generated-members=numpy.*,torch.* # Tells whether missing members accessed in mixin class should be ignored. A # mixin class is detected if its name ends with "mixin" (case insensitive). From bec85ac58d21536e8bbd395eac5f7b70a1618206 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 31 May 2021 16:37:15 +0200 Subject: [PATCH 16/36] make style --- TTS/bin/compute_embeddings.py | 4 +- TTS/bin/train_encoder.py | 9 +-- TTS/speaker_encoder/dataset.py | 49 +++++++------ TTS/speaker_encoder/losses.py | 10 ++- TTS/speaker_encoder/models/resnet.py | 37 ++++++---- TTS/speaker_encoder/speaker_encoder_config.py | 5 +- TTS/speaker_encoder/utils/generic_utils.py | 69 ++++++++++++------- tests/test_speaker_encoder.py | 5 ++ tests/test_speaker_encoder_train.py | 2 +- 9 files changed, 115 insertions(+), 75 deletions(-) diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py index 003da1e5..872fc875 100644 --- a/TTS/bin/compute_embeddings.py +++ b/TTS/bin/compute_embeddings.py @@ -5,11 +5,11 @@ import os import torch from tqdm import tqdm +from TTS.config import BaseDatasetConfig, load_config from TTS.speaker_encoder.utils.generic_utils import setup_model from TTS.tts.datasets.preprocess import load_meta_data from TTS.tts.utils.speakers import SpeakerManager from TTS.utils.audio import AudioProcessor -from TTS.config import load_config, BaseDatasetConfig parser = argparse.ArgumentParser( description='Compute embedding vectors for each wav file in a dataset. If "target_dataset" is defined, it generates "speakers.json" necessary for training a multi-speaker model.' @@ -100,7 +100,7 @@ for idx, wav_file in enumerate(tqdm(wav_files)): if speaker_mapping: # save speaker_mapping if target dataset is defined - if '.json' not in args.output_path: + if ".json" not in args.output_path: mapping_file_path = os.path.join(args.output_path, "speakers.json") else: mapping_file_path = args.output_path diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py index c9493535..48309dc9 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -10,10 +10,8 @@ import torch from torch.utils.data import DataLoader from TTS.speaker_encoder.dataset import SpeakerEncoderDataset - from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss from TTS.speaker_encoder.utils.generic_utils import save_best_model, setup_model - from TTS.speaker_encoder.utils.visual import plot_embeddings from TTS.tts.datasets.preprocess import load_meta_data from TTS.utils.arguments import init_training @@ -45,7 +43,7 @@ def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False storage_size=c.storage["storage_size"], sample_from_storage_p=c.storage["sample_from_storage_p"], verbose=verbose, - augmentation_config=c.audio_augmentation + augmentation_config=c.audio_augmentation, ) # sampler = DistributedSampler(dataset) if num_gpus > 1 else None @@ -170,19 +168,18 @@ def main(args): # pylint: disable=redefined-outer-name else: raise Exception("The %s not is a loss supported" % c.loss) - if args.restore_path: checkpoint = torch.load(args.restore_path) try: model.load_state_dict(checkpoint["model"]) - if 'criterion' in checkpoint: + if "criterion" in checkpoint: criterion.load_state_dict(checkpoint["criterion"]) except (KeyError, RuntimeError): print(" > Partial model initialization.") model_dict = model.state_dict() - model_dict = set_init_dict(model_dict, checkpoint['model'], c) + model_dict = set_init_dict(model_dict, 
checkpoint["model"], c) model.load_state_dict(model_dict) del model_dict for group in optimizer.param_groups: diff --git a/TTS/speaker_encoder/dataset.py b/TTS/speaker_encoder/dataset.py index cd95a4f5..6b2b0dd4 100644 --- a/TTS/speaker_encoder/dataset.py +++ b/TTS/speaker_encoder/dataset.py @@ -1,24 +1,25 @@ - import random import numpy as np import torch from torch.utils.data import Dataset + from TTS.speaker_encoder.utils.generic_utils import AugmentWAV, Storage + class SpeakerEncoderDataset(Dataset): def __init__( - self, - ap, - meta_data, - voice_len=1.6, - num_speakers_in_batch=64, - storage_size=1, - sample_from_storage_p=0.5, - num_utter_per_speaker=10, - skip_speakers=False, - verbose=False, - augmentation_config=None + self, + ap, + meta_data, + voice_len=1.6, + num_speakers_in_batch=64, + storage_size=1, + sample_from_storage_p=0.5, + num_utter_per_speaker=10, + skip_speakers=False, + verbose=False, + augmentation_config=None, ): """ Args: @@ -38,23 +39,25 @@ class SpeakerEncoderDataset(Dataset): self.verbose = verbose self.__parse_items() storage_max_size = storage_size * num_speakers_in_batch - self.storage = Storage(maxsize=storage_max_size, storage_batchs=storage_size, num_speakers_in_batch=num_speakers_in_batch) + self.storage = Storage( + maxsize=storage_max_size, storage_batchs=storage_size, num_speakers_in_batch=num_speakers_in_batch + ) self.sample_from_storage_p = float(sample_from_storage_p) speakers_aux = list(self.speakers) speakers_aux.sort() - self.speakerid_to_classid = {key : i for i, key in enumerate(speakers_aux)} + self.speakerid_to_classid = {key: i for i, key in enumerate(speakers_aux)} # Augmentation self.augmentator = None self.gaussian_augmentation_config = None if augmentation_config: - self.data_augmentation_p = augmentation_config['p'] - if self.data_augmentation_p and ('additive' in augmentation_config or 'rir' in augmentation_config): + self.data_augmentation_p = augmentation_config["p"] + if self.data_augmentation_p and ("additive" in augmentation_config or "rir" in augmentation_config): self.augmentator = AugmentWAV(ap, augmentation_config) - if 'gaussian' in augmentation_config.keys(): - self.gaussian_augmentation_config = augmentation_config['gaussian'] + if "gaussian" in augmentation_config.keys(): + self.gaussian_augmentation_config = augmentation_config["gaussian"] if self.verbose: print("\n > DataLoader initialization") @@ -231,9 +234,13 @@ class SpeakerEncoderDataset(Dataset): offset = random.randint(0, wav.shape[0] - self.seq_len) wav = wav[offset : offset + self.seq_len] # add random gaussian noise - if self.gaussian_augmentation_config and self.gaussian_augmentation_config['p']: - if random.random() < self.gaussian_augmentation_config['p']: - wav += np.random.normal(self.gaussian_augmentation_config['min_amplitude'], self.gaussian_augmentation_config['max_amplitude'], size=len(wav)) + if self.gaussian_augmentation_config and self.gaussian_augmentation_config["p"]: + if random.random() < self.gaussian_augmentation_config["p"]: + wav += np.random.normal( + self.gaussian_augmentation_config["min_amplitude"], + self.gaussian_augmentation_config["max_amplitude"], + size=len(wav), + ) mel = self.ap.melspectrogram(wav) feats_.append(torch.FloatTensor(mel)) diff --git a/TTS/speaker_encoder/losses.py b/TTS/speaker_encoder/losses.py index 9b573b6d..ac7e62bf 100644 --- a/TTS/speaker_encoder/losses.py +++ b/TTS/speaker_encoder/losses.py @@ -162,6 +162,7 @@ class AngleProtoLoss(nn.Module): L = self.criterion(cos_sim_matrix, label) return L + class 
SoftmaxLoss(nn.Module): """ Implementation of the Softmax loss as defined in https://arxiv.org/abs/2003.11982 @@ -169,13 +170,14 @@ class SoftmaxLoss(nn.Module): - embedding_dim (float): speaker embedding dim - n_speakers (float): number of speakers """ + def __init__(self, embedding_dim, n_speakers): super().__init__() self.criterion = torch.nn.CrossEntropyLoss() self.fc = nn.Linear(embedding_dim, n_speakers) - print('Initialised Softmax Loss') + print("Initialised Softmax Loss") def forward(self, x, label=None): # reshape for compatibility @@ -187,6 +189,7 @@ class SoftmaxLoss(nn.Module): return L + class SoftmaxAngleProtoLoss(nn.Module): """ Implementation of the Softmax AnglePrototypical loss as defined in https://arxiv.org/abs/2009.14153 @@ -196,13 +199,14 @@ class SoftmaxAngleProtoLoss(nn.Module): - init_w (float): defines the initial value of w - init_b (float): definies the initial value of b """ + def __init__(self, embedding_dim, n_speakers, init_w=10.0, init_b=-5.0): super().__init__() self.softmax = SoftmaxLoss(embedding_dim, n_speakers) self.angleproto = AngleProtoLoss(init_w, init_b) - print('Initialised SoftmaxAnglePrototypical Loss') + print("Initialised SoftmaxAnglePrototypical Loss") def forward(self, x, label=None): """ @@ -213,4 +217,4 @@ class SoftmaxAngleProtoLoss(nn.Module): Ls = self.softmax(x, label) - return Ls+Lp + return Ls + Lp diff --git a/TTS/speaker_encoder/models/resnet.py b/TTS/speaker_encoder/models/resnet.py index aa2171ed..ce86b01f 100644 --- a/TTS/speaker_encoder/models/resnet.py +++ b/TTS/speaker_encoder/models/resnet.py @@ -1,7 +1,8 @@ -import torch import numpy as np +import torch import torch.nn as nn + class SELayer(nn.Module): def __init__(self, channel, reduction=8): super(SELayer, self).__init__() @@ -10,7 +11,7 @@ class SELayer(nn.Module): nn.Linear(channel, channel // reduction), nn.ReLU(inplace=True), nn.Linear(channel // reduction, channel), - nn.Sigmoid() + nn.Sigmoid(), ) def forward(self, x): @@ -19,6 +20,7 @@ class SELayer(nn.Module): y = self.fc(y).view(b, c, 1, 1) return x * y + class SEBasicBlock(nn.Module): expansion = 1 @@ -51,12 +53,22 @@ class SEBasicBlock(nn.Module): out = self.relu(out) return out + class ResNetSpeakerEncoder(nn.Module): """Implementation of the model H/ASP without batch normalization in speaker embedding. 
This model was proposed in: https://arxiv.org/abs/2009.14153 Adapted from: https://github.com/clovaai/voxceleb_trainer """ + # pylint: disable=W0102 - def __init__(self, input_dim=64, proj_dim=512, layers=[3, 4, 6, 3], num_filters=[32, 64, 128, 256], encoder_type='ASP', log_input=False): + def __init__( + self, + input_dim=64, + proj_dim=512, + layers=[3, 4, 6, 3], + num_filters=[32, 64, 128, 256], + encoder_type="ASP", + log_input=False, + ): super(ResNetSpeakerEncoder, self).__init__() self.encoder_type = encoder_type @@ -74,7 +86,7 @@ class ResNetSpeakerEncoder(nn.Module): self.instancenorm = nn.InstanceNorm1d(input_dim) - outmap_size = int(self.input_dim/8) + outmap_size = int(self.input_dim / 8) self.attention = nn.Sequential( nn.Conv1d(num_filters[3] * outmap_size, 128, kernel_size=1), @@ -82,14 +94,14 @@ class ResNetSpeakerEncoder(nn.Module): nn.BatchNorm1d(128), nn.Conv1d(128, num_filters[3] * outmap_size, kernel_size=1), nn.Softmax(dim=2), - ) + ) if self.encoder_type == "SAP": out_dim = num_filters[3] * outmap_size elif self.encoder_type == "ASP": out_dim = num_filters[3] * outmap_size * 2 else: - raise ValueError('Undefined encoder') + raise ValueError("Undefined encoder") self.fc = nn.Linear(out_dim, proj_dim) @@ -98,7 +110,7 @@ class ResNetSpeakerEncoder(nn.Module): def _init_layers(self): for m in self.modules(): if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") elif isinstance(m, nn.BatchNorm2d): nn.init.constant_(m.weight, 1) nn.init.constant_(m.bias, 0) @@ -107,8 +119,7 @@ class ResNetSpeakerEncoder(nn.Module): downsample = None if stride != 1 or self.inplanes != planes * block.expansion: downsample = nn.Sequential( - nn.Conv2d(self.inplanes, planes * block.expansion, - kernel_size=1, stride=stride, bias=False), + nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False), nn.BatchNorm2d(planes * block.expansion), ) @@ -131,7 +142,7 @@ class ResNetSpeakerEncoder(nn.Module): with torch.no_grad(): with torch.cuda.amp.autocast(enabled=False): if self.log_input: - x = (x+1e-6).log() + x = (x + 1e-6).log() x = self.instancenorm(x).unsqueeze(1) x = self.conv1(x) @@ -151,7 +162,7 @@ class ResNetSpeakerEncoder(nn.Module): x = torch.sum(x * w, dim=2) elif self.encoder_type == "ASP": mu = torch.sum(x * w, dim=2) - sg = torch.sqrt((torch.sum((x**2) * w, dim=2) - mu ** 2).clamp(min=1e-5)) + sg = torch.sqrt((torch.sum((x ** 2) * w, dim=2) - mu ** 2).clamp(min=1e-5)) x = torch.cat((mu, sg), 1) x = x.view(x.size()[0], -1) @@ -172,12 +183,12 @@ class ResNetSpeakerEncoder(nn.Module): if max_len < num_frames: num_frames = max_len - offsets = np.linspace(0, max_len-num_frames, num=num_eval) + offsets = np.linspace(0, max_len - num_frames, num=num_eval) frames_batch = [] for offset in offsets: offset = int(offset) - end_offset = int(offset+num_frames) + end_offset = int(offset + num_frames) frames = x[:, offset:end_offset] frames_batch.append(frames) diff --git a/TTS/speaker_encoder/speaker_encoder_config.py b/TTS/speaker_encoder/speaker_encoder_config.py index 31149822..e830a0f5 100644 --- a/TTS/speaker_encoder/speaker_encoder_config.py +++ b/TTS/speaker_encoder/speaker_encoder_config.py @@ -25,10 +25,7 @@ class SpeakerEncoderConfig(BaseTrainingConfig): } ) - audio_augmentation : dict = field( - default_factory=lambda: { - } - ) + audio_augmentation: dict = field(default_factory=lambda: {}) storage: dict = field( 
default_factory=lambda: { diff --git a/TTS/speaker_encoder/utils/generic_utils.py b/TTS/speaker_encoder/utils/generic_utils.py index 3299f75a..fb61e48e 100644 --- a/TTS/speaker_encoder/utils/generic_utils.py +++ b/TTS/speaker_encoder/utils/generic_utils.py @@ -1,18 +1,18 @@ -import re +import datetime +import glob import os +import random +import re +from multiprocessing import Manager import numpy as np import torch -import glob -import random -import datetime - from scipy import signal -from multiprocessing import Manager from TTS.speaker_encoder.models.lstm import LSTMSpeakerEncoder from TTS.speaker_encoder.models.resnet import ResNetSpeakerEncoder + class Storage(object): def __init__(self, maxsize, storage_batchs, num_speakers_in_batch, num_threads=8): # use multiprocessing for threading safe @@ -53,19 +53,19 @@ class Storage(object): return self.storage[random.randint(0, storage_size)] def get_random_sample_fast(self): - '''Call this method only when storage is full''' + """Call this method only when storage is full""" return self.storage[random.randint(0, self.safe_storage_size)] -class AugmentWAV(object): +class AugmentWAV(object): def __init__(self, ap, augmentation_config): self.ap = ap self.use_additive_noise = False - if 'additive' in augmentation_config.keys(): - self.additive_noise_config = augmentation_config['additive'] - additive_path = self.additive_noise_config['sounds_path'] + if "additive" in augmentation_config.keys(): + self.additive_noise_config = augmentation_config["additive"] + additive_path = self.additive_noise_config["sounds_path"] if additive_path: self.use_additive_noise = True # get noise types @@ -74,12 +74,12 @@ class AugmentWAV(object): if isinstance(self.additive_noise_config[key], dict): self.additive_noise_types.append(key) - additive_files = glob.glob(os.path.join(additive_path, '**/*.wav'), recursive=True) + additive_files = glob.glob(os.path.join(additive_path, "**/*.wav"), recursive=True) self.noise_list = {} for wav_file in additive_files: - noise_dir = wav_file.replace(additive_path, '').split(os.sep)[0] + noise_dir = wav_file.replace(additive_path, "").split(os.sep)[0] # ignore not listed directories if noise_dir not in self.additive_noise_types: continue @@ -87,14 +87,16 @@ class AugmentWAV(object): self.noise_list[noise_dir] = [] self.noise_list[noise_dir].append(wav_file) - print(f" | > Using Additive Noise Augmentation: with {len(additive_files)} audios instances from {self.additive_noise_types}") + print( + f" | > Using Additive Noise Augmentation: with {len(additive_files)} audios instances from {self.additive_noise_types}" + ) self.use_rir = False - if 'rir' in augmentation_config.keys(): - self.rir_config = augmentation_config['rir'] - if self.rir_config['rir_path']: - self.rir_files = glob.glob(os.path.join(self.rir_config['rir_path'], '**/*.wav'), recursive=True) + if "rir" in augmentation_config.keys(): + self.rir_config = augmentation_config["rir"] + if self.rir_config["rir_path"]: + self.rir_files = glob.glob(os.path.join(self.rir_config["rir_path"], "**/*.wav"), recursive=True) self.use_rir = True print(f" | > Using RIR Noise Augmentation: with {len(self.rir_files)} audios instances") @@ -111,9 +113,15 @@ class AugmentWAV(object): def additive_noise(self, noise_type, audio): - clean_db = 10 * np.log10(np.mean(audio**2) + 1e-4) + clean_db = 10 * np.log10(np.mean(audio ** 2) + 1e-4) - noise_list = random.sample(self.noise_list[noise_type], random.randint(self.additive_noise_config[noise_type]['min_num_noises'], 
self.additive_noise_config[noise_type]['max_num_noises'])) + noise_list = random.sample( + self.noise_list[noise_type], + random.randint( + self.additive_noise_config[noise_type]["min_num_noises"], + self.additive_noise_config[noise_type]["max_num_noises"], + ), + ) audio_len = audio.shape[0] noises_wav = None @@ -123,7 +131,10 @@ class AugmentWAV(object): if noiseaudio.shape[0] < audio_len: continue - noise_snr = random.uniform(self.additive_noise_config[noise_type]['min_snr_in_db'], self.additive_noise_config[noise_type]['max_num_noises']) + noise_snr = random.uniform( + self.additive_noise_config[noise_type]["min_snr_in_db"], + self.additive_noise_config[noise_type]["max_num_noises"], + ) noise_db = 10 * np.log10(np.mean(noiseaudio ** 2) + 1e-4) noise_wav = np.sqrt(10 ** ((clean_db - noise_db - noise_snr) / 10)) * noiseaudio @@ -144,7 +155,7 @@ class AugmentWAV(object): rir_file = random.choice(self.rir_files) rir = self.ap.load_wav(rir_file, sr=self.ap.sample_rate) rir = rir / np.sqrt(np.sum(rir ** 2)) - return signal.convolve(audio, rir, mode=self.rir_config['conv_mode'])[:audio_len] + return signal.convolve(audio, rir, mode=self.rir_config["conv_mode"])[:audio_len] def apply_one(self, audio): noise_type = random.choice(self.global_noise_list) @@ -153,17 +164,25 @@ class AugmentWAV(object): return self.additive_noise(noise_type, audio) + def to_camel(text): text = text.capitalize() return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text) + def setup_model(c): - if c.model_params['model_name'].lower() == 'lstm': - model = LSTMSpeakerEncoder(c.model_params["input_dim"], c.model_params["proj_dim"], c.model_params["lstm_dim"], c.model_params["num_lstm_layers"]) - elif c.model_params['model_name'].lower() == 'resnet': + if c.model_params["model_name"].lower() == "lstm": + model = LSTMSpeakerEncoder( + c.model_params["input_dim"], + c.model_params["proj_dim"], + c.model_params["lstm_dim"], + c.model_params["num_lstm_layers"], + ) + elif c.model_params["model_name"].lower() == "resnet": model = ResNetSpeakerEncoder(input_dim=c.model_params["input_dim"], proj_dim=c.model_params["proj_dim"]) return model + def save_checkpoint(model, optimizer, criterion, model_loss, out_path, current_step, epoch): checkpoint_path = "checkpoint_{}.pth.tar".format(current_step) checkpoint_path = os.path.join(out_path, checkpoint_path) diff --git a/tests/test_speaker_encoder.py b/tests/test_speaker_encoder.py index f56a9577..0bb07f37 100644 --- a/tests/test_speaker_encoder.py +++ b/tests/test_speaker_encoder.py @@ -6,6 +6,7 @@ from tests import get_tests_input_path from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss from TTS.speaker_encoder.models.lstm import LSTMSpeakerEncoder from TTS.speaker_encoder.models.resnet import ResNetSpeakerEncoder + file_path = get_tests_input_path() @@ -39,6 +40,7 @@ class LSTMSpeakerEncoderTests(unittest.TestCase): assert output.shape[1] == 256 assert len(output.shape) == 2 + class ResNetSpeakerEncoderTests(unittest.TestCase): # pylint: disable=R0201 def test_in_out(self): @@ -65,6 +67,7 @@ class ResNetSpeakerEncoderTests(unittest.TestCase): assert output.shape[1] == 256 assert len(output.shape) == 2 + class GE2ELossTests(unittest.TestCase): # pylint: disable=R0201 def test_in_out(self): @@ -92,6 +95,7 @@ class GE2ELossTests(unittest.TestCase): output = loss.forward(dummy_input) assert output.item() < 0.005 + class AngleProtoLossTests(unittest.TestCase): # pylint: disable=R0201 def test_in_out(self): @@ -121,6 +125,7 @@ class 
AngleProtoLossTests(unittest.TestCase): output = loss.forward(dummy_input) assert output.item() < 0.005 + class SoftmaxAngleProtoLossTests(unittest.TestCase): # pylint: disable=R0201 def test_in_out(self): diff --git a/tests/test_speaker_encoder_train.py b/tests/test_speaker_encoder_train.py index e168a785..21b12074 100644 --- a/tests/test_speaker_encoder_train.py +++ b/tests/test_speaker_encoder_train.py @@ -46,7 +46,7 @@ run_cli(command_train) shutil.rmtree(continue_path) # test resnet speaker encoder -config.model_params['model_name'] = "resnet" +config.model_params["model_name"] = "resnet" config.save_json(config_path) # train the model for one epoch From 4726ae393d20313d2435d9cb59f3b0f04aa993eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 1 Jun 2021 09:14:16 +0200 Subject: [PATCH 17/36] pylint disable `not-callable` checks due to the warnings on torch layers --- .pylintrc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.pylintrc b/.pylintrc index 1b3d64c2..34c121eb 100644 --- a/.pylintrc +++ b/.pylintrc @@ -158,7 +158,8 @@ disable=missing-docstring, deprecated-sys-function, exception-escape, comprehension-escape, - duplicate-code + duplicate-code, + not-callable # Enable the message, report, category or checker with the given id(s). You can # either give multiple identifier separated by comma (,) or put this option From 406c4d057728222602baf05b8cb7ed824d09a04c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 1 Jun 2021 09:15:15 +0200 Subject: [PATCH 18/36] bump pylint version to 2.8.3 --- requirements.dev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.dev.txt b/requirements.dev.txt index 144a0ed6..afb5ebe6 100644 --- a/requirements.dev.txt +++ b/requirements.dev.txt @@ -2,4 +2,4 @@ black coverage isort nose -pylint==2.7.4 +pylint==2.8.3 From d0ab0382fc2edd9dcfc07aa6d4ffdf654adef451 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 1 Jun 2021 09:15:32 +0200 Subject: [PATCH 19/36] linter fixes --- TTS/bin/distribute.py | 2 +- TTS/utils/manage.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/TTS/bin/distribute.py b/TTS/bin/distribute.py index 0bd27275..ea43f88b 100644 --- a/TTS/bin/distribute.py +++ b/TTS/bin/distribute.py @@ -51,7 +51,7 @@ def main(): my_env["PYTHON_EGG_CACHE"] = "/tmp/tmp{}".format(i) command[-1] = "--rank={}".format(i) stdout = None if i == 0 else open(os.devnull, "w") - p = subprocess.Popen(["python3"] + command, stdout=stdout, env=my_env) + p = subprocess.Popen(["python3"] + command, stdout=stdout, env=my_env) # pylint: disable=consider-using-with processes.append(p) print(command) diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index 2e3caa81..cf7df7de 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -149,8 +149,8 @@ class ModelManager(object): def _download_zip_file(file_url, output): """Download the github releases""" r = requests.get(file_url) - z = zipfile.ZipFile(io.BytesIO(r.content)) - z.extractall(output) + with zipfile.ZipFile(io.BytesIO(r.content)) as z: + z.extractall(output) for file_path in z.namelist()[1:]: src_path = os.path.join(output, file_path) dst_path = os.path.join(output, os.path.basename(file_path)) From 5b89ef2c6e5895b168f8f150ddce345dcee6be91 Mon Sep 17 00:00:00 2001 From: Alexander Korolev Date: Tue, 1 Jun 2021 11:06:35 +0200 Subject: [PATCH 20/36] fix speaker-embeddings dimension during inference --- TTS/tts/models/tacotron2.py | 1 + 1 file changed, 1 insertion(+) 
diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index fded8f87..a5db64e9 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -277,6 +277,7 @@ class Tacotron2(TacotronAbstract): if self.num_speakers > 1: if not self.embeddings_per_sample: speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None] + speaker_embeddings = torch.unsqueeze(speaker_embeddings, 0).transpose(1, 2) encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) mel_outputs, alignments, stop_tokens = self.decoder.inference_truncated(encoder_outputs) From 1cc18d19729545c83e2a7482b949f896fd714ef4 Mon Sep 17 00:00:00 2001 From: Katsuya Iida Date: Tue, 1 Jun 2021 18:51:34 +0900 Subject: [PATCH 21/36] Move unittest of Japanese phonemizer. --- .../tts_tests/test_japanese_phonemizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename TTS/tts/utils/text/japanese/phonemizer_test.py => tests/tts_tests/test_japanese_phonemizer.py (89%) diff --git a/TTS/tts/utils/text/japanese/phonemizer_test.py b/tests/tts_tests/test_japanese_phonemizer.py similarity index 89% rename from TTS/tts/utils/text/japanese/phonemizer_test.py rename to tests/tts_tests/test_japanese_phonemizer.py index f07c0901..437042f0 100644 --- a/TTS/tts/utils/text/japanese/phonemizer_test.py +++ b/tests/tts_tests/test_japanese_phonemizer.py @@ -1,5 +1,5 @@ import unittest -from .phonemizer import japanese_text_to_phonemes +from TTS.tts.utils.text.japanese.phonemizer import japanese_text_to_phonemes _TEST_CASES = ''' どちらに行きますか?/dochiraniikimasuka? From c1eb9bdccacfb09356282557bae21885ecaa0dfa Mon Sep 17 00:00:00 2001 From: Alexander Korolev Date: Tue, 1 Jun 2021 15:15:26 +0200 Subject: [PATCH 22/36] fix speaker dim inference --- TTS/tts/models/tacotron2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index a5db64e9..525eb8b3 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -255,6 +255,7 @@ class Tacotron2(TacotronAbstract): if self.num_speakers > 1: if not self.embeddings_per_sample: speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None] + speaker_embeddings = torch.unsqueeze(speaker_embeddings, 0).transpose(1, 2) encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) decoder_outputs, alignments, stop_tokens = self.decoder.inference(encoder_outputs) From 6d8310d2a99de22e3537321acbf48f9b35b00b14 Mon Sep 17 00:00:00 2001 From: Katsuya Iida Date: Wed, 2 Jun 2021 07:48:28 +0900 Subject: [PATCH 23/36] Set the version to the same with the dev branch. 
--- TTS/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/_version.py b/TTS/_version.py index f4956698..311f216e 100644 --- a/TTS/_version.py +++ b/TTS/_version.py @@ -1 +1 @@ -__version__ = "0.0.14.1" +__version__ = "0.0.14" From 0c14278c306fa52408b487b10e75b6aa26525f91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 2 Jun 2021 11:40:26 +0200 Subject: [PATCH 24/36] reorg test files --- .../test_dataset_formatters.py} | 0 tests/{ => data_tests}/test_loader.py | 0 tests/{ => inference_tests}/test_synthesize.py | 0 tests/{ => inference_tests}/test_synthesizer.py | 0 tests/{test_audio.py => test_audio_processor.py} | 0 tests/{tts_tests => text_tests}/test_japanese_phonemizer.py | 0 tests/{ => text_tests}/test_symbols.py | 0 tests/{ => text_tests}/test_text_cleaners.py | 0 tests/{ => tts_tests}/test_feed_forward_layers.py | 0 tests/{ => tts_tests}/test_glow_tts.py | 0 tests/{ => tts_tests}/test_speedy_speech_layers.py | 0 tests/{ => tts_tests}/test_tacotron2_model.py | 0 tests/{ => tts_tests}/test_tacotron2_tf_model.py | 0 tests/{test_layers.py => tts_tests/test_tacotron_layers.py} | 0 tests/{ => tts_tests}/test_tacotron_model.py | 0 15 files changed, 0 insertions(+), 0 deletions(-) rename tests/{test_preprocessors.py => data_tests/test_dataset_formatters.py} (100%) rename tests/{ => data_tests}/test_loader.py (100%) rename tests/{ => inference_tests}/test_synthesize.py (100%) rename tests/{ => inference_tests}/test_synthesizer.py (100%) rename tests/{test_audio.py => test_audio_processor.py} (100%) rename tests/{tts_tests => text_tests}/test_japanese_phonemizer.py (100%) rename tests/{ => text_tests}/test_symbols.py (100%) rename tests/{ => text_tests}/test_text_cleaners.py (100%) rename tests/{ => tts_tests}/test_feed_forward_layers.py (100%) rename tests/{ => tts_tests}/test_glow_tts.py (100%) rename tests/{ => tts_tests}/test_speedy_speech_layers.py (100%) rename tests/{ => tts_tests}/test_tacotron2_model.py (100%) rename tests/{ => tts_tests}/test_tacotron2_tf_model.py (100%) rename tests/{test_layers.py => tts_tests/test_tacotron_layers.py} (100%) rename tests/{ => tts_tests}/test_tacotron_model.py (100%) diff --git a/tests/test_preprocessors.py b/tests/data_tests/test_dataset_formatters.py similarity index 100% rename from tests/test_preprocessors.py rename to tests/data_tests/test_dataset_formatters.py diff --git a/tests/test_loader.py b/tests/data_tests/test_loader.py similarity index 100% rename from tests/test_loader.py rename to tests/data_tests/test_loader.py diff --git a/tests/test_synthesize.py b/tests/inference_tests/test_synthesize.py similarity index 100% rename from tests/test_synthesize.py rename to tests/inference_tests/test_synthesize.py diff --git a/tests/test_synthesizer.py b/tests/inference_tests/test_synthesizer.py similarity index 100% rename from tests/test_synthesizer.py rename to tests/inference_tests/test_synthesizer.py diff --git a/tests/test_audio.py b/tests/test_audio_processor.py similarity index 100% rename from tests/test_audio.py rename to tests/test_audio_processor.py diff --git a/tests/tts_tests/test_japanese_phonemizer.py b/tests/text_tests/test_japanese_phonemizer.py similarity index 100% rename from tests/tts_tests/test_japanese_phonemizer.py rename to tests/text_tests/test_japanese_phonemizer.py diff --git a/tests/test_symbols.py b/tests/text_tests/test_symbols.py similarity index 100% rename from tests/test_symbols.py rename to tests/text_tests/test_symbols.py diff --git 
a/tests/test_text_cleaners.py b/tests/text_tests/test_text_cleaners.py similarity index 100% rename from tests/test_text_cleaners.py rename to tests/text_tests/test_text_cleaners.py diff --git a/tests/test_feed_forward_layers.py b/tests/tts_tests/test_feed_forward_layers.py similarity index 100% rename from tests/test_feed_forward_layers.py rename to tests/tts_tests/test_feed_forward_layers.py diff --git a/tests/test_glow_tts.py b/tests/tts_tests/test_glow_tts.py similarity index 100% rename from tests/test_glow_tts.py rename to tests/tts_tests/test_glow_tts.py diff --git a/tests/test_speedy_speech_layers.py b/tests/tts_tests/test_speedy_speech_layers.py similarity index 100% rename from tests/test_speedy_speech_layers.py rename to tests/tts_tests/test_speedy_speech_layers.py diff --git a/tests/test_tacotron2_model.py b/tests/tts_tests/test_tacotron2_model.py similarity index 100% rename from tests/test_tacotron2_model.py rename to tests/tts_tests/test_tacotron2_model.py diff --git a/tests/test_tacotron2_tf_model.py b/tests/tts_tests/test_tacotron2_tf_model.py similarity index 100% rename from tests/test_tacotron2_tf_model.py rename to tests/tts_tests/test_tacotron2_tf_model.py diff --git a/tests/test_layers.py b/tests/tts_tests/test_tacotron_layers.py similarity index 100% rename from tests/test_layers.py rename to tests/tts_tests/test_tacotron_layers.py diff --git a/tests/test_tacotron_model.py b/tests/tts_tests/test_tacotron_model.py similarity index 100% rename from tests/test_tacotron_model.py rename to tests/tts_tests/test_tacotron_model.py From 49c5e5d820e2413acb80c3b9004e5a5243d44f41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 2 Jun 2021 11:42:38 +0200 Subject: [PATCH 25/36] maket style japanese PR --- TTS/tts/datasets/preprocess.py | 2 +- TTS/tts/utils/text/japanese/phonemizer.py | 637 ++++++++++--------- tests/text_tests/test_japanese_phonemizer.py | 14 +- 3 files changed, 328 insertions(+), 325 deletions(-) diff --git a/TTS/tts/datasets/preprocess.py b/TTS/tts/datasets/preprocess.py index 271b1734..62cb9fef 100644 --- a/TTS/tts/datasets/preprocess.py +++ b/TTS/tts/datasets/preprocess.py @@ -434,7 +434,7 @@ def kokoro(root_path, meta_file): with open(txt_file, "r") as ttf: for line in ttf: cols = line.split("|") - wav_file = os.path.join(root_path, "wavs", cols[0] + '.wav') + wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav") text = cols[2].replace(" ", "") items.append([text, wav_file, speaker_name]) return items diff --git a/TTS/tts/utils/text/japanese/phonemizer.py b/TTS/tts/utils/text/japanese/phonemizer.py index f09d5b05..a4629a30 100644 --- a/TTS/tts/utils/text/japanese/phonemizer.py +++ b/TTS/tts/utils/text/japanese/phonemizer.py @@ -2,324 +2,321 @@ # compatible with Julius https://github.com/julius-speech/segmentation-kit import re + import MeCab _CONVRULES = [ # Conversion of 2 letters - 'アァ/ a a', - 'イィ/ i i', - 'イェ/ i e', - 'イャ/ y a', - 'ウゥ/ u:', - 'エェ/ e e', - 'オォ/ o:', - 'カァ/ k a:', - 'キィ/ k i:', - 'クゥ/ k u:', - 'クャ/ ky a', - 'クュ/ ky u', - 'クョ/ ky o', - 'ケェ/ k e:', - 'コォ/ k o:', - 'ガァ/ g a:', - 'ギィ/ g i:', - 'グゥ/ g u:', - 'グャ/ gy a', - 'グュ/ gy u', - 'グョ/ gy o', - 'ゲェ/ g e:', - 'ゴォ/ g o:', - 'サァ/ s a:', - 'シィ/ sh i:', - 'スゥ/ s u:', - 'スャ/ sh a', - 'スュ/ sh u', - 'スョ/ sh o', - 'セェ/ s e:', - 'ソォ/ s o:', - 'ザァ/ z a:', - 'ジィ/ j i:', - 'ズゥ/ z u:', - 'ズャ/ zy a', - 'ズュ/ zy u', - 'ズョ/ zy o', - 'ゼェ/ z e:', - 'ゾォ/ z o:', - 'タァ/ t a:', - 'チィ/ ch i:', - 'ツァ/ ts a', - 'ツィ/ ts i', - 'ツゥ/ ts u:', - 'ツャ/ ch a', - 'ツュ/ ch u', - 'ツョ/ ch o', - 'ツェ/ 
ts e', - 'ツォ/ ts o', - 'テェ/ t e:', - 'トォ/ t o:', - 'ダァ/ d a:', - 'ヂィ/ j i:', - 'ヅゥ/ d u:', - 'ヅャ/ zy a', - 'ヅュ/ zy u', - 'ヅョ/ zy o', - 'デェ/ d e:', - 'ドォ/ d o:', - 'ナァ/ n a:', - 'ニィ/ n i:', - 'ヌゥ/ n u:', - 'ヌャ/ ny a', - 'ヌュ/ ny u', - 'ヌョ/ ny o', - 'ネェ/ n e:', - 'ノォ/ n o:', - 'ハァ/ h a:', - 'ヒィ/ h i:', - 'フゥ/ f u:', - 'フャ/ hy a', - 'フュ/ hy u', - 'フョ/ hy o', - 'ヘェ/ h e:', - 'ホォ/ h o:', - 'バァ/ b a:', - 'ビィ/ b i:', - 'ブゥ/ b u:', - 'フャ/ hy a', - 'ブュ/ by u', - 'フョ/ hy o', - 'ベェ/ b e:', - 'ボォ/ b o:', - 'パァ/ p a:', - 'ピィ/ p i:', - 'プゥ/ p u:', - 'プャ/ py a', - 'プュ/ py u', - 'プョ/ py o', - 'ペェ/ p e:', - 'ポォ/ p o:', - 'マァ/ m a:', - 'ミィ/ m i:', - 'ムゥ/ m u:', - 'ムャ/ my a', - 'ムュ/ my u', - 'ムョ/ my o', - 'メェ/ m e:', - 'モォ/ m o:', - 'ヤァ/ y a:', - 'ユゥ/ y u:', - 'ユャ/ y a:', - 'ユュ/ y u:', - 'ユョ/ y o:', - 'ヨォ/ y o:', - 'ラァ/ r a:', - 'リィ/ r i:', - 'ルゥ/ r u:', - 'ルャ/ ry a', - 'ルュ/ ry u', - 'ルョ/ ry o', - 'レェ/ r e:', - 'ロォ/ r o:', - 'ワァ/ w a:', - 'ヲォ/ o:', - 'ディ/ d i', - 'デェ/ d e:', - 'デャ/ dy a', - 'デュ/ dy u', - 'デョ/ dy o', - 'ティ/ t i', - 'テェ/ t e:', - 'テャ/ ty a', - 'テュ/ ty u', - 'テョ/ ty o', - 'スィ/ s i', - 'ズァ/ z u a', - 'ズィ/ z i', - 'ズゥ/ z u', - 'ズャ/ zy a', - 'ズュ/ zy u', - 'ズョ/ zy o', - 'ズェ/ z e', - 'ズォ/ z o', - 'キャ/ ky a', - 'キュ/ ky u', - 'キョ/ ky o', - 'シャ/ sh a', - 'シュ/ sh u', - 'シェ/ sh e', - 'ショ/ sh o', - 'チャ/ ch a', - 'チュ/ ch u', - 'チェ/ ch e', - 'チョ/ ch o', - 'トゥ/ t u', - 'トャ/ ty a', - 'トュ/ ty u', - 'トョ/ ty o', - 'ドァ/ d o a', - 'ドゥ/ d u', - 'ドャ/ dy a', - 'ドュ/ dy u', - 'ドョ/ dy o', - 'ドォ/ d o:', - 'ニャ/ ny a', - 'ニュ/ ny u', - 'ニョ/ ny o', - 'ヒャ/ hy a', - 'ヒュ/ hy u', - 'ヒョ/ hy o', - 'ミャ/ my a', - 'ミュ/ my u', - 'ミョ/ my o', - 'リャ/ ry a', - 'リュ/ ry u', - 'リョ/ ry o', - 'ギャ/ gy a', - 'ギュ/ gy u', - 'ギョ/ gy o', - 'ヂェ/ j e', - 'ヂャ/ j a', - 'ヂュ/ j u', - 'ヂョ/ j o', - 'ジェ/ j e', - 'ジャ/ j a', - 'ジュ/ j u', - 'ジョ/ j o', - 'ビャ/ by a', - 'ビュ/ by u', - 'ビョ/ by o', - 'ピャ/ py a', - 'ピュ/ py u', - 'ピョ/ py o', - 'ウァ/ u a', - 'ウィ/ w i', - 'ウェ/ w e', - 'ウォ/ w o', - 'ファ/ f a', - 'フィ/ f i', - 'フゥ/ f u', - 'フャ/ hy a', - 'フュ/ hy u', - 'フョ/ hy o', - 'フェ/ f e', - 'フォ/ f o', - 'ヴァ/ b a', - 'ヴィ/ b i', - 'ヴェ/ b e', - 'ヴォ/ b o', - 'ヴュ/ by u', - + "アァ/ a a", + "イィ/ i i", + "イェ/ i e", + "イャ/ y a", + "ウゥ/ u:", + "エェ/ e e", + "オォ/ o:", + "カァ/ k a:", + "キィ/ k i:", + "クゥ/ k u:", + "クャ/ ky a", + "クュ/ ky u", + "クョ/ ky o", + "ケェ/ k e:", + "コォ/ k o:", + "ガァ/ g a:", + "ギィ/ g i:", + "グゥ/ g u:", + "グャ/ gy a", + "グュ/ gy u", + "グョ/ gy o", + "ゲェ/ g e:", + "ゴォ/ g o:", + "サァ/ s a:", + "シィ/ sh i:", + "スゥ/ s u:", + "スャ/ sh a", + "スュ/ sh u", + "スョ/ sh o", + "セェ/ s e:", + "ソォ/ s o:", + "ザァ/ z a:", + "ジィ/ j i:", + "ズゥ/ z u:", + "ズャ/ zy a", + "ズュ/ zy u", + "ズョ/ zy o", + "ゼェ/ z e:", + "ゾォ/ z o:", + "タァ/ t a:", + "チィ/ ch i:", + "ツァ/ ts a", + "ツィ/ ts i", + "ツゥ/ ts u:", + "ツャ/ ch a", + "ツュ/ ch u", + "ツョ/ ch o", + "ツェ/ ts e", + "ツォ/ ts o", + "テェ/ t e:", + "トォ/ t o:", + "ダァ/ d a:", + "ヂィ/ j i:", + "ヅゥ/ d u:", + "ヅャ/ zy a", + "ヅュ/ zy u", + "ヅョ/ zy o", + "デェ/ d e:", + "ドォ/ d o:", + "ナァ/ n a:", + "ニィ/ n i:", + "ヌゥ/ n u:", + "ヌャ/ ny a", + "ヌュ/ ny u", + "ヌョ/ ny o", + "ネェ/ n e:", + "ノォ/ n o:", + "ハァ/ h a:", + "ヒィ/ h i:", + "フゥ/ f u:", + "フャ/ hy a", + "フュ/ hy u", + "フョ/ hy o", + "ヘェ/ h e:", + "ホォ/ h o:", + "バァ/ b a:", + "ビィ/ b i:", + "ブゥ/ b u:", + "フャ/ hy a", + "ブュ/ by u", + "フョ/ hy o", + "ベェ/ b e:", + "ボォ/ b o:", + "パァ/ p a:", + "ピィ/ p i:", + "プゥ/ p u:", + "プャ/ py a", + "プュ/ py u", + "プョ/ py o", + "ペェ/ p e:", + "ポォ/ p o:", + "マァ/ m a:", + "ミィ/ m i:", + "ムゥ/ m u:", + "ムャ/ my a", + "ムュ/ my u", + "ムョ/ my o", + "メェ/ m e:", + "モォ/ m o:", + "ヤァ/ y a:", + "ユゥ/ y u:", + "ユャ/ y a:", + "ユュ/ y 
u:", + "ユョ/ y o:", + "ヨォ/ y o:", + "ラァ/ r a:", + "リィ/ r i:", + "ルゥ/ r u:", + "ルャ/ ry a", + "ルュ/ ry u", + "ルョ/ ry o", + "レェ/ r e:", + "ロォ/ r o:", + "ワァ/ w a:", + "ヲォ/ o:", + "ディ/ d i", + "デェ/ d e:", + "デャ/ dy a", + "デュ/ dy u", + "デョ/ dy o", + "ティ/ t i", + "テェ/ t e:", + "テャ/ ty a", + "テュ/ ty u", + "テョ/ ty o", + "スィ/ s i", + "ズァ/ z u a", + "ズィ/ z i", + "ズゥ/ z u", + "ズャ/ zy a", + "ズュ/ zy u", + "ズョ/ zy o", + "ズェ/ z e", + "ズォ/ z o", + "キャ/ ky a", + "キュ/ ky u", + "キョ/ ky o", + "シャ/ sh a", + "シュ/ sh u", + "シェ/ sh e", + "ショ/ sh o", + "チャ/ ch a", + "チュ/ ch u", + "チェ/ ch e", + "チョ/ ch o", + "トゥ/ t u", + "トャ/ ty a", + "トュ/ ty u", + "トョ/ ty o", + "ドァ/ d o a", + "ドゥ/ d u", + "ドャ/ dy a", + "ドュ/ dy u", + "ドョ/ dy o", + "ドォ/ d o:", + "ニャ/ ny a", + "ニュ/ ny u", + "ニョ/ ny o", + "ヒャ/ hy a", + "ヒュ/ hy u", + "ヒョ/ hy o", + "ミャ/ my a", + "ミュ/ my u", + "ミョ/ my o", + "リャ/ ry a", + "リュ/ ry u", + "リョ/ ry o", + "ギャ/ gy a", + "ギュ/ gy u", + "ギョ/ gy o", + "ヂェ/ j e", + "ヂャ/ j a", + "ヂュ/ j u", + "ヂョ/ j o", + "ジェ/ j e", + "ジャ/ j a", + "ジュ/ j u", + "ジョ/ j o", + "ビャ/ by a", + "ビュ/ by u", + "ビョ/ by o", + "ピャ/ py a", + "ピュ/ py u", + "ピョ/ py o", + "ウァ/ u a", + "ウィ/ w i", + "ウェ/ w e", + "ウォ/ w o", + "ファ/ f a", + "フィ/ f i", + "フゥ/ f u", + "フャ/ hy a", + "フュ/ hy u", + "フョ/ hy o", + "フェ/ f e", + "フォ/ f o", + "ヴァ/ b a", + "ヴィ/ b i", + "ヴェ/ b e", + "ヴォ/ b o", + "ヴュ/ by u", # Conversion of 1 letter - 'ア/ a', - 'イ/ i', - 'ウ/ u', - 'エ/ e', - 'オ/ o', - 'カ/ k a', - 'キ/ k i', - 'ク/ k u', - 'ケ/ k e', - 'コ/ k o', - 'サ/ s a', - 'シ/ sh i', - 'ス/ s u', - 'セ/ s e', - 'ソ/ s o', - 'タ/ t a', - 'チ/ ch i', - 'ツ/ ts u', - 'テ/ t e', - 'ト/ t o', - 'ナ/ n a', - 'ニ/ n i', - 'ヌ/ n u', - 'ネ/ n e', - 'ノ/ n o', - 'ハ/ h a', - 'ヒ/ h i', - 'フ/ f u', - 'ヘ/ h e', - 'ホ/ h o', - 'マ/ m a', - 'ミ/ m i', - 'ム/ m u', - 'メ/ m e', - 'モ/ m o', - 'ラ/ r a', - 'リ/ r i', - 'ル/ r u', - 'レ/ r e', - 'ロ/ r o', - 'ガ/ g a', - 'ギ/ g i', - 'グ/ g u', - 'ゲ/ g e', - 'ゴ/ g o', - 'ザ/ z a', - 'ジ/ j i', - 'ズ/ z u', - 'ゼ/ z e', - 'ゾ/ z o', - 'ダ/ d a', - 'ヂ/ j i', - 'ヅ/ z u', - 'デ/ d e', - 'ド/ d o', - 'バ/ b a', - 'ビ/ b i', - 'ブ/ b u', - 'ベ/ b e', - 'ボ/ b o', - 'パ/ p a', - 'ピ/ p i', - 'プ/ p u', - 'ペ/ p e', - 'ポ/ p o', - 'ヤ/ y a', - 'ユ/ y u', - 'ヨ/ y o', - 'ワ/ w a', - 'ヰ/ i', - 'ヱ/ e', - 'ヲ/ o', - 'ン/ N', - 'ッ/ q', - 'ヴ/ b u', - 'ー/:', - + "ア/ a", + "イ/ i", + "ウ/ u", + "エ/ e", + "オ/ o", + "カ/ k a", + "キ/ k i", + "ク/ k u", + "ケ/ k e", + "コ/ k o", + "サ/ s a", + "シ/ sh i", + "ス/ s u", + "セ/ s e", + "ソ/ s o", + "タ/ t a", + "チ/ ch i", + "ツ/ ts u", + "テ/ t e", + "ト/ t o", + "ナ/ n a", + "ニ/ n i", + "ヌ/ n u", + "ネ/ n e", + "ノ/ n o", + "ハ/ h a", + "ヒ/ h i", + "フ/ f u", + "ヘ/ h e", + "ホ/ h o", + "マ/ m a", + "ミ/ m i", + "ム/ m u", + "メ/ m e", + "モ/ m o", + "ラ/ r a", + "リ/ r i", + "ル/ r u", + "レ/ r e", + "ロ/ r o", + "ガ/ g a", + "ギ/ g i", + "グ/ g u", + "ゲ/ g e", + "ゴ/ g o", + "ザ/ z a", + "ジ/ j i", + "ズ/ z u", + "ゼ/ z e", + "ゾ/ z o", + "ダ/ d a", + "ヂ/ j i", + "ヅ/ z u", + "デ/ d e", + "ド/ d o", + "バ/ b a", + "ビ/ b i", + "ブ/ b u", + "ベ/ b e", + "ボ/ b o", + "パ/ p a", + "ピ/ p i", + "プ/ p u", + "ペ/ p e", + "ポ/ p o", + "ヤ/ y a", + "ユ/ y u", + "ヨ/ y o", + "ワ/ w a", + "ヰ/ i", + "ヱ/ e", + "ヲ/ o", + "ン/ N", + "ッ/ q", + "ヴ/ b u", + "ー/:", # Try converting broken text - 'ァ/ a', - 'ィ/ i', - 'ゥ/ u', - 'ェ/ e', - 'ォ/ o', - 'ヮ/ w a', - 'ォ/ o', - + "ァ/ a", + "ィ/ i", + "ゥ/ u", + "ェ/ e", + "ォ/ o", + "ヮ/ w a", + "ォ/ o", # Symbols - '、/ ,', - '。/ .', - '!/ !', - '?/ ?', - '・/ ,' + "、/ ,", + "。/ .", + "!/ !", + "?/ ?", + "・/ ,", ] -_COLON_RX = re.compile(':+') -_REJECT_RX = re.compile('[^ a-zA-Z:,.?]') +_COLON_RX = re.compile(":+") 
+_REJECT_RX = re.compile("[^ a-zA-Z:,.?]") + def _makerulemap(): - l = [tuple(x.split('/')) for x in _CONVRULES] - return tuple( - {k: v for k, v in l if len(k) == i} - for i in (1, 2) - ) + l = [tuple(x.split("/")) for x in _CONVRULES] + return tuple({k: v for k, v in l if len(k) == i} for i in (1, 2)) + _RULEMAP1, _RULEMAP2 = _makerulemap() + def kata2phoneme(text: str) -> str: - """Convert katakana text to phonemes. - """ + """Convert katakana text to phonemes.""" text = text.strip() - res = '' + res = "" while text: if len(text) >= 2: x = _RULEMAP2.get(text[:2]) @@ -332,30 +329,34 @@ def kata2phoneme(text: str) -> str: text = text[1:] res += x continue - res += ' ' + text[0] + res += " " + text[0] text = text[1:] - res = _COLON_RX.sub(':', res) + res = _COLON_RX.sub(":", res) return res[1:] -_KATAKANA = ''.join(chr(ch) for ch in range(ord('ァ'), ord('ン') + 1)) -_HIRAGANA = ''.join(chr(ch) for ch in range(ord('ぁ'), ord('ん') + 1)) + +_KATAKANA = "".join(chr(ch) for ch in range(ord("ァ"), ord("ン") + 1)) +_HIRAGANA = "".join(chr(ch) for ch in range(ord("ぁ"), ord("ん") + 1)) _HIRA2KATATRANS = str.maketrans(_HIRAGANA, _KATAKANA) + def hira2kata(text: str) -> str: text = text.translate(_HIRA2KATATRANS) - return text.replace('う゛', 'ヴ') + return text.replace("う゛", "ヴ") -_SYMBOL_TOKENS = set(list('・、。?!')) -_NO_YOMI_TOKENS = set(list('「」『』―()[][] …')) + +_SYMBOL_TOKENS = set(list("・、。?!")) +_NO_YOMI_TOKENS = set(list("「」『』―()[][] …")) _TAGGER = MeCab.Tagger() + def text2kata(text: str) -> str: parsed = _TAGGER.parse(text) res = [] - for line in parsed.split('\n'): - if line == 'EOS': + for line in parsed.split("\n"): + if line == "EOS": break - parts = line.split('\t') + parts = line.split("\t") word, yomi = parts[0], parts[1] if yomi: @@ -363,17 +364,17 @@ def text2kata(text: str) -> str: else: if word in _SYMBOL_TOKENS: res.append(word) - elif word in ('っ', 'ッ'): - res.append('ッ') + elif word in ("っ", "ッ"): + res.append("ッ") elif word in _NO_YOMI_TOKENS: pass else: res.append(word) - return hira2kata(''.join(res)) + return hira2kata("".join(res)) + def japanese_text_to_phonemes(text: str) -> str: - """Convert Japanese text to phonemes. - """ + """Convert Japanese text to phonemes.""" res = text2kata(text) res = kata2phoneme(res) - return res.replace(' ', '') + return res.replace(" ", "") diff --git a/tests/text_tests/test_japanese_phonemizer.py b/tests/text_tests/test_japanese_phonemizer.py index 437042f0..b3b1ece3 100644 --- a/tests/text_tests/test_japanese_phonemizer.py +++ b/tests/text_tests/test_japanese_phonemizer.py @@ -1,7 +1,8 @@ import unittest + from TTS.tts.utils.text.japanese.phonemizer import japanese_text_to_phonemes -_TEST_CASES = ''' +_TEST_CASES = """ どちらに行きますか?/dochiraniikimasuka? 今日は温泉に、行きます。/kyo:waoNseNni,ikimasu. 「A」から「Z」までです。/AkaraZmadedesu. @@ -9,14 +10,15 @@ _TEST_CASES = ''' クジラは哺乳類です。/kujirawahonyu:ruidesu. ヴィディオを見ます。/bidioomimasu. ky o: w a o N s e N n i , i k i m a s u ./kyo:waoNseNni,ikimasu. 
-''' +""" + class TestText(unittest.TestCase): - def test_japanese_text_to_phonemes(self): - for line in _TEST_CASES.strip().split('\n'): - text, phone = line.split('/') + for line in _TEST_CASES.strip().split("\n"): + text, phone = line.split("/") self.assertEqual(japanese_text_to_phonemes(text), phone) -if __name__ == '__main__': + +if __name__ == "__main__": unittest.main() From 4baa59d73214d4871b4aa2262e28d0ba3f149d5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 2 Jun 2021 11:42:56 +0200 Subject: [PATCH 26/36] comment `requirements.txt` for japanese deps --- requirements.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index ab828503..fde48978 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,5 +19,6 @@ numba==0.52 umap-learn==0.4.6 anyascii coqpit -mecab-python3 -unidic-lite +# japanese g2p deps +mecab-python3==1.0.3 +unidic-lite==1.0.8 From db48c69f0f71139ba5cc6f2fe59c2165b6873fbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 2 Jun 2021 11:43:27 +0200 Subject: [PATCH 27/36] reduce fullband melgan model size for testing --- tests/vocoder_tests/test_fullband_melgan_train.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/vocoder_tests/test_fullband_melgan_train.py b/tests/vocoder_tests/test_fullband_melgan_train.py index d9bc51ac..2b286b91 100644 --- a/tests/vocoder_tests/test_fullband_melgan_train.py +++ b/tests/vocoder_tests/test_fullband_melgan_train.py @@ -20,6 +20,7 @@ config = FullbandMelganConfig( eval_split_size=1, print_step=1, print_eval=True, + discriminator_model_params={"base_channels": 16, "max_channels": 256, "downsample_factors": [4, 4, 4]}, data_path="tests/data/ljspeech", output_path=output_path, ) From 401fbd8978862346f35e8b0d0206a2c8c8abd75b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 2 Jun 2021 11:48:17 +0200 Subject: [PATCH 28/36] bump up to v0.0.15 --- TTS/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/_version.py b/TTS/_version.py index 311f216e..6561790f 100644 --- a/TTS/_version.py +++ b/TTS/_version.py @@ -1 +1 @@ -__version__ = "0.0.14" +__version__ = "0.0.15" From bd434636a9774e285db046569707c5846ba8ba2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 2 Jun 2021 15:54:37 +0200 Subject: [PATCH 29/36] new japanese model placeholder in `.models.json` --- TTS/.models.json | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/TTS/.models.json b/TTS/.models.json index b926f120..aed546f2 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -149,6 +149,17 @@ "needs_phonemizer": true } } + }, + "jp":{ + "kokoro":{ + "tacotron2-DDC":{ + "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.15/tts_models--jp--kokoro--tacotron2-DDC.zip", + "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan", + "author": "@kaiidams", + "commit": "401fbd89", + "needs_phonemizer": false + } + } } }, "vocoder_models":{ From e66753bd0dc2219c4bac42cb74cb264121296a0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 3 Jun 2021 18:04:28 +0200 Subject: [PATCH 30/36] fixup! 
new japanese model placeholder in `.models.json` --- TTS/.models.json | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/TTS/.models.json b/TTS/.models.json index aed546f2..310dc5f0 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -150,11 +150,12 @@ } } }, - "jp":{ + "ja":{ "kokoro":{ "tacotron2-DDC":{ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.15/tts_models--jp--kokoro--tacotron2-DDC.zip", - "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan", + "default_vocoder": "vocoder_models/universal/libri-tts/wavegrad", + "description": "Tacotron2 with Double Decoder Consistency trained with Kokoro Speech Dataset.", "author": "@kaiidams", "commit": "401fbd89", "needs_phonemizer": false From ba9bcf7c6bdd39d8fbbeef5d34e8c02313569a80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 4 Jun 2021 10:10:51 +0200 Subject: [PATCH 31/36] auto upload to pypi on release --- .github/workflows/pypi-release.yml | 38 +++++++++++++ TTS/VERSION | 1 + TTS/__init__.py | 8 ++- TTS/_version.py | 1 - setup.py | 91 +++++++++++++++--------------- 5 files changed, 90 insertions(+), 49 deletions(-) create mode 100644 .github/workflows/pypi-release.yml create mode 100644 TTS/VERSION delete mode 100644 TTS/_version.py diff --git a/.github/workflows/pypi-release.yml b/.github/workflows/pypi-release.yml new file mode 100644 index 00000000..d31e71cf --- /dev/null +++ b/.github/workflows/pypi-release.yml @@ -0,0 +1,38 @@ +name: Publish Python 🐍 distributions 📦 to PyPI +on: + release: + types: [published] +defaults: + run: + shell: + bash +jobs: + build-package: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v2 + - name: Verify tag matches version + run: | + set -ex + version=$(cat TTS/VERSION) + tag="${GITHUB_REF/refs\/tags\/}" + if [[ "v$version" != "$tag" ]]; then + exit 1 + fi + - uses: actions/setup-python@v2 + with: + python-version: 3.8 + - run: | + python -m pip install -U pip setuptools twine toml + python -c 'import toml; c = toml.load("pyproject.toml"); print("\n".join(c["build-system"]["requires"]))' | pip install -r /dev/stdin + - run: | + python setup.py sdist + - name: Setup PyPI config + run: | + cat << EOF > ~/.pypirc + [pypi] + username=__token__ + password=${{ secrets.PYPI_TOKEN }} + EOF + - run: | + twine upload --repository pypi dist/*.tar.gz diff --git a/TTS/VERSION b/TTS/VERSION new file mode 100644 index 00000000..13511bd9 --- /dev/null +++ b/TTS/VERSION @@ -0,0 +1 @@ +0.0.14.1-alpha.2 diff --git a/TTS/__init__.py b/TTS/__init__.py index 8dee4bf8..da35faf8 100644 --- a/TTS/__init__.py +++ b/TTS/__init__.py @@ -1 +1,7 @@ -from ._version import __version__ +import os + + +with open(os.path.join(os.path.dirname(__file__), "VERSION")) as f: + version = f.read().strip() + +__version__ = version diff --git a/TTS/_version.py b/TTS/_version.py deleted file mode 100644 index 6561790f..00000000 --- a/TTS/_version.py +++ /dev/null @@ -1 +0,0 @@ -__version__ = "0.0.15" diff --git a/setup.py b/setup.py index a68b09e0..7cfb6519 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,6 @@ import os import subprocess import sys from distutils.version import LooseVersion -from TTS._version import __version__ import numpy import setuptools.command.build_py @@ -12,82 +11,85 @@ import setuptools.command.develop from Cython.Build import cythonize from setuptools import Extension, find_packages, setup + if LooseVersion(sys.version) < LooseVersion("3.6") or LooseVersion(sys.version) > LooseVersion("3.9"): - raise RuntimeError( 
- "TTS requires python >= 3.6 and <3.9 " - "but your Python version is {}".format(sys.version) - ) + raise RuntimeError("TTS requires python >= 3.6 and <3.9 " "but your Python version is {}".format(sys.version)) -version = __version__ cwd = os.path.dirname(os.path.abspath(__file__)) +cwd = os.path.dirname(os.path.abspath(__file__)) +with open(os.path.join(cwd, "TTS", "VERSION")) as fin: + version = fin.read().strip() + + class build_py(setuptools.command.build_py.build_py): # pylint: disable=too-many-ancestors def run(self): - self.create_version_file() setuptools.command.build_py.build_py.run(self) - @staticmethod - def create_version_file(): - print('-- Building version ' + version) - version_path = os.path.join(cwd, 'version.py') - with open(version_path, 'w') as f: - f.write("__version__ = '{}'\n".format(version)) class develop(setuptools.command.develop.develop): def run(self): - build_py.create_version_file() setuptools.command.develop.develop.run(self) # The documentation for this feature is in server/README.md -package_data = ['TTS/server/templates/*'] +package_data = ["TTS/server/templates/*"] def pip_install(package_name): - subprocess.call([sys.executable, '-m', 'pip', 'install', package_name]) + subprocess.call([sys.executable, "-m", "pip", "install", package_name]) -requirements = open(os.path.join(cwd, 'requirements.txt'), 'r').readlines() -with open(os.path.join(cwd, 'requirements.notebooks.txt'), 'r') as f: +requirements = open(os.path.join(cwd, "requirements.txt"), "r").readlines() +with open(os.path.join(cwd, "requirements.notebooks.txt"), "r") as f: requirements_notebooks = f.readlines() -with open(os.path.join(cwd, 'requirements.dev.txt'), 'r') as f: +with open(os.path.join(cwd, "requirements.dev.txt"), "r") as f: requirements_dev = f.readlines() -with open(os.path.join(cwd, 'requirements.tf.txt'), 'r') as f: +with open(os.path.join(cwd, "requirements.tf.txt"), "r") as f: requirements_tf = f.readlines() requirements_all = requirements_dev + requirements_notebooks + requirements_tf -with open('README.md', "r", encoding="utf-8") as readme_file: +with open("README.md", "r", encoding="utf-8") as readme_file: README = readme_file.read() -exts = [Extension(name='TTS.tts.layers.glow_tts.monotonic_align.core', - sources=["TTS/tts/layers/glow_tts/monotonic_align/core.pyx"])] +exts = [ + Extension( + name="TTS.tts.layers.glow_tts.monotonic_align.core", + sources=["TTS/tts/layers/glow_tts/monotonic_align/core.pyx"], + ) +] setup( - name='TTS', + name="TTS", version=version, - url='https://github.com/coqui-ai/TTS', - author='Eren Gölge', - author_email='egolge@coqui.ai', - description='Deep learning for Text to Speech by Coqui.', + url="https://github.com/coqui-ai/TTS", + author="Eren Gölge", + author_email="egolge@coqui.ai", + description="Deep learning for Text to Speech by Coqui.", long_description=README, long_description_content_type="text/markdown", - license='MPL-2.0', + license="MPL-2.0", # cython include_dirs=numpy.get_include(), ext_modules=cythonize(exts, language_level=3), # ext_modules=find_cython_extensions(), # package include_package_data=True, - packages=find_packages(include=['TTS*']), + packages=find_packages(include=["TTS*"]), + package_data={ + "TTS": [ + "VERSION", + ] + }, project_urls={ - 'Documentation': 'https://github.com/coqui-ai/TTS/wiki', - 'Tracker': 'https://github.com/coqui-ai/TTS/issues', - 'Repository': 'https://github.com/coqui-ai/TTS', - 'Discussions': 'https://github.com/coqui-ai/TTS/discussions', + "Documentation": 
"https://github.com/coqui-ai/TTS/wiki", + "Tracker": "https://github.com/coqui-ai/TTS/issues", + "Repository": "https://github.com/coqui-ai/TTS", + "Discussions": "https://github.com/coqui-ai/TTS/discussions", }, cmdclass={ - 'build_py': build_py, - 'develop': develop, + "build_py": build_py, + "develop": develop, # 'build_ext': build_ext }, install_requires=requirements, @@ -97,30 +99,25 @@ setup( "notebooks": requirements_notebooks, "tf": requirements_tf, }, - python_requires='>=3.6.0, <3.9', - entry_points={ - 'console_scripts': [ - 'tts=TTS.bin.synthesize:main', - 'tts-server = TTS.server.server:main' - ] - }, + python_requires=">=3.6.0, <3.9", + entry_points={"console_scripts": ["tts=TTS.bin.synthesize:main", "tts-server = TTS.server.server:main"]}, classifiers=[ "Programming Language :: Python", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", - 'Development Status :: 3 - Alpha', + "Development Status :: 3 - Alpha", "Intended Audience :: Science/Research", "Intended Audience :: Developers", "Operating System :: POSIX :: Linux", - 'License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)', + "License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)", "Topic :: Software Development", "Topic :: Software Development :: Libraries :: Python Modules", "Topic :: Multimedia :: Sound/Audio :: Speech", "Topic :: Multimedia :: Sound/Audio", "Topic :: Multimedia", - "Topic :: Scientific/Engineering :: Artificial Intelligence" + "Topic :: Scientific/Engineering :: Artificial Intelligence", ], - zip_safe=False + zip_safe=False, ) From 203ab855c316198b084bc7a50fdbccc31e021769 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 4 Jun 2021 13:52:54 +0200 Subject: [PATCH 32/36] bump up to v0.0.15 --- TTS/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/VERSION b/TTS/VERSION index 13511bd9..ceddfb28 100644 --- a/TTS/VERSION +++ b/TTS/VERSION @@ -1 +1 @@ -0.0.14.1-alpha.2 +0.0.15 From b8b79a5e5a1f175680a63539cd4235f18cb3ead8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 4 Jun 2021 14:02:53 +0200 Subject: [PATCH 33/36] fix `use_cuda` bug in `server.py` --- TTS/server/server.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/TTS/server/server.py b/TTS/server/server.py index 15a6b292..dc025b32 100644 --- a/TTS/server/server.py +++ b/TTS/server/server.py @@ -99,7 +99,9 @@ if args.vocoder_path is not None: vocoder_config_path = args.vocoder_config_path # load models -synthesizer = Synthesizer(model_path, config_path, speakers_file_path, vocoder_path, vocoder_config_path, args.use_cuda) +synthesizer = Synthesizer( + model_path, config_path, speakers_file_path, vocoder_path, vocoder_config_path, use_cuda=args.use_cuda +) use_multi_speaker = synthesizer.speaker_manager is not None # TODO: set this from SpeakerManager From ed6e109aecad4a34576ac5288ca4cf57cf0e7141 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 8 Jun 2021 09:17:05 +0200 Subject: [PATCH 34/36] add missing VERSION to manifest --- MANIFEST.in | 1 + 1 file changed, 1 insertion(+) diff --git a/MANIFEST.in b/MANIFEST.in index 664295c7..861cb5a7 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,6 +1,7 @@ include README.md include LICENSE.txt include requirements.*.txt +include TTS/VERSION recursive-include TTS *.json recursive-include TTS *.html recursive-include TTS *.png From 
From b0aa18934870cb0120703346c766325af81135bc Mon Sep 17 00:00:00 2001
From: Adam Froghyar
Date: Mon, 14 Jun 2021 10:44:00 +0200
Subject: [PATCH 35/36] Forcing do_trim_silence to False in the extract TTS
 script

---
 TTS/bin/extract_tts_spectrograms.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py
index ace7464a..4eb79d76 100755
--- a/TTS/bin/extract_tts_spectrograms.py
+++ b/TTS/bin/extract_tts_spectrograms.py
@@ -299,4 +299,5 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     c = load_config(args.config_path)
+    c.audio['do_trim_silence'] = False  # IMPORTANT!!!!!!!!!!!!!!! disable to align mel
     main(args)

From d85ee901d57b4a08301ef569d3c48dd032508ff7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Tue, 15 Jun 2021 10:53:53 +0200
Subject: [PATCH 36/36] Fix #571

---
 TTS/bin/extract_tts_spectrograms.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py
index ace7464a..2be9d760 100755
--- a/TTS/bin/extract_tts_spectrograms.py
+++ b/TTS/bin/extract_tts_spectrograms.py
@@ -299,4 +299,5 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     c = load_config(args.config_path)
+    c.audio['do_trim_silence'] = False
     main(args)
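Both of the last two patches pin do_trim_silence to False for the same reason: a mel spectrogram has roughly one frame per hop_length samples, so trimming leading or trailing silence changes the audio length and the extracted mels stop lining up with the untrimmed wavs used downstream (e.g. for vocoder fine-tuning). A back-of-the-envelope check, assuming the usual LJSpeech settings (22050 Hz, hop_length=256); the exact frame count depends on the STFT's padding mode:

    import math

    def mel_frame_count(num_samples, hop_length=256):
        # One frame per hop; ceil is a common approximation, the exact
        # count depends on the STFT's centering/padding behaviour.
        return math.ceil(num_samples / hop_length)

    full = mel_frame_count(5 * 22050)            # 5.0 s clip    -> 431 frames
    trimmed = mel_frame_count(int(4.5 * 22050))  # 0.5 s trimmed -> 388 frames
    print(full - trimmed)  # 43 frames of drift from half a second of silence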