From 40f44757233fe3c80e3ea73e1bfa033cc2d375bf Mon Sep 17 00:00:00 2001 From: Adonis Pujols Date: Thu, 11 Feb 2021 05:26:06 -0500 Subject: [PATCH 1/6] add encoding="utf-8" --- TTS/utils/io.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/TTS/utils/io.py b/TTS/utils/io.py index 1148e0fe..30b7b7e2 100644 --- a/TTS/utils/io.py +++ b/TTS/utils/io.py @@ -22,7 +22,7 @@ class AttrDict(dict): def read_json_with_comments(json_path): # fallback to json - with open(json_path, "r") as f: + with open(json_path, "r", encoding = "utf-8") as f: input_str = f.read() # handle comments input_str = re.sub(r'\\\n', '', input_str) @@ -40,7 +40,7 @@ def load_config(config_path: str) -> AttrDict: ext = os.path.splitext(config_path)[1] if ext in (".yml", ".yaml"): - with open(config_path, "r") as f: + with open(config_path, "r", encoding = "utf-8") as f: data = yaml.safe_load(f) else: data = read_json_with_comments(config_path) @@ -61,7 +61,7 @@ def copy_model_files(c, config_file, out_path, new_fields): """ # copy config.json copy_config_path = os.path.join(out_path, 'config.json') - config_lines = open(config_file, "r").readlines() + config_lines = open(config_file, "r", encoding = "utf-8").readlines() # add extra information fields for key, value in new_fields.items(): if isinstance(value, str): From 9cb02aeea78826d8085b04eb34c237ab4c006264 Mon Sep 17 00:00:00 2001 From: kirianguiller Date: Mon, 15 Feb 2021 16:04:47 +0100 Subject: [PATCH 2/6] Chinese mandarin implementation (tacotron2) --- TTS/.models.json | 10 + TTS/tts/datasets/preprocess.py | 16 + TTS/tts/utils/chinese_mandarin/__init__.py | 0 TTS/tts/utils/chinese_mandarin/numbers.py | 107 ++++ TTS/tts/utils/chinese_mandarin/phonemizer.py | 41 ++ .../chinese_mandarin/pinyinToPhonemes.py | 420 ++++++++++++++ TTS/tts/utils/synthesis.py | 1 + TTS/tts/utils/text/__init__.py | 18 +- TTS/tts/utils/text/cleaners.py | 9 + TTS/utils/synthesizer.py | 9 +- ...on2_TTS_and_MultiBand_MelGAN_Example.ipynb | 529 
++++++++++++++++++ 11 files changed, 1158 insertions(+), 2 deletions(-) create mode 100644 TTS/tts/utils/chinese_mandarin/__init__.py create mode 100644 TTS/tts/utils/chinese_mandarin/numbers.py create mode 100644 TTS/tts/utils/chinese_mandarin/phonemizer.py create mode 100644 TTS/tts/utils/chinese_mandarin/pinyinToPhonemes.py create mode 100644 notebooks/Chinese_Mandarin_DDC_GST_Tacotron2_TTS_and_MultiBand_MelGAN_Example.ipynb diff --git a/TTS/.models.json b/TTS/.models.json index 05997461..0fb187a4 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -75,6 +75,16 @@ "contact":"erengolge@gmail.com" } } + }, + "zh":{ + "baker":{ + "tacotron2-DDC-GST":{ + "model_file": "1RR9rZdV_FMm8yvtCHALtUbJf1nxbUiAw", + "config_file": "1daY1JHGXEozJ-MGYLiWEUmzEwEvM5xpz", + "stats_file": "1vl9c-D3dW_E7pdhNpDFQLX-giJc0jOtV", + "commit": "" + } + } } }, "vocoder_models":{ diff --git a/TTS/tts/datasets/preprocess.py b/TTS/tts/datasets/preprocess.py index 7815d87d..be479376 100644 --- a/TTS/tts/datasets/preprocess.py +++ b/TTS/tts/datasets/preprocess.py @@ -352,3 +352,19 @@ def _voxcel_x(root_path, meta_file, voxcel_idx): with open(str(cache_to), 'r') as f: return [x.strip().split('|') for x in f.readlines()] + + + + +# ======================================== Baker (chinese mandarin single speaker) =========================================== +def baker(root_path, meta_file): + """Normalizes the Baker meta data file to TTS format""" + txt_file = os.path.join(root_path, meta_file) + items = [] + speaker_name = "baker" + with open(txt_file, 'r') as ttf: + for line in ttf: + wav_name, text = line.rstrip('\n').split("|") + wav_path = os.path.join(root_path, "clips_22", wav_name) + items.append([text, wav_path, speaker_name]) + return items diff --git a/TTS/tts/utils/chinese_mandarin/__init__.py b/TTS/tts/utils/chinese_mandarin/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/TTS/tts/utils/chinese_mandarin/numbers.py b/TTS/tts/utils/chinese_mandarin/numbers.py new 
file mode 100644 index 00000000..8d2f40ff --- /dev/null +++ b/TTS/tts/utils/chinese_mandarin/numbers.py @@ -0,0 +1,107 @@ + +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Licensed under WTFPL or the Unlicense or CC0. +# This uses Python 3, but it's easy to port to Python 2 by changing +# strings to u'xx'. + +import re +import itertools + + +def _num2chinese(num :str, big=False, simp=True, o=False, twoalt=False): + """ + Converts numbers to Chinese representations. + `big` : use financial characters. + `simp` : use simplified characters instead of traditional characters. + `o` : use 〇 for zero. + `twoalt`: use 两/兩 for two when appropriate. + Note that `o` and `twoalt` is ignored when `big` is used, + and `twoalt` is ignored when `o` is used for formal representations. + """ + # check num first + nd = str(num) + if abs(float(nd)) >= 1e48: + raise ValueError('number out of range') + elif 'e' in nd: + raise ValueError('scientific notation is not supported') + c_symbol = '正负点' if simp else '正負點' + if o: # formal + twoalt = False + if big: + c_basic = '零壹贰叁肆伍陆柒捌玖' if simp else '零壹貳參肆伍陸柒捌玖' + c_unit1 = '拾佰仟' + c_twoalt = '贰' if simp else '貳' + else: + c_basic = '〇一二三四五六七八九' if o else '零一二三四五六七八九' + c_unit1 = '十百千' + if twoalt: + c_twoalt = '两' if simp else '兩' + else: + c_twoalt = '二' + c_unit2 = '万亿兆京垓秭穰沟涧正载' if simp else '萬億兆京垓秭穰溝澗正載' + revuniq = lambda l: ''.join(k for k, g in itertools.groupby(reversed(l))) + nd = str(num) + result = [] + if nd[0] == '+': + result.append(c_symbol[0]) + elif nd[0] == '-': + result.append(c_symbol[1]) + if '.' 
in nd: + integer, remainder = nd.lstrip('+-').split('.') + else: + integer, remainder = nd.lstrip('+-'), None + if int(integer): + splitted = [integer[max(i - 4, 0):i] + for i in range(len(integer), 0, -4)] + intresult = [] + for nu, unit in enumerate(splitted): + # special cases + if int(unit) == 0: # 0000 + intresult.append(c_basic[0]) + continue + elif nu > 0 and int(unit) == 2: # 0002 + intresult.append(c_twoalt + c_unit2[nu - 1]) + continue + ulist = [] + unit = unit.zfill(4) + for nc, ch in enumerate(reversed(unit)): + if ch == '0': + if ulist: # ???0 + ulist.append(c_basic[0]) + elif nc == 0: + ulist.append(c_basic[int(ch)]) + elif nc == 1 and ch == '1' and unit[1] == '0': + # special case for tens + # edit the 'elif' if you don't like + # 十四, 三千零十四, 三千三百一十四 + ulist.append(c_unit1[0]) + elif nc > 1 and ch == '2': + ulist.append(c_twoalt + c_unit1[nc - 1]) + else: + ulist.append(c_basic[int(ch)] + c_unit1[nc - 1]) + ustr = revuniq(ulist) + if nu == 0: + intresult.append(ustr) + else: + intresult.append(ustr + c_unit2[nu - 1]) + result.append(revuniq(intresult).strip(c_basic[0])) + else: + result.append(c_basic[0]) + if remainder: + result.append(c_symbol[2]) + result.append(''.join(c_basic[int(ch)] for ch in remainder)) + return ''.join(result) + + + + +def _number_replace(match : re.Match): + match_str: str = match.group() + return _num2chinese(match_str) + + +def replace_numbers_to_characters_in_text(text : str): + text = re.sub(r'[0-9]+', _number_replace, text) + return text \ No newline at end of file diff --git a/TTS/tts/utils/chinese_mandarin/phonemizer.py b/TTS/tts/utils/chinese_mandarin/phonemizer.py new file mode 100644 index 00000000..7742c491 --- /dev/null +++ b/TTS/tts/utils/chinese_mandarin/phonemizer.py @@ -0,0 +1,41 @@ +from typing import List + +import pypinyin + +from .pinyinToPhonemes import PINYIN_DICT + + +import jieba + + +def _chinese_character_to_pinyin(text: str) -> List[str]: + pinyins = pypinyin.pinyin( + text, 
style=pypinyin.Style.TONE3, heteronym=False, neutral_tone_with_five=True + ) + pinyins_flat_list = [item for sublist in pinyins for item in sublist] + return pinyins_flat_list + + +def _chinese_pinyin_to_phoneme(pinyin: str) -> str: + segment = pinyin[:-1] + tone = pinyin[-1] + phoneme = PINYIN_DICT.get(segment, [""])[0] + return phoneme + tone + + +def chinese_text_to_phonemes(text: str) -> str: + tokenized_text = jieba.cut(text, HMM=False) + tokenized_text = " ".join(tokenized_text) + pinyined_text: List[str] = _chinese_character_to_pinyin(tokenized_text) + + results: List[str] = [] + + for token in pinyined_text: + if token[-1] in "12345": # TODO transform to is_pinyin() + pinyin_phonemes = _chinese_pinyin_to_phoneme(token) + + results += list(pinyin_phonemes) + else: # is ponctuation or other + results += list(token) + + return "|".join(results) diff --git a/TTS/tts/utils/chinese_mandarin/pinyinToPhonemes.py b/TTS/tts/utils/chinese_mandarin/pinyinToPhonemes.py new file mode 100644 index 00000000..cdca44ac --- /dev/null +++ b/TTS/tts/utils/chinese_mandarin/pinyinToPhonemes.py @@ -0,0 +1,420 @@ + +PINYIN_DICT = { + "a": ["a"], + "ai": ["ai"], + "an": ["an"], + "ang": ["ɑŋ"], + "ao": ["aʌ"], + "ba": ["ba"], + "bai": ["bai"], + "ban": ["ban"], + "bang": ["bɑŋ"], + "bao": ["baʌ"], + # "be": ["be"], doesnt exist + "bei": ["bɛi"], + "ben": ["bœn"], + "beng": ["bɵŋ"], + "bi": ["bi"], + "bian": ["biɛn"], + "biao": ["biaʌ"], + "bie": ["bie"], + "bin": ["bin"], + "bing": ["bɨŋ"], + "bo": ["bo"], + "bu": ["bu"], + "ca": ["tsa"], + "cai": ["tsai"], + "can": ["tsan"], + "cang": ["tsɑŋ"], + "cao": ["tsaʌ"], + "ce": ["tsø"], + "cen": ["tsœn"], + "ceng": ["tsɵŋ"], + "cha": ["ʈʂa"], + "chai": ["ʈʂai"], + "chan": ["ʈʂan"], + "chang": ["ʈʂɑŋ"], + "chao": ["ʈʂaʌ"], + "che": ["ʈʂø"], + "chen": ["ʈʂœn"], + "cheng": ["ʈʂɵŋ"], + "chi": ["ʈʂʏ"], + "chong": ["ʈʂoŋ"], + "chou": ["ʈʂou"], + "chu": ["ʈʂu"], + "chua": ["ʈʂua"], + "chuai": ["ʈʂuai"], + "chuan": ["ʈʂuan"], + "chuang": 
["ʈʂuɑŋ"], + "chui": ["ʈʂuei"], + "chun": ["ʈʂun"], + "chuo": ["ʈʂuo"], + "ci": ["tsɪ"], + "cong": ["tsoŋ"], + "cou": ["tsou"], + "cu": ["tsu"], + "cuan": ["tsuan"], + "cui": ["tsuei"], + "cun": ["tsun"], + "cuo": ["tsuo"], + "da": ["da"], + "dai": ["dai"], + "dan": ["dan"], + "dang": ["dɑŋ"], + "dao": ["daʌ"], + "de": ["dø"], + "dei": ["dei"], + # "den": ["dœn"], + "deng": ["dɵŋ"], + "di": ["di"], + "dia": ["dia"], + "dian": ["diɛn"], + "diao": ["diaʌ"], + "die": ["die"], + "ding": ["dɨŋ"], + "diu": ["dio"], + "dong": ["doŋ"], + "dou": ["dou"], + "du": ["du"], + "duan": ["duan"], + "dui": ["duei"], + "dun": ["dun"], + "duo": ["duo"], + "e": ["ø"], + "ei": ["ei"], + "en": ["œn"], + # "ng": ["œn"], + # "eng": ["ɵŋ"], + "er": ["er"], + "fa": ["fa"], + "fan": ["fan"], + "fang": ["fɑŋ"], + "fei": ["fei"], + "fen": ["fœn"], + "feng": ["fɵŋ"], + "fo": ["fo"], + "fou": ["fou"], + "fu": ["fu"], + "ga": ["ga"], + "gai": ["gai"], + "gan": ["gan"], + "gang": ["gɑŋ"], + "gao": ["gaʌ"], + "ge": ["gø"], + "gei": ["gei"], + "gen": ["gœn"], + "geng": ["gɵŋ"], + "gong": ["goŋ"], + "gou": ["gou"], + "gu": ["gu"], + "gua": ["gua"], + "guai": ["guai"], + "guan": ["guan"], + "guang": ["guɑŋ"], + "gui": ["guei"], + "gun": ["gun"], + "guo": ["guo"], + "ha": ["xa"], + "hai": ["xai"], + "han": ["xan"], + "hang": ["xɑŋ"], + "hao": ["xaʌ"], + "he": ["xø"], + "hei": ["xei"], + "hen": ["xœn"], + "heng": ["xɵŋ"], + "hong": ["xoŋ"], + "hou": ["xou"], + "hu": ["xu"], + "hua": ["xua"], + "huai": ["xuai"], + "huan": ["xuan"], + "huang": ["xuɑŋ"], + "hui": ["xuei"], + "hun": ["xun"], + "huo": ["xuo"], + "ji": ["dʑi"], + "jia": ["dʑia"], + "jian": ["dʑiɛn"], + "jiang": ["dʑiɑŋ"], + "jiao": ["dʑiaʌ"], + "jie": ["dʑie"], + "jin": ["dʑin"], + "jing": ["dʑɨŋ"], + "jiong": ["dʑioŋ"], + "jiu": ["dʑio"], + "ju": ["dʑy"], + "juan": ["dʑyɛn"], + "jue": ["dʑye"], + "jun": ["dʑyn"], + "ka": ["ka"], + "kai": ["kai"], + "kan": ["kan"], + "kang": ["kɑŋ"], + "kao": ["kaʌ"], + "ke": ["kø"], + "kei": ["kei"], + 
"ken": ["kœn"], + "keng": ["kɵŋ"], + "kong": ["koŋ"], + "kou": ["kou"], + "ku": ["ku"], + "kua": ["kua"], + "kuai": ["kuai"], + "kuan": ["kuan"], + "kuang": ["kuɑŋ"], + "kui": ["kuei"], + "kun": ["kun"], + "kuo": ["kuo"], + "la": ["la"], + "lai": ["lai"], + "lan": ["lan"], + "lang": ["lɑŋ"], + "lao": ["laʌ"], + "le": ["lø"], + "lei": ["lei"], + "leng": ["lɵŋ"], + "li": ["li"], + "lia": ["lia"], + "lian": ["liɛn"], + "liang": ["liɑŋ"], + "liao": ["liaʌ"], + "lie": ["lie"], + "lin": ["lin"], + "ling": ["lɨŋ"], + "liu": ["lio"], + "lo": ["lo"], + "long": ["loŋ"], + "lou": ["lou"], + "lu": ["lu"], + "lv": ["ly"], + "luan": ["luan"], + "lve": ["lye"], + "lue": ["lue"], + "lun": ["lun"], + "luo": ["luo"], + "ma": ["ma"], + "mai": ["mai"], + "man": ["man"], + "mang": ["mɑŋ"], + "mao": ["maʌ"], + "me": ["mø"], + "mei": ["mei"], + "men": ["mœn"], + "meng": ["mɵŋ"], + "mi": ["mi"], + "mian": ["miɛn"], + "miao": ["miaʌ"], + "mie": ["mie"], + "min": ["min"], + "ming": ["mɨŋ"], + "miu": ["mio"], + "mo": ["mo"], + "mou": ["mou"], + "mu": ["mu"], + "na": ["na"], + "nai": ["nai"], + "nan": ["nan"], + "nang": ["nɑŋ"], + "nao": ["naʌ"], + "ne": ["nø"], + "nei": ["nei"], + "nen": ["nœn"], + "neng": ["nɵŋ"], + "ni": ["ni"], + "nia": ["nia"], + "nian": ["niɛn"], + "niang": ["niɑŋ"], + "niao": ["niaʌ"], + "nie": ["nie"], + "nin": ["nin"], + "ning": ["nɨŋ"], + "niu": ["nio"], + "nong": ["noŋ"], + "nou": ["nou"], + "nu": ["nu"], + "nv": ["ny"], + "nuan": ["nuan"], + "nve": ["nye"], + "nue": ["nye"], + "nuo": ["nuo"], + "o": ["o"], + "ou": ["ou"], + "pa": ["pa"], + "pai": ["pai"], + "pan": ["pan"], + "pang": ["pɑŋ"], + "pao": ["paʌ"], + "pe": ["pø"], + "pei": ["pei"], + "pen": ["pœn"], + "peng": ["pɵŋ"], + "pi": ["pi"], + "pian": ["piɛn"], + "piao": ["piaʌ"], + "pie": ["pie"], + "pin": ["pin"], + "ping": ["pɨŋ"], + "po": ["po"], + "pou": ["pou"], + "pu": ["pu"], + "qi": ["tɕi"], + "qia": ["tɕia"], + "qian": ["tɕiɛn"], + "qiang": ["tɕiɑŋ"], + "qiao": ["tɕiaʌ"], + "qie": ["tɕie"], + "qin": 
["tɕin"], + "qing": ["tɕɨŋ"], + "qiong": ["tɕioŋ"], + "qiu": ["tɕio"], + "qu": ["tɕy"], + "quan": ["tɕyɛn"], + "que": ["tɕye"], + "qun": ["tɕyn"], + "ran": ["ʐan"], + "rang": ["ʐɑŋ"], + "rao": ["ʐaʌ"], + "re": ["ʐø"], + "ren": ["ʐœn"], + "reng": ["ʐɵŋ"], + "ri": ["ʐʏ"], + "rong": ["ʐoŋ"], + "rou": ["ʐou"], + "ru": ["ʐu"], + "rua": ["ʐua"], + "ruan": ["ʐuan"], + "rui": ["ʐuei"], + "run": ["ʐun"], + "ruo": ["ʐuo"], + "sa": ["sa"], + "sai": ["sai"], + "san": ["san"], + "sang": ["sɑŋ"], + "sao": ["saʌ"], + "se": ["sø"], + "sen": ["sœn"], + "seng": ["sɵŋ"], + "sha": ["ʂa"], + "shai": ["ʂai"], + "shan": ["ʂan"], + "shang": ["ʂɑŋ"], + "shao": ["ʂaʌ"], + "she": ["ʂø"], + "shei": ["ʂei"], + "shen": ["ʂœn"], + "sheng": ["ʂɵŋ"], + "shi": ["ʂʏ"], + "shou": ["ʂou"], + "shu": ["ʂu"], + "shua": ["ʂua"], + "shuai": ["ʂuai"], + "shuan": ["ʂuan"], + "shuang": ["ʂuɑŋ"], + "shui": ["ʂuei"], + "shun": ["ʂun"], + "shuo": ["ʂuo"], + "si": ["sɪ"], + "song": ["soŋ"], + "sou": ["sou"], + "su": ["su"], + "suan": ["suan"], + "sui": ["suei"], + "sun": ["sun"], + "suo": ["suo"], + "ta": ["ta"], + "tai": ["tai"], + "tan": ["tan"], + "tang": ["tɑŋ"], + "tao": ["taʌ"], + "te": ["tø"], + "tei": ["tei"], + "teng": ["tɵŋ"], + "ti": ["ti"], + "tian": ["tiɛn"], + "tiao": ["tiaʌ"], + "tie": ["tie"], + "ting": ["tɨŋ"], + "tong": ["toŋ"], + "tou": ["tou"], + "tu": ["tu"], + "tuan": ["tuan"], + "tui": ["tuei"], + "tun": ["tun"], + "tuo": ["tuo"], + "wa": ["wa"], + "wai": ["wai"], + "wan": ["wan"], + "wang": ["wɑŋ"], + "wei": ["wei"], + "wen": ["wœn"], + "weng": ["wɵŋ"], + "wo": ["wo"], + "wu": ["wu"], + "xi": ["ɕi"], + "xia": ["ɕia"], + "xian": ["ɕiɛn"], + "xiang": ["ɕiɑŋ"], + "xiao": ["ɕiaʌ"], + "xie": ["ɕie"], + "xin": ["ɕin"], + "xing": ["ɕɨŋ"], + "xiong": ["ɕioŋ"], + "xiu": ["ɕio"], + "xu": ["ɕy"], + "xuan": ["ɕyɛn"], + "xue": ["ɕye"], + "xun": ["ɕyn"], + "ya": ["ia"], + "yan": ["iɛn"], + "yang": ["iɑŋ"], + "yao": ["iaʌ"], + "ye": ["ie"], + "yi": ["i"], + "yin": ["in"], + "ying": ["ɨŋ"], + "yo": 
["io"], + "yong": ["ioŋ"], + "you": ["io"], + "yu": ["y"], + "yuan": ["yɛn"], + "yue": ["ye"], + "yun": ["yn"], + "za": ["dza"], + "zai": ["dzai"], + "zan": ["dzan"], + "zang": ["dzɑŋ"], + "zao": ["dzaʌ"], + "ze": ["dzø"], + "zei": ["dzei"], + "zen": ["dzœn"], + "zeng": ["dzɵŋ"], + "zha": ["dʒa"], + "zhai": ["dʒai"], + "zhan": ["dʒan"], + "zhang": ["dʒɑŋ"], + "zhao": ["dʒaʌ"], + "zhe": ["dʒø"], + # "zhei": ["dʒei"], it doesn't exist + "zhen": ["dʒœn"], + "zheng": ["dʒɵŋ"], + "zhi": ["dʒʏ"], + "zhong": ["dʒoŋ"], + "zhou": ["dʒou"], + "zhu": ["dʒu"], + "zhua": ["dʒua"], + "zhuai": ["dʒuai"], + "zhuan": ["dʒuan"], + "zhuang": ["dʒuɑŋ"], + "zhui": ["dʒuei"], + "zhun": ["dʒun"], + "zhuo": ["dʒuo"], + "zi": ["dzɪ"], + "zong": ["dzoŋ"], + "zou": ["dzou"], + "zu": ["dzu"], + "zuan": ["dzuan"], + "zui": ["dzuei"], + "zun": ["dzun"], + "zuo": ["dzuo"], +} \ No newline at end of file diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index be587211..e7b1546e 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -219,6 +219,7 @@ def synthesis(model, ap (TTS.tts.utils.audio.AudioProcessor): audio processor to process model outputs. speaker_id (int): id of speaker + style_wav (str | Dict[str, float]): Uses for style embedding of GST. style_wav (str): Uses for style embedding of GST. truncated (bool): keep model states after inference. It can be used for continuous inference at long texts. 
diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py index 9771e691..16172596 100644 --- a/TTS/tts/utils/text/__init__.py +++ b/TTS/tts/utils/text/__init__.py @@ -8,6 +8,7 @@ from phonemizer.phonemize import phonemize from TTS.tts.utils.text import cleaners from TTS.tts.utils.text.symbols import (_bos, _eos, _punctuations, make_symbols, phonemes, symbols) +from TTS.tts.utils.chinese_mandarin.phonemizer import chinese_text_to_phonemes # pylint: disable=unnecessary-comprehension @@ -29,8 +30,23 @@ PHONEME_PUNCTUATION_PATTERN = r'['+_punctuations.replace(' ', '')+']+' def text2phone(text, language): ''' - Convert graphemes to phonemes. + Convert graphemes to phonemes. For most of the languages, it calls + the phonemizer python library that calls espeak/espeak-ng. For chinese + mandarin, it calls pypinyin + custom function for phonemizing + Parameters: + text (str): text to phonemize + language (str): language of the text + Returns: + ph (str): phonemes as a string seperated by "|" + ph = "ɪ|g|ˈ|z|æ|m|p|ə|l" ''' + + # TO REVIEW : How to have a good implementation for this? 
+ if language == "chinese-mandarin": + ph = chinese_text_to_phonemes(text) + return ph + + seperator = phonemizer.separator.Separator(' |', '', '|') #try: punctuations = re.findall(PHONEME_PUNCTUATION_PATTERN, text) diff --git a/TTS/tts/utils/text/cleaners.py b/TTS/tts/utils/text/cleaners.py index 7c3f1017..49a25557 100644 --- a/TTS/tts/utils/text/cleaners.py +++ b/TTS/tts/utils/text/cleaners.py @@ -15,6 +15,8 @@ from unidecode import unidecode from .number_norm import normalize_numbers from .abbreviations import abbreviations_en, abbreviations_fr from .time import expand_time_english +from TTS.tts.utils.chinese_mandarin.numbers import replace_numbers_to_characters_in_text + # Regular expression matching whitespace: _whitespace_re = re.compile(r'\s+') @@ -122,6 +124,13 @@ def portuguese_cleaners(text): text = collapse_whitespace(text) return text +def chinese_mandarin_cleaners(text: str) -> str: + '''Basic pipeline for chinese''' + text = replace_numbers_to_characters_in_text(text) + return text + + + def phoneme_cleaners(text): '''Pipeline for phonemes mode, including number and abbreviation expansion.''' text = expand_numbers(text) diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 2a779e53..4b4bc04c 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -122,6 +122,13 @@ class Synthesizer(object): speaker_embedding = self.init_speaker(speaker_idx) use_gl = self.vocoder_model is None + + # check if compute gst style + gst_style_input = None + if self.tts_config.use_gst: + if self.tts_config.gst["gst_style_input"] not in ["", {}]: + style_wav = self.tts_config.gst["gst_style_input"] + for sen in sens: # synthesize voice waveform, _, _, mel_postnet_spec, _, _ = synthesis( @@ -131,7 +138,7 @@ class Synthesizer(object): self.use_cuda, self.ap, speaker_idx, - None, + gst_style_input, False, self.tts_config.enable_eos_bos_chars, use_gl, diff --git a/notebooks/Chinese_Mandarin_DDC_GST_Tacotron2_TTS_and_MultiBand_MelGAN_Example.ipynb 
b/notebooks/Chinese_Mandarin_DDC_GST_Tacotron2_TTS_and_MultiBand_MelGAN_Example.ipynb new file mode 100644 index 00000000..709dbb8d --- /dev/null +++ b/notebooks/Chinese_Mandarin_DDC_GST_Tacotron2_TTS_and_MultiBand_MelGAN_Example.ipynb @@ -0,0 +1,529 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "6LWsNd3_M3MP" + }, + "source": [ + "# Mozilla TTS on CPU Real-Time Speech Synthesis " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "FAqrSIWgLyP0" + }, + "source": [ + "We use Tacotron2 and MultiBand-Melgan models and LJSpeech dataset.\n", + "\n", + "Tacotron2 is trained using [Double Decoder Consistency](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/) (DDC) only for 130K steps (3 days) with a single GPU.\n", + "\n", + "MultiBand-Melgan is trained 1.45M steps with real spectrograms.\n", + "\n", + "Note that both model performances can be improved with more training." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Ku-dA4DKoeXk" + }, + "source": [ + "### Download Models" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 162 + }, + "colab_type": "code", + "id": "jGIgnWhGsxU1", + "outputId": "88725e41-a8dc-4885-b3bf-cac939f38abe", + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mkdir: cannot create directory 'data/': File exists\n", + "Downloading...\n", + "From: https://drive.google.com/uc?id=1vl9c-D3dW_E7pdhNpDFQLX-giJc0jOtV\n", + "To: /root/projects/speech/mozilla-TTS_dev/notebooks/data/tts_scale_stats.npy\n", + "100%|██████████████████████████████████████| 10.5k/10.5k [00:00<00:00, 18.1MB/s]\n" + ] + } + ], + "source": [ + "! mkdir data/\n", + "! gdown --id 1RR9rZdV_FMm8yvtCHALtUbJf1nxbUiAw -O data/tts_model.pth.tar\n", + "! 
gdown --id 1daY1JHGXEozJ-MGYLiWEUmzEwEvM5xpz -O data/tts_config.json\n", + "! gdown --id 1vl9c-D3dW_E7pdhNpDFQLX-giJc0jOtV -O data/tts_scale_stats.npy" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 235 + }, + "colab_type": "code", + "id": "4dnpE0-kvTsu", + "outputId": "76377c6d-789c-4995-ba00-a21a6e1c401e", + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading...\n", + "From: https://drive.google.com/uc?id=11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU\n", + "To: /root/projects/speech/mozilla-TTS_dev/notebooks/data/vocoder_scale_stats.npy\n", + "100%|██████████████████████████████████████| 10.5k/10.5k [00:00<00:00, 16.7MB/s]\n" + ] + } + ], + "source": [ + "! gdown --id 1Ty5DZdOc0F7OTGj9oJThYbL5iVu_2G0K -O data/vocoder_model.pth.tar\n", + "! gdown --id 1Rd0R_nRCrbjEdpOwq6XwZAktvugiBvmu -O data/vocoder_config.json\n", + "! gdown --id 11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU -O data/vocoder_scale_stats.npy" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Zlgi8fPdpRF0" + }, + "source": [ + "### Define TTS function" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "f-Yc42nQZG5A" + }, + "outputs": [], + "source": [ + "def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True, style_wav=None):\n", + " t_1 = time.time()\n", + " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=style_wav,\n", + " truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars)\n", + " # mel_postnet_spec = ap.denormalize(mel_postnet_spec.T)\n", + " if not use_gl:\n", + " waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n", + " waveform = waveform.flatten()\n", + " if use_cuda:\n", + " waveform = waveform.cpu()\n", 
+ " waveform = waveform.numpy()\n", + " rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n", + " tps = (time.time() - t_1) / len(waveform)\n", + " print(waveform.shape)\n", + " print(\" > Run-time: {}\".format(time.time() - t_1))\n", + " print(\" > Real-time factor: {}\".format(rtf))\n", + " print(\" > Time per step: {}\".format(tps))\n", + " IPython.display.display(IPython.display.Audio(waveform, rate=CONFIG.audio['sample_rate'])) \n", + " return alignment, mel_postnet_spec, stop_tokens, waveform" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "ZksegYQepkFg" + }, + "source": [ + "### Load Models" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "oVa0kOamprgj" + }, + "outputs": [], + "source": [ + "import os\n", + "import torch\n", + "import time\n", + "import IPython\n", + "\n", + "from TTS.tts.utils.generic_utils import setup_model\n", + "from TTS.utils.io import load_config\n", + "from TTS.tts.utils.text.symbols import symbols, phonemes, make_symbols\n", + "from TTS.utils.audio import AudioProcessor\n", + "from TTS.tts.utils.synthesis import synthesis" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "EY-sHVO8IFSH" + }, + "outputs": [], + "source": [ + "# runtime settings\n", + "use_cuda = False" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "_1aIUp2FpxOQ" + }, + "outputs": [], + "source": [ + "# model paths\n", + "TTS_MODEL = \"/tank/models/tts/mozilla-TTS/tacotron2-DCC/chinese_mandarin/mandarin_dca_attn_gst_dcc-February-12-2021_03+13PM-5dbb48d/checkpoint_17000.pth.tar\"\n", + "TTS_CONFIG = \"/tank/models/tts/mozilla-TTS/tacotron2-DCC/chinese_mandarin/mandarin_dca_attn_gst_dcc-February-12-2021_03+13PM-5dbb48d/config.json\"\n", + "\n", + "TTS_MODEL = \"data/tts_model.pth.tar\"\n", + 
"TTS_CONFIG = \"data/tts_config.json\"\n", + "\n", + "VOCODER_MODEL = \"/root/.local/share/tts/vocoder_models--en--ljspeech--mulitband-melgan/model_file.pth.tar\"\n", + "VOCODER_CONFIG = \"/root/.local/share/tts/vocoder_models--en--ljspeech--mulitband-melgan/config.json\"\n", + "\n", + "VOCODER_MODEL = \"data/vocoder_model.pth.tar\"\n", + "VOCODER_CONFIG = \"data/vocoder_config.json\"" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "CpgmdBVQplbv" + }, + "outputs": [], + "source": [ + "# load configs\n", + "TTS_CONFIG = load_config(TTS_CONFIG)\n", + "VOCODER_CONFIG = load_config(VOCODER_CONFIG)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 471 + }, + "colab_type": "code", + "id": "zmrQxiozIUVE", + "outputId": "60c4daa0-4c5b-4a2e-fe0d-be437d003a49", + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " > Setting up Audio Processor...\n", + " | > sample_rate:22050\n", + " | > resample:False\n", + " | > num_mels:80\n", + " | > min_level_db:-100\n", + " | > frame_shift_ms:None\n", + " | > frame_length_ms:None\n", + " | > ref_level_db:0\n", + " | > fft_size:1024\n", + " | > power:1.5\n", + " | > preemphasis:0.0\n", + " | > griffin_lim_iters:60\n", + " | > signal_norm:True\n", + " | > symmetric_norm:True\n", + " | > mel_fmin:50.0\n", + " | > mel_fmax:7600.0\n", + " | > spec_gain:1.0\n", + " | > stft_pad_mode:reflect\n", + " | > max_norm:4.0\n", + " | > clip_norm:True\n", + " | > do_trim_silence:True\n", + " | > trim_db:60\n", + " | > do_sound_norm:False\n", + " | > stats_path:data/tts_scale_stats.npy\n", + " | > hop_length:256\n", + " | > win_length:1024\n" + ] + } + ], + "source": [ + "# load the audio processor\n", + "TTS_CONFIG.audio['stats_path'] = 'data/tts_scale_stats.npy'\n", + "ap = AudioProcessor(**TTS_CONFIG.audio) " + ] + }, + { + 
"cell_type": "code", + "execution_count": 68, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 + }, + "colab_type": "code", + "id": "8fLoI4ipqMeS", + "outputId": "b789066e-e305-42ad-b3ca-eba8d9267382", + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " > Using model: tacotron2\n" + ] + } + ], + "source": [ + "# LOAD TTS MODEL\n", + "# multi speaker \n", + "speaker_id = None\n", + "speakers = []\n", + "\n", + "# load the model (chinese_mandarin special characters/punctuations are in the tts_config.json)\n", + "if TTS_CONFIG.get(\"characters\"):\n", + " _characters = TTS_CONFIG[\"characters\"][\"characters\"]\n", + " _phonemes = TTS_CONFIG[\"characters\"][\"phonemes\"]\n", + " _punctuations = TTS_CONFIG[\"characters\"][\"punctuations\"]\n", + " _pad = TTS_CONFIG[\"characters\"][\"pad\"]\n", + " _eos = TTS_CONFIG[\"characters\"][\"eos\"]\n", + " _bos = TTS_CONFIG[\"characters\"][\"bos\"]\n", + " \n", + " symbols, phonemes = make_symbols(_characters, _phonemes, punctuations= _punctuations, pad=_pad, eos=_eos, bos=_bos )\n", + "\n", + "num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)\n", + "model = setup_model(num_chars, len(speakers), TTS_CONFIG)\n", + "\n", + "# load model state\n", + "cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))\n", + "\n", + "# load the model\n", + "model.load_state_dict(cp['model'])\n", + "if use_cuda:\n", + " model.cuda()\n", + "model.eval()\n", + "\n", + "# set model stepsize\n", + "if 'r' in cp:\n", + " model.decoder.set_r(cp['r'])" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "colab_type": "code", + "id": "zKoq0GgzqzhQ", + "outputId": "234efc61-f37a-40bc-95a3-b51896018ccb", + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " > Generator Model: 
multiband_melgan_generator\n", + " > Setting up Audio Processor...\n", + " | > sample_rate:22050\n", + " | > resample:False\n", + " | > num_mels:80\n", + " | > min_level_db:-100\n", + " | > frame_shift_ms:None\n", + " | > frame_length_ms:None\n", + " | > ref_level_db:0\n", + " | > fft_size:1024\n", + " | > power:None\n", + " | > preemphasis:0.0\n", + " | > griffin_lim_iters:None\n", + " | > signal_norm:True\n", + " | > symmetric_norm:True\n", + " | > mel_fmin:50.0\n", + " | > mel_fmax:7600.0\n", + " | > spec_gain:1.0\n", + " | > stft_pad_mode:reflect\n", + " | > max_norm:4.0\n", + " | > clip_norm:True\n", + " | > do_trim_silence:True\n", + " | > trim_db:60\n", + " | > do_sound_norm:False\n", + " | > stats_path:data/vocoder_scale_stats.npy\n", + " | > hop_length:256\n", + " | > win_length:1024\n", + "\n", + "Vocoder loaded\n" + ] + } + ], + "source": [ + "from TTS.vocoder.utils.generic_utils import setup_generator\n", + "\n", + "# LOAD VOCODER MODEL\n", + "vocoder_model = setup_generator(VOCODER_CONFIG)\n", + "vocoder_model.load_state_dict(torch.load(VOCODER_MODEL, map_location=\"cpu\")[\"model\"])\n", + "vocoder_model.remove_weight_norm()\n", + "vocoder_model.inference_padding = 0\n", + "\n", + "\n", + "VOCODER_CONFIG.audio['stats_path'] = 'data/vocoder_scale_stats.npy'\n", + "ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio']) \n", + "if use_cuda:\n", + " vocoder_model.cuda()\n", + "vocoder_model.eval()\n", + "print(\"\\nVocoder loaded\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Ws_YkPKsLgo-" + }, + "source": [ + "## Run Inference" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [], + "source": [ + "# Here some test sentences for you to play with :\n", + "sentence = \"我从来不会说很标准的中文。\"\n", + "sentence = \"我喜欢听人工智能的博客。\"\n", + "sentence = \"我来自一个法国郊区的地方。\"\n", + "sentence = \"不比不知道,一比吓一跳!\"\n", + "sentence = \"台湾是一个真的很好玩的地方!\"\n", + "sentence = \"干一行,行一行,行行都行。\"\n", + 
"sentence = \"我要盖被子,好尴尬!\"" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [], + "source": [ + "# You can also play with the style_wav global style token. However, the lady speaking in the baker dataset\n", + "# has no emotion through all the sentences. It's hard to get some nice GST with this.\n", + "# That's also why adding \"!\" or \"?\" at the end of sentence change nothing. The dataset has no such prosody.\n", + "style_wav = {\"2\": 0.3, \"1\": -0.1}\n" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 134 + }, + "colab_type": "code", + "id": "FuWxZ9Ey5Puj", + "outputId": "9c06adad-5451-4393-89a1-a2e7dc39ab91", + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(50688,)\n", + " > Run-time: 1.5945854187011719\n", + " > Real-time factor: 0.6935317513786934\n", + " > Time per step: 3.145291761617468e-05\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "sentence = \"我喜欢听人工智能的博客。\"\n", + "style_wav = {\"2\": 0.2, \"7\": -0.1}\n", + "\n", + "align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, use_cuda, ap, use_gl=False, figures=True, style_wav= style_wav)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "DDC-TTS_and_MultiBand-MelGAN_Example.ipynb", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": 
"python", + "pygments_lexer": "ipython3", + "version": "3.7.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 3e59d3c28d918038c2e10cebcbc6613e675fc377 Mon Sep 17 00:00:00 2001 From: kirianguiller Date: Tue, 16 Feb 2021 17:25:18 +0100 Subject: [PATCH 3/6] modify according to PR reviews --- TTS/tts/datasets/preprocess.py | 12 ++++-- TTS/tts/utils/chinese_mandarin/numbers.py | 46 +++++++++++++++++------ TTS/tts/utils/synthesis.py | 1 - TTS/tts/utils/text/__init__.py | 15 ++++---- TTS/utils/synthesizer.py | 2 +- 5 files changed, 52 insertions(+), 24 deletions(-) diff --git a/TTS/tts/datasets/preprocess.py b/TTS/tts/datasets/preprocess.py index be479376..78bf14d1 100644 --- a/TTS/tts/datasets/preprocess.py +++ b/TTS/tts/datasets/preprocess.py @@ -3,6 +3,7 @@ from glob import glob import re import sys from pathlib import Path +from typing import List, Tuple from tqdm import tqdm @@ -355,10 +356,15 @@ def _voxcel_x(root_path, meta_file, voxcel_idx): +def baker(root_path: str, meta_file: str) -> List[List[str]]: + """Normalizes the Baker meta data file to TTS format -# ======================================== Baker (chinese mandarin single speaker) =========================================== -def baker(root_path, meta_file): - """Normalizes the Baker meta data file to TTS format""" + Args: + root_path (str): path to the baker dataset + meta_file (str): name of the meta dataset containing names of wav to select and the transcript of the sentence + Returns: + List[List[str]]: List of (text, wav_path, speaker_name) associated with each sentences + """ txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "baker" diff --git a/TTS/tts/utils/chinese_mandarin/numbers.py b/TTS/tts/utils/chinese_mandarin/numbers.py index 8d2f40ff..a662ea1c 100644 --- a/TTS/tts/utils/chinese_mandarin/numbers.py +++ b/TTS/tts/utils/chinese_mandarin/numbers.py @@ -10,16 +10,24 @@ import re import itertools -def _num2chinese(num :str, big=False, simp=True, o=False, 
twoalt=False): - """ - Converts numbers to Chinese representations. - `big` : use financial characters. - `simp` : use simplified characters instead of traditional characters. - `o` : use 〇 for zero. - `twoalt`: use 两/兩 for two when appropriate. - Note that `o` and `twoalt` is ignored when `big` is used, - and `twoalt` is ignored when `o` is used for formal representations. +def _num2chinese(num :str, big=False, simp=True, o=False, twoalt=False) -> str: + """Convert numerical arabic numbers (0->9) to chinese hanzi numbers (〇 -> 九) + + Args: + num (str): arabic number to convert + big (bool, optional): use financial characters. Defaults to False. + simp (bool, optional): use simplified characters instead of traditional characters. Defaults to True. + o (bool, optional): use 〇 for 'zero'. Defaults to False. + twoalt (bool, optional): use 两/兩 for 'two' when appropriate. Defaults to False. + + Raises: + ValueError: if number is more than 1e48 + ValueError: if 'e' exponent in number + + Returns: + str: converted number as hanzi characters """ + # check num first nd = str(num) if abs(float(nd)) >= 1e48: @@ -97,11 +105,27 @@ def _num2chinese(num :str, big=False, simp=True, o=False, twoalt=False): -def _number_replace(match : re.Match): +def _number_replace(match: re.Match) -> str: + """function to apply in a match, transform all numbers in a match by chinese characters + + Args: + match (re.Match): numbers regex matches + + Returns: + str: replaced characters for the numbers + """ match_str: str = match.group() return _num2chinese(match_str) -def replace_numbers_to_characters_in_text(text : str): +def replace_numbers_to_characters_in_text(text: str) -> str: + """Replace all arabic numbers in a text by their equivalent in chinese characters (simplified) + + Args: + text (str): input text to transform + + Returns: + str: output text + """ text = re.sub(r'[0-9]+', _number_replace, text) return text \ No newline at end of file diff --git a/TTS/tts/utils/synthesis.py
b/TTS/tts/utils/synthesis.py index e7b1546e..adbd0d20 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -220,7 +220,6 @@ def synthesis(model, model outputs. speaker_id (int): id of speaker style_wav (str | Dict[str, float]): Uses for style embedding of GST. - style_wav (str): Uses for style embedding of GST. truncated (bool): keep model states after inference. It can be used for continuous inference at long texts. enable_eos_bos_chars (bool): enable special chars for end of sentence and start of sentence. diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py index 16172596..4f4a38ea 100644 --- a/TTS/tts/utils/text/__init__.py +++ b/TTS/tts/utils/text/__init__.py @@ -29,16 +29,15 @@ PHONEME_PUNCTUATION_PATTERN = r'['+_punctuations.replace(' ', '')+']+' def text2phone(text, language): - ''' - Convert graphemes to phonemes. For most of the languages, it calls + '''Convert graphemes to phonemes. For most of the languages, it calls the phonemizer python library that calls espeak/espeak-ng. For chinese mandarin, it calls pypinyin + custom function for phonemizing - Parameters: - text (str): text to phonemize - language (str): language of the text - Returns: - ph (str): phonemes as a string seperated by "|" - ph = "ɪ|g|ˈ|z|æ|m|p|ə|l" + Parameters: + text (str): text to phonemize + language (str): language of the text + Returns: + ph (str): phonemes as a string separated by "|" + ph = "ɪ|g|ˈ|z|æ|m|p|ə|l" ''' # TO REVIEW : How to have a good implementation for this?
diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 4b4bc04c..3e65e175 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -127,7 +127,7 @@ class Synthesizer(object): gst_style_input = None if self.tts_config.use_gst: if self.tts_config.gst["gst_style_input"] not in ["", {}]: - style_wav = self.tts_config.gst["gst_style_input"] + gst_style_input = self.tts_config.gst["gst_style_input"] for sen in sens: # synthesize voice From a9ea71c601a31256c6fa0d9e31de7d04cbbe0d8e Mon Sep 17 00:00:00 2001 From: kirianguiller Date: Tue, 16 Feb 2021 17:41:06 +0100 Subject: [PATCH 4/6] remove re.Match typing in '_number_replace()' --- TTS/tts/utils/chinese_mandarin/numbers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/tts/utils/chinese_mandarin/numbers.py b/TTS/tts/utils/chinese_mandarin/numbers.py index a662ea1c..0befe6b1 100644 --- a/TTS/tts/utils/chinese_mandarin/numbers.py +++ b/TTS/tts/utils/chinese_mandarin/numbers.py @@ -105,7 +105,7 @@ def _num2chinese(num :str, big=False, simp=True, o=False, twoalt=False) -> str: -def _number_replace(match: re.Match) -> str: +def _number_replace(match) -> str: """function to apply in a match, transform all numbers in a match by chinese characters Args: From fe049cb48091562e13a0294d4df730d21816d105 Mon Sep 17 00:00:00 2001 From: kirianguiller Date: Tue, 16 Feb 2021 17:53:39 +0100 Subject: [PATCH 5/6] add pypinyin and jieba to requierements.txt (chinese implementation) --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index 7a0d9f76..659fe787 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,8 @@ numba==0.48 librosa==0.7.2 phonemizer>=2.2.0 unidecode==0.4.20 +pypinyin +jieba tensorboardX matplotlib Pillow From 45435624678fb756406e97f881f0bbc785748e6e Mon Sep 17 00:00:00 2001 From: kirianguiller Date: Tue, 16 Feb 2021 20:23:02 +0100 Subject: [PATCH 6/6] remove gst handling in synthetizer.py class --- 
TTS/utils/synthesizer.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 3e65e175..2a779e53 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -122,13 +122,6 @@ class Synthesizer(object): speaker_embedding = self.init_speaker(speaker_idx) use_gl = self.vocoder_model is None - - # check if compute gst style - gst_style_input = None - if self.tts_config.use_gst: - if self.tts_config.gst["gst_style_input"] not in ["", {}]: - gst_style_input = self.tts_config.gst["gst_style_input"] - for sen in sens: # synthesize voice waveform, _, _, mel_postnet_spec, _, _ = synthesis( @@ -138,7 +131,7 @@ class Synthesizer(object): self.use_cuda, self.ap, speaker_idx, - gst_style_input, + None, False, self.tts_config.enable_eos_bos_chars, use_gl,