<add> Chinese mandarin implementation (tacotron2)

2021-02-15 16:04:47 +01:00 · 2021-02-15 16:04:47 +01:00 · c4c7bc1b88
parent eb543c027e
commit c4c7bc1b88
11 changed files with 1158 additions and 2 deletions
--- a/TTS/.models.json
+++ b/TTS/.models.json
@ -75,6 +75,16 @@
                    "contact":"erengolge@gmail.com"
                }
            }
        },
        "zh":{
            "baker":{
                "tacotron2-DDC-GST":{
                    "model_file": "1RR9rZdV_FMm8yvtCHALtUbJf1nxbUiAw",
                    "config_file": "1daY1JHGXEozJ-MGYLiWEUmzEwEvM5xpz",
                    "stats_file": "1vl9c-D3dW_E7pdhNpDFQLX-giJc0jOtV",
                    "commit": ""
                }
            }
        }
    },
    "vocoder_models":{
--- a/TTS/tts/datasets/preprocess.py
+++ b/TTS/tts/datasets/preprocess.py
@ -352,3 +352,19 @@ def _voxcel_x(root_path, meta_file, voxcel_idx):
    with open(str(cache_to), 'r') as f:
        return [x.strip().split('|') for x in f.readlines()]
 # ======================================== Baker (chinese mandarin single speaker) ===========================================
 def baker(root_path, meta_file):
    """Normalize the Baker metadata file to the TTS item format.

    Every non-blank line of the metadata file must look like
    ``<wav_name>|<text>``.

    Args:
        root_path (str): dataset root; the metadata file and the
            ``clips_22`` audio folder are resolved relative to it.
        meta_file (str): metadata file name, relative to ``root_path``.

    Returns:
        list: one ``[text, wav_path, speaker_name]`` entry per non-blank
            line, with ``speaker_name`` always ``"baker"``.

    Raises:
        ValueError: if a non-blank line does not split into exactly two
            fields on ``"|"``.
    """
    txt_file = os.path.join(root_path, meta_file)
    items = []
    speaker_name = "baker"
    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding, which would mis-decode Chinese transcripts on some systems.
    # NOTE(review): assumes the metadata file is UTF-8 encoded -- confirm.
    with open(txt_file, 'r', encoding='utf-8') as ttf:
        for line in ttf:
            line = line.rstrip('\n')
            if not line.strip():
                # Blank lines (e.g. a trailing empty line) carry no entry.
                continue
            wav_name, text = line.split("|")
            wav_path = os.path.join(root_path, "clips_22", wav_name)
            items.append([text, wav_path, speaker_name])
    return items
--- a/TTS/tts/utils/chinese_mandarin/init.py
+++ b/TTS/tts/utils/chinese_mandarin/init.py
--- a/TTS/tts/utils/chinese_mandarin/numbers.py
+++ b/TTS/tts/utils/chinese_mandarin/numbers.py
@ -0,0 +1,107 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 # Licensed under WTFPL or the Unlicense or CC0.
 # This uses Python 3, but it's easy to port to Python 2 by changing
 # strings to u'xx'.
 import re
 import itertools
 def _num2chinese(num: str, big=False, simp=True, o=False, twoalt=False):
    """Spell out a decimal number with Chinese numerals.

    Args:
        num: the number to convert. It is passed through ``str()`` first,
            so digit strings such as ``"2021"`` or ``"-3.14"`` work.
        big: use the financial numerals (壹, 贰, ... 拾, 佰, 仟).
        simp: use simplified rather than traditional characters.
        o: write zero as 〇 (formal style); this also switches ``twoalt``
            off. Has no effect when ``big`` is set.
        twoalt: use 两/兩 for two in the hundreds and thousands places and
            for a group that is exactly two. Has no effect when ``big`` is
            set.

    Returns:
        str: the Chinese reading of ``num``.

    Raises:
        ValueError: if the magnitude is 1e48 or more, or if the text
            contains an ``e`` (scientific notation).
    """
    text = str(num)
    # Validate up front; float() also rejects anything that is not a number.
    if abs(float(text)) >= 1e48:
        raise ValueError('number out of range')
    if 'e' in text:
        raise ValueError('scientific notation is not supported')

    if o:
        # The formal 〇 style never uses the alternative form of two.
        twoalt = False

    # Character tables: sign/point marks and the units of each 4-digit group.
    if simp:
        signs = '正负点'
        big_units = '万亿兆京垓秭穰沟涧正载'
    else:
        signs = '正負點'
        big_units = '萬億兆京垓秭穰溝澗正載'

    # Digits, the units inside one group, and the character used for "two".
    if big:
        digits = '零壹贰叁肆伍陆柒捌玖' if simp else '零壹貳參肆伍陸柒捌玖'
        small_units = '拾佰仟'
        two = '贰' if simp else '貳'
    else:
        digits = '〇一二三四五六七八九' if o else '零一二三四五六七八九'
        small_units = '十百千'
        two = '二'
        if twoalt:
            two = '两' if simp else '兩'
    zero = digits[0]

    def collapse_reversed(parts):
        # Join the parts from last to first, merging runs of identical
        # neighbouring parts into a single occurrence.
        return ''.join(key for key, _ in itertools.groupby(reversed(parts)))

    def read_group(group):
        # Read one group of up to four digits, lowest place first.
        padded = group.zfill(4)
        parts = []
        for place, digit in enumerate(reversed(padded)):
            if digit == '0':
                # Only emit a zero once something lower has been written.
                if parts:
                    parts.append(zero)
            elif place == 0:
                parts.append(digits[int(digit)])
            elif place == 1 and digit == '1' and padded[1] == '0':
                # Tens with an empty hundreds place drop the leading "one":
                # 十四, 三千零十四, but 三千三百一十四.
                parts.append(small_units[0])
            elif place > 1 and digit == '2':
                parts.append(two + small_units[place - 1])
            else:
                parts.append(digits[int(digit)] + small_units[place - 1])
        return collapse_reversed(parts)

    output = []
    if text[0] == '+':
        output.append(signs[0])
    elif text[0] == '-':
        output.append(signs[1])

    unsigned = text.lstrip('+-')
    if '.' in text:
        integer, remainder = unsigned.split('.')
    else:
        integer, remainder = unsigned, None

    if int(integer):
        # Cut the integer part into groups of four digits, lowest first.
        groups = []
        end = len(integer)
        while end > 0:
            groups.append(integer[max(end - 4, 0):end])
            end -= 4

        readings = []
        for index, group in enumerate(groups):
            value = int(group)
            if value == 0:  # 0000
                readings.append(zero)
            elif index > 0 and value == 2:  # 0002
                readings.append(two + big_units[index - 1])
            elif index == 0:
                readings.append(read_group(group))
            else:
                readings.append(read_group(group) + big_units[index - 1])
        output.append(collapse_reversed(readings).strip(zero))
    else:
        output.append(zero)

    if remainder:
        # Fractional digits are read one by one after the point mark.
        output.append(signs[2])
        output.append(''.join(digits[int(ch)] for ch in remainder))
    return ''.join(output)
 def _number_replace(match: re.Match):
    """Regex substitution callback: return the matched text spelled out in Chinese numerals."""
    return _num2chinese(match.group())
 def replace_numbers_to_characters_in_text(text: str):
    """Replace every run of ASCII digits in ``text`` with its Chinese reading.

    Each match of ``[0-9]+`` is converted on its own by ``_number_replace``;
    everything else in ``text`` is left untouched.
    """
    return re.sub(r'[0-9]+', _number_replace, text)
--- a/TTS/tts/utils/chinese_mandarin/phonemizer.py
+++ b/TTS/tts/utils/chinese_mandarin/phonemizer.py
@ -0,0 +1,41 @@
 from typing import List
 import pypinyin
 from .pinyinToPhonemes import PINYIN_DICT
 import jieba
 def _chinese_character_to_pinyin(text: str) -> List[str]:
    """Convert ``text`` to a flat list of pinyin tokens.

    ``pypinyin.pinyin`` is called with the TONE3 style, heteronyms disabled
    and ``neutral_tone_with_five=True``; it returns one sub-list per token,
    and those sub-lists are concatenated in order.
    """
    nested_pinyins = pypinyin.pinyin(
        text,
        style=pypinyin.Style.TONE3,
        heteronym=False,
        neutral_tone_with_five=True,
    )
    flattened: List[str] = []
    for readings in nested_pinyins:
        flattened.extend(readings)
    return flattened
 def _chinese_pinyin_to_phoneme(pinyin: str) -> str:
    """Translate one tone-suffixed pinyin token into phonemes plus tone.

    The last character of ``pinyin`` is kept as the tone mark and the rest is
    looked up in ``PINYIN_DICT``. A syllable missing from the table
    contributes an empty string, so only the tone mark is returned for it.
    """
    syllable, tone_mark = pinyin[:-1], pinyin[-1]
    candidates = PINYIN_DICT.get(syllable, [""])
    return candidates[0] + tone_mark
 def chinese_text_to_phonemes(text: str) -> str:
    """Phonemize Chinese text into a ``"|"``-separated string of symbols.

    The text is segmented with ``jieba`` (HMM disabled) and the words are
    re-joined with single spaces before the pinyin conversion. Tokens whose
    last character is one of ``1``-``5`` are treated as pinyin and mapped to
    phonemes; any other token is passed through unchanged. In both cases the
    result is emitted one character at a time.
    """
    spaced_text = " ".join(jieba.cut(text, HMM=False))
    pieces: List[str] = []
    for token in _chinese_character_to_pinyin(spaced_text):
        # TODO transform to is_pinyin()
        if token[-1] in "12345":
            pieces.extend(_chinese_pinyin_to_phoneme(token))
        else:
            # Punctuation or any other non-pinyin token.
            pieces.extend(token)
    return "|".join(pieces)
--- a/TTS/tts/utils/chinese_mandarin/pinyinToPhonemes.py
+++ b/TTS/tts/utils/chinese_mandarin/pinyinToPhonemes.py
@ -0,0 +1,420 @@
 # Lookup table from a toneless pinyin syllable to a one-element list holding
 # the phoneme string used for that syllable. Keys are plain ASCII; the "lv" /
 # "nv" / "lve" / "nve" keys presumably spell "ü" as "v" (verify against the
 # pinyin producer). Entries that are commented out are not part of the table,
 # so a lookup for them falls back to whatever default the caller supplies.
 PINYIN_DICT = {
    "a": ["a"],
    "ai": ["ai"],
    "an": ["an"],
    "ang": ["ɑŋ"],
    "ao": ["aʌ"],
    "ba": ["ba"],
    "bai": ["bai"],
    "ban": ["ban"],
    "bang": ["bɑŋ"],
    "bao": ["baʌ"], 
    # "be": ["be"], doesn't exist
    # NOTE(review): "bei" uses "ɛi" while every other "-ei" entry in this
    # table uses "ei" -- confirm this is intended.
    "bei": ["bɛi"],
    "ben": ["bœn"],
    "beng": ["bɵŋ"],
    "bi": ["bi"],
    "bian": ["biɛn"],
    "biao": ["biaʌ"],
    "bie": ["bie"],
    "bin": ["bin"],
    "bing": ["bɨŋ"],
    "bo": ["bo"],
    "bu": ["bu"],
    "ca": ["tsa"],
    "cai": ["tsai"],
    "can": ["tsan"],
    "cang": ["tsɑŋ"],
    "cao": ["tsaʌ"],
    "ce": ["tsø"],
    "cen": ["tsœn"],
    "ceng": ["tsɵŋ"],
    "cha": ["ʈʂa"],
    "chai": ["ʈʂai"],
    "chan": ["ʈʂan"],
    "chang": ["ʈʂɑŋ"],
    "chao": ["ʈʂaʌ"],
    "che": ["ʈʂø"],
    "chen": ["ʈʂœn"],
    "cheng": ["ʈʂɵŋ"],
    "chi": ["ʈʂʏ"],
    "chong": ["ʈʂoŋ"],
    "chou": ["ʈʂou"],
    "chu": ["ʈʂu"],
    "chua": ["ʈʂua"],
    "chuai": ["ʈʂuai"],
    "chuan": ["ʈʂuan"],
    "chuang": ["ʈʂuɑŋ"],
    "chui": ["ʈʂuei"],
    "chun": ["ʈʂun"],
    "chuo": ["ʈʂuo"],
    "ci": ["tsɪ"],
    "cong": ["tsoŋ"],
    "cou": ["tsou"],
    "cu": ["tsu"],
    "cuan": ["tsuan"],
    "cui": ["tsuei"],
    "cun": ["tsun"],
    "cuo": ["tsuo"],
    "da": ["da"],
    "dai": ["dai"],
    "dan": ["dan"],
    "dang": ["dɑŋ"],
    "dao": ["daʌ"],
    "de": ["dø"],
    "dei": ["dei"],
    # "den": ["dœn"],
    "deng": ["dɵŋ"],
    "di": ["di"],
    "dia": ["dia"],
    "dian": ["diɛn"],
    "diao": ["diaʌ"],
    "die": ["die"],
    "ding": ["dɨŋ"],
    "diu": ["dio"],
    "dong": ["doŋ"],
    "dou": ["dou"],
    "du": ["du"],
    "duan": ["duan"],
    "dui": ["duei"],
    "dun": ["dun"],
    "duo": ["duo"],
    "e": ["ø"],
    "ei": ["ei"],
    "en": ["œn"],
    # "ng": ["œn"],
    # "eng": ["ɵŋ"],
    "er": ["er"],
    "fa": ["fa"],
    "fan": ["fan"],
    "fang": ["fɑŋ"],
    "fei": ["fei"],
    "fen": ["fœn"],
    "feng": ["fɵŋ"],
    "fo": ["fo"],
    "fou": ["fou"],
    "fu": ["fu"],
    "ga": ["ga"],
    "gai": ["gai"],
    "gan": ["gan"],
    "gang": ["gɑŋ"],
    "gao": ["gaʌ"],
    "ge": ["gø"],
    "gei": ["gei"],
    "gen": ["gœn"],
    "geng": ["gɵŋ"],
    "gong": ["goŋ"],
    "gou": ["gou"],
    "gu": ["gu"],
    "gua": ["gua"],
    "guai": ["guai"],
    "guan": ["guan"],
    "guang": ["guɑŋ"],
    "gui": ["guei"],
    "gun": ["gun"],
    "guo": ["guo"],
    "ha": ["xa"],
    "hai": ["xai"],
    "han": ["xan"],
    "hang": ["xɑŋ"],
    "hao": ["xaʌ"],
    "he": ["xø"],
    "hei": ["xei"],
    "hen": ["xœn"],
    "heng": ["xɵŋ"],
    "hong": ["xoŋ"],
    "hou": ["xou"],
    "hu": ["xu"],
    "hua": ["xua"],
    "huai": ["xuai"],
    "huan": ["xuan"],
    "huang": ["xuɑŋ"],
    "hui": ["xuei"],
    "hun": ["xun"],
    "huo": ["xuo"],
    "ji": ["dʑi"],
    "jia": ["dʑia"],
    "jian": ["dʑiɛn"],
    "jiang": ["dʑiɑŋ"],
    "jiao": ["dʑiaʌ"],
    "jie": ["dʑie"],
    "jin": ["dʑin"],
    "jing": ["dʑɨŋ"],
    "jiong": ["dʑioŋ"],
    "jiu": ["dʑio"],
    "ju": ["dʑy"],
    "juan": ["dʑyɛn"],
    "jue": ["dʑye"],
    "jun": ["dʑyn"],
    "ka": ["ka"],
    "kai": ["kai"],
    "kan": ["kan"],
    "kang": ["kɑŋ"],
    "kao": ["kaʌ"],
    "ke": ["kø"],
    "kei": ["kei"],
    "ken": ["kœn"],
    "keng": ["kɵŋ"],
    "kong": ["koŋ"],
    "kou": ["kou"],
    "ku": ["ku"],
    "kua": ["kua"],
    "kuai": ["kuai"],
    "kuan": ["kuan"],
    "kuang": ["kuɑŋ"],
    "kui": ["kuei"],
    "kun": ["kun"],
    "kuo": ["kuo"],
    "la": ["la"],
    "lai": ["lai"],
    "lan": ["lan"],
    "lang": ["lɑŋ"],
    "lao": ["laʌ"],
    "le": ["lø"],
    "lei": ["lei"],
    "leng": ["lɵŋ"],
    "li": ["li"],
    "lia": ["lia"],
    "lian": ["liɛn"],
    "liang": ["liɑŋ"],
    "liao": ["liaʌ"],
    "lie": ["lie"],
    "lin": ["lin"],
    "ling": ["lɨŋ"],
    "liu": ["lio"],
    "lo": ["lo"],
    "long": ["loŋ"],
    "lou": ["lou"],
    "lu": ["lu"],
    "lv": ["ly"],
    "luan": ["luan"],
    "lve": ["lye"],
    # NOTE(review): "lue" maps to "lue" whereas "lve", "nve" and "nue" all
    # end in "ye" -- confirm whether "lye" was meant here.
    "lue": ["lue"],
    "lun": ["lun"],
    "luo": ["luo"],
    "ma": ["ma"],
    "mai": ["mai"],
    "man": ["man"],
    "mang": ["mɑŋ"],
    "mao": ["maʌ"],
    "me": ["mø"],
    "mei": ["mei"],
    "men": ["mœn"],
    "meng": ["mɵŋ"],
    "mi": ["mi"],
    "mian": ["miɛn"],
    "miao": ["miaʌ"],
    "mie": ["mie"],
    "min": ["min"],
    "ming": ["mɨŋ"],
    "miu": ["mio"],
    "mo": ["mo"],
    "mou": ["mou"],
    "mu": ["mu"],
    "na": ["na"],
    "nai": ["nai"],
    "nan": ["nan"],
    "nang": ["nɑŋ"],
    "nao": ["naʌ"],
    "ne": ["nø"],
    "nei": ["nei"],
    "nen": ["nœn"],
    "neng": ["nɵŋ"],
    "ni": ["ni"],
    "nia": ["nia"],
    "nian": ["niɛn"],
    "niang": ["niɑŋ"],
    "niao": ["niaʌ"],
    "nie": ["nie"],
    "nin": ["nin"],
    "ning": ["nɨŋ"],
    "niu": ["nio"],
    "nong": ["noŋ"],
    "nou": ["nou"],
    "nu": ["nu"],
    "nv": ["ny"],
    "nuan": ["nuan"],
    "nve": ["nye"],
    "nue": ["nye"],
    "nuo": ["nuo"],
    "o": ["o"],
    "ou": ["ou"],
    "pa": ["pa"],
    "pai": ["pai"],
    "pan": ["pan"],
    "pang": ["pɑŋ"],
    "pao": ["paʌ"],
    "pe": ["pø"],
    "pei": ["pei"],
    "pen": ["pœn"],
    "peng": ["pɵŋ"],
    "pi": ["pi"],
    "pian": ["piɛn"],
    "piao": ["piaʌ"],
    "pie": ["pie"],
    "pin": ["pin"],
    "ping": ["pɨŋ"],
    "po": ["po"],
    "pou": ["pou"],
    "pu": ["pu"],
    "qi": ["tɕi"],
    "qia": ["tɕia"],
    "qian": ["tɕiɛn"],
    "qiang": ["tɕiɑŋ"],
    "qiao": ["tɕiaʌ"],
    "qie": ["tɕie"],
    "qin": ["tɕin"],
    "qing": ["tɕɨŋ"],
    "qiong": ["tɕioŋ"],
    "qiu": ["tɕio"],
    "qu": ["tɕy"],
    "quan": ["tɕyɛn"],
    "que": ["tɕye"],
    "qun": ["tɕyn"],
    "ran": ["ʐan"],
    "rang": ["ʐɑŋ"],
    "rao": ["ʐaʌ"],
    "re": ["ʐø"],
    "ren": ["ʐœn"],
    "reng": ["ʐɵŋ"],
    "ri": ["ʐʏ"],
    "rong": ["ʐoŋ"],
    "rou": ["ʐou"],
    "ru": ["ʐu"],
    "rua": ["ʐua"],
    "ruan": ["ʐuan"],
    "rui": ["ʐuei"],
    "run": ["ʐun"],
    "ruo": ["ʐuo"],
    "sa": ["sa"],
    "sai": ["sai"],
    "san": ["san"],
    "sang": ["sɑŋ"],
    "sao": ["saʌ"],
    "se": ["sø"],
    "sen": ["sœn"],
    "seng": ["sɵŋ"],
    "sha": ["ʂa"],
    "shai": ["ʂai"],
    "shan": ["ʂan"],
    "shang": ["ʂɑŋ"],
    "shao": ["ʂaʌ"],
    "she": ["ʂø"],
    "shei": ["ʂei"],
    "shen": ["ʂœn"],
    "sheng": ["ʂɵŋ"],
    "shi": ["ʂʏ"],
    "shou": ["ʂou"],
    "shu": ["ʂu"],
    "shua": ["ʂua"],
    "shuai": ["ʂuai"],
    "shuan": ["ʂuan"],
    "shuang": ["ʂuɑŋ"],
    "shui": ["ʂuei"],
    "shun": ["ʂun"],
    "shuo": ["ʂuo"],
    "si": ["sɪ"],
    "song": ["soŋ"],
    "sou": ["sou"],
    "su": ["su"],
    "suan": ["suan"],
    "sui": ["suei"],
    "sun": ["sun"],
    "suo": ["suo"],
    "ta": ["ta"],
    "tai": ["tai"],
    "tan": ["tan"],
    "tang": ["tɑŋ"],
    "tao": ["taʌ"],
    "te": ["tø"],
    "tei": ["tei"],
    "teng": ["tɵŋ"],
    "ti": ["ti"],
    "tian": ["tiɛn"],
    "tiao": ["tiaʌ"],
    "tie": ["tie"],
    "ting": ["tɨŋ"],
    "tong": ["toŋ"],
    "tou": ["tou"],
    "tu": ["tu"],
    "tuan": ["tuan"],
    "tui": ["tuei"],
    "tun": ["tun"],
    "tuo": ["tuo"],
    "wa": ["wa"],
    "wai": ["wai"],
    "wan": ["wan"],
    "wang": ["wɑŋ"],
    "wei": ["wei"],
    "wen": ["wœn"],
    "weng": ["wɵŋ"],
    "wo": ["wo"],
    "wu": ["wu"],
    "xi": ["ɕi"],
    "xia": ["ɕia"],
    "xian": ["ɕiɛn"],
    "xiang": ["ɕiɑŋ"],
    "xiao": ["ɕiaʌ"],
    "xie": ["ɕie"],
    "xin": ["ɕin"],
    "xing": ["ɕɨŋ"],
    "xiong": ["ɕioŋ"],
    "xiu": ["ɕio"],
    "xu": ["ɕy"],
    "xuan": ["ɕyɛn"],
    "xue": ["ɕye"],
    "xun": ["ɕyn"],
    "ya": ["ia"],
    "yan": ["iɛn"],
    "yang": ["iɑŋ"],
    "yao": ["iaʌ"],
    "ye": ["ie"],
    "yi": ["i"],
    "yin": ["in"],
    "ying": ["ɨŋ"],
    # NOTE(review): "yo" and "you" (below) both map to "io" -- confirm that
    # merging the two syllables is intended.
    "yo": ["io"],
    "yong": ["ioŋ"],
    "you": ["io"],
    "yu": ["y"],
    "yuan": ["yɛn"], 
    "yue": ["ye"],
    "yun": ["yn"],
    "za": ["dza"],
    "zai": ["dzai"],
    "zan": ["dzan"],
    "zang": ["dzɑŋ"],
    "zao": ["dzaʌ"],
    "ze": ["dzø"],
    "zei": ["dzei"],
    "zen": ["dzœn"],
    "zeng": ["dzɵŋ"],
    "zha": ["dʒa"],
    "zhai": ["dʒai"],
    "zhan": ["dʒan"],
    "zhang": ["dʒɑŋ"],
    "zhao": ["dʒaʌ"],
    "zhe": ["dʒø"],
    # "zhei": ["dʒei"], it doesn't exist
    "zhen": ["dʒœn"],
    "zheng": ["dʒɵŋ"],
    "zhi": ["dʒʏ"],
    "zhong": ["dʒoŋ"],
    "zhou": ["dʒou"],
    "zhu": ["dʒu"],
    "zhua": ["dʒua"],
    "zhuai": ["dʒuai"],
    "zhuan": ["dʒuan"],
    "zhuang": ["dʒuɑŋ"],
    "zhui": ["dʒuei"],
    "zhun": ["dʒun"],
    "zhuo": ["dʒuo"],
    "zi": ["dzɪ"],
    "zong": ["dzoŋ"],
    "zou": ["dzou"],
    "zu": ["dzu"],
    "zuan": ["dzuan"],
    "zui": ["dzuei"],
    "zun": ["dzun"],
    "zuo": ["dzuo"],
 }
--- a/TTS/tts/utils/synthesis.py
+++ b/TTS/tts/utils/synthesis.py
@ -219,6 +219,7 @@ def synthesis(model,
            ap (TTS.tts.utils.audio.AudioProcessor): audio processor to process
                model outputs.
            speaker_id (int): id of speaker
            style_wav (str | Dict[str, float]): Uses for style embedding of GST.
            style_wav (str): Uses for style embedding of GST.
            truncated (bool): keep model states after inference. It can be used
                for continuous inference at long texts.
--- a/TTS/tts/utils/text/init.py
+++ b/TTS/tts/utils/text/init.py
@ -8,6 +8,7 @@ from phonemizer.phonemize import phonemize
 from TTS.tts.utils.text import cleaners
 from TTS.tts.utils.text.symbols import (_bos, _eos, _punctuations,
                                        make_symbols, phonemes, symbols)
 from TTS.tts.utils.chinese_mandarin.phonemizer import chinese_text_to_phonemes
 # pylint: disable=unnecessary-comprehension
@ -29,8 +30,23 @@ PHONEME_PUNCTUATION_PATTERN = r'['+_punctuations.replace(' ', '')+']+'
 def text2phone(text, language):
    '''
-    Convert graphemes to phonemes.
+    Convert graphemes to phonemes. For most of the languages, it calls
    the phonemizer python library that calls espeak/espeak-ng. For chinese
    mandarin, it calls pypinyin + custom function for phonemizing
            Parameters:
                    text (str): text to phonemize
                    language (str): language of the text
            Returns:
                    ph (str): phonemes as a string seperated by "|"
                            ph = "ɪ|g|ˈ|z|æ|m|p|ə|l"
    '''
    # TO REVIEW : How to have a good implementation for this?
    if language == "chinese-mandarin":
        ph = chinese_text_to_phonemes(text)
        return ph
    seperator = phonemizer.separator.Separator(' |', '', '|')
    #try:
    punctuations = re.findall(PHONEME_PUNCTUATION_PATTERN, text)
--- a/TTS/tts/utils/text/cleaners.py
+++ b/TTS/tts/utils/text/cleaners.py
@ -15,6 +15,8 @@ from unidecode import unidecode
 from .number_norm import normalize_numbers
 from .abbreviations import abbreviations_en, abbreviations_fr
 from .time import expand_time_english
 from TTS.tts.utils.chinese_mandarin.numbers import replace_numbers_to_characters_in_text
 # Regular expression matching whitespace:
 _whitespace_re = re.compile(r'\s+')
@ -122,6 +124,13 @@ def portuguese_cleaners(text):
    text = collapse_whitespace(text)
    return text
 def chinese_mandarin_cleaners(text: str) -> str:
    '''Cleaning pipeline for Chinese Mandarin text.

    The only step is number expansion through
    `replace_numbers_to_characters_in_text`; no other normalization is
    applied.
    '''
    return replace_numbers_to_characters_in_text(text)
 def phoneme_cleaners(text):
    '''Pipeline for phonemes mode, including number and abbreviation expansion.'''
    text = expand_numbers(text)
--- a/TTS/utils/synthesizer.py
+++ b/TTS/utils/synthesizer.py
@ -122,6 +122,13 @@ class Synthesizer(object):
        speaker_embedding = self.init_speaker(speaker_idx)
        use_gl = self.vocoder_model is None
        # Pick the GST style input from the model config. It stays None when
        # GST is disabled or when no style input is configured ("" or {}).
        gst_style_input = None
        if self.tts_config.use_gst:
            if self.tts_config.gst["gst_style_input"] not in ["", {}]:
                # Bind the configured style to `gst_style_input`, the name
                # handed to synthesis() below. The previous code assigned it
                # to `style_wav`, which left `gst_style_input` always None.
                gst_style_input = self.tts_config.gst["gst_style_input"]
        for sen in sens:
            # synthesize voice
            waveform, _, _, mel_postnet_spec, _, _ = synthesis(
@ -131,7 +138,7 @@ class Synthesizer(object):
                self.use_cuda,
                self.ap,
                speaker_idx,
-                None,
+                gst_style_input,
                False,
                self.tts_config.enable_eos_bos_chars,
                use_gl,
--- a/notebooks/Chinese_Mandarin_DDC_GST_Tacotron2_TTS_and_MultiBand_MelGAN_Example.ipynb
+++ b/notebooks/Chinese_Mandarin_DDC_GST_Tacotron2_TTS_and_MultiBand_MelGAN_Example.ipynb