diff --git a/TTS/.models.json b/TTS/.models.json
index 05997461..0fb187a4 100644
--- a/TTS/.models.json
+++ b/TTS/.models.json
@@ -75,6 +75,16 @@
"contact":"erengolge@gmail.com"
}
}
+ },
+ "zh":{
+ "baker":{
+ "tacotron2-DDC-GST":{
+ "model_file": "1RR9rZdV_FMm8yvtCHALtUbJf1nxbUiAw",
+ "config_file": "1daY1JHGXEozJ-MGYLiWEUmzEwEvM5xpz",
+ "stats_file": "1vl9c-D3dW_E7pdhNpDFQLX-giJc0jOtV",
+ "commit": ""
+ }
+ }
}
},
"vocoder_models":{
diff --git a/TTS/tts/datasets/preprocess.py b/TTS/tts/datasets/preprocess.py
index bed76c86..26c17174 100644
--- a/TTS/tts/datasets/preprocess.py
+++ b/TTS/tts/datasets/preprocess.py
@@ -3,6 +3,7 @@ from glob import glob
import re
import sys
from pathlib import Path
+from typing import List
from tqdm import tqdm
@@ -368,3 +369,24 @@ def _voxcel_x(root_path, meta_file, voxcel_idx):
with open(str(cache_to), 'r') as f:
return [x.strip().split('|') for x in f.readlines()]
+
+
+
+def baker(root_path: str, meta_file: str) -> List[List[str]]:
+ """Normalizes the Baker meta data file to TTS format
+
+ Args:
+ root_path (str): path to the baker dataset
+        meta_file (str): name of the meta file listing the wav file names to select and their transcriptions
+ Returns:
+        List[List[str]]: List of [text, wav_path, speaker_name] associated with each sentence
+ """
+ txt_file = os.path.join(root_path, meta_file)
+ items = []
+ speaker_name = "baker"
+ with open(txt_file, 'r') as ttf:
+ for line in ttf:
+ wav_name, text = line.rstrip('\n').split("|")
+ wav_path = os.path.join(root_path, "clips_22", wav_name)
+ items.append([text, wav_path, speaker_name])
+ return items
diff --git a/TTS/tts/utils/chinese_mandarin/__init__.py b/TTS/tts/utils/chinese_mandarin/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/TTS/tts/utils/chinese_mandarin/numbers.py b/TTS/tts/utils/chinese_mandarin/numbers.py
new file mode 100644
index 00000000..0befe6b1
--- /dev/null
+++ b/TTS/tts/utils/chinese_mandarin/numbers.py
@@ -0,0 +1,131 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+
+# Licensed under WTFPL or the Unlicense or CC0.
+# This uses Python 3, but it's easy to port to Python 2 by changing
+# strings to u'xx'.
+
+import re
+import itertools
+
+
+def _num2chinese(num: str, big=False, simp=True, o=False, twoalt=False) -> str:
+ """Convert numerical arabic numbers (0->9) to chinese hanzi numbers (〇 -> 九)
+
+ Args:
+ num (str): arabic number to convert
+ big (bool, optional): use financial characters. Defaults to False.
+        simp (bool, optional): use simplified characters instead of traditional characters. Defaults to True.
+ o (bool, optional): use 〇 for 'zero'. Defaults to False.
+ twoalt (bool, optional): use 两/兩 for 'two' when appropriate. Defaults to False.
+
+ Raises:
+ ValueError: if number is more than 1e48
+        ValueError: if the number contains an 'e' exponent (scientific notation)
+
+ Returns:
+ str: converted number as hanzi characters
+ """
+
+ # check num first
+ nd = str(num)
+ if abs(float(nd)) >= 1e48:
+ raise ValueError('number out of range')
+ elif 'e' in nd:
+ raise ValueError('scientific notation is not supported')
+ c_symbol = '正负点' if simp else '正負點'
+ if o: # formal
+ twoalt = False
+ if big:
+ c_basic = '零壹贰叁肆伍陆柒捌玖' if simp else '零壹貳參肆伍陸柒捌玖'
+ c_unit1 = '拾佰仟'
+ c_twoalt = '贰' if simp else '貳'
+ else:
+ c_basic = '〇一二三四五六七八九' if o else '零一二三四五六七八九'
+ c_unit1 = '十百千'
+ if twoalt:
+ c_twoalt = '两' if simp else '兩'
+ else:
+ c_twoalt = '二'
+ c_unit2 = '万亿兆京垓秭穰沟涧正载' if simp else '萬億兆京垓秭穰溝澗正載'
+ revuniq = lambda l: ''.join(k for k, g in itertools.groupby(reversed(l)))
+ nd = str(num)
+ result = []
+ if nd[0] == '+':
+ result.append(c_symbol[0])
+ elif nd[0] == '-':
+ result.append(c_symbol[1])
+ if '.' in nd:
+ integer, remainder = nd.lstrip('+-').split('.')
+ else:
+ integer, remainder = nd.lstrip('+-'), None
+ if int(integer):
+ splitted = [integer[max(i - 4, 0):i]
+ for i in range(len(integer), 0, -4)]
+ intresult = []
+ for nu, unit in enumerate(splitted):
+ # special cases
+ if int(unit) == 0: # 0000
+ intresult.append(c_basic[0])
+ continue
+ elif nu > 0 and int(unit) == 2: # 0002
+ intresult.append(c_twoalt + c_unit2[nu - 1])
+ continue
+ ulist = []
+ unit = unit.zfill(4)
+ for nc, ch in enumerate(reversed(unit)):
+ if ch == '0':
+ if ulist: # ???0
+ ulist.append(c_basic[0])
+ elif nc == 0:
+ ulist.append(c_basic[int(ch)])
+ elif nc == 1 and ch == '1' and unit[1] == '0':
+ # special case for tens
+ # edit the 'elif' if you don't like
+ # 十四, 三千零十四, 三千三百一十四
+ ulist.append(c_unit1[0])
+ elif nc > 1 and ch == '2':
+ ulist.append(c_twoalt + c_unit1[nc - 1])
+ else:
+ ulist.append(c_basic[int(ch)] + c_unit1[nc - 1])
+ ustr = revuniq(ulist)
+ if nu == 0:
+ intresult.append(ustr)
+ else:
+ intresult.append(ustr + c_unit2[nu - 1])
+ result.append(revuniq(intresult).strip(c_basic[0]))
+ else:
+ result.append(c_basic[0])
+ if remainder:
+ result.append(c_symbol[2])
+ result.append(''.join(c_basic[int(ch)] for ch in remainder))
+ return ''.join(result)
+
+
+
+
+def _number_replace(match) -> str:
+ """function to apply in a match, transform all numbers in a match by chinese characters
+
+ Args:
+ match (re.Match): numbers regex matches
+
+ Returns:
+ str: replaced characters for the numbers
+ """
+ match_str: str = match.group()
+ return _num2chinese(match_str)
+
+
+def replace_numbers_to_characters_in_text(text: str) -> str:
+ """Replace all arabic numbers in a text by their equivalent in chinese characters (simplified)
+
+ Args:
+ text (str): input text to transform
+
+ Returns:
+ str: output text
+ """
+ text = re.sub(r'[0-9]+', _number_replace, text)
+ return text
\ No newline at end of file
diff --git a/TTS/tts/utils/chinese_mandarin/phonemizer.py b/TTS/tts/utils/chinese_mandarin/phonemizer.py
new file mode 100644
index 00000000..7742c491
--- /dev/null
+++ b/TTS/tts/utils/chinese_mandarin/phonemizer.py
@@ -0,0 +1,41 @@
+from typing import List
+
+import pypinyin
+
+from .pinyinToPhonemes import PINYIN_DICT
+
+
+import jieba
+
+
+def _chinese_character_to_pinyin(text: str) -> List[str]:
+ pinyins = pypinyin.pinyin(
+ text, style=pypinyin.Style.TONE3, heteronym=False, neutral_tone_with_five=True
+ )
+ pinyins_flat_list = [item for sublist in pinyins for item in sublist]
+ return pinyins_flat_list
+
+
+def _chinese_pinyin_to_phoneme(pinyin: str) -> str:
+ segment = pinyin[:-1]
+ tone = pinyin[-1]
+ phoneme = PINYIN_DICT.get(segment, [""])[0]
+ return phoneme + tone
+
+
+def chinese_text_to_phonemes(text: str) -> str:
+ tokenized_text = jieba.cut(text, HMM=False)
+ tokenized_text = " ".join(tokenized_text)
+ pinyined_text: List[str] = _chinese_character_to_pinyin(tokenized_text)
+
+ results: List[str] = []
+
+ for token in pinyined_text:
+ if token[-1] in "12345": # TODO transform to is_pinyin()
+ pinyin_phonemes = _chinese_pinyin_to_phoneme(token)
+
+ results += list(pinyin_phonemes)
+        else:  # is punctuation or other
+ results += list(token)
+
+ return "|".join(results)
diff --git a/TTS/tts/utils/chinese_mandarin/pinyinToPhonemes.py b/TTS/tts/utils/chinese_mandarin/pinyinToPhonemes.py
new file mode 100644
index 00000000..cdca44ac
--- /dev/null
+++ b/TTS/tts/utils/chinese_mandarin/pinyinToPhonemes.py
@@ -0,0 +1,420 @@
+
+PINYIN_DICT = {
+ "a": ["a"],
+ "ai": ["ai"],
+ "an": ["an"],
+ "ang": ["ɑŋ"],
+ "ao": ["aʌ"],
+ "ba": ["ba"],
+ "bai": ["bai"],
+ "ban": ["ban"],
+ "bang": ["bɑŋ"],
+ "bao": ["baʌ"],
+ # "be": ["be"], doesnt exist
+ "bei": ["bɛi"],
+ "ben": ["bœn"],
+ "beng": ["bɵŋ"],
+ "bi": ["bi"],
+ "bian": ["biɛn"],
+ "biao": ["biaʌ"],
+ "bie": ["bie"],
+ "bin": ["bin"],
+ "bing": ["bɨŋ"],
+ "bo": ["bo"],
+ "bu": ["bu"],
+ "ca": ["tsa"],
+ "cai": ["tsai"],
+ "can": ["tsan"],
+ "cang": ["tsɑŋ"],
+ "cao": ["tsaʌ"],
+ "ce": ["tsø"],
+ "cen": ["tsœn"],
+ "ceng": ["tsɵŋ"],
+ "cha": ["ʈʂa"],
+ "chai": ["ʈʂai"],
+ "chan": ["ʈʂan"],
+ "chang": ["ʈʂɑŋ"],
+ "chao": ["ʈʂaʌ"],
+ "che": ["ʈʂø"],
+ "chen": ["ʈʂœn"],
+ "cheng": ["ʈʂɵŋ"],
+ "chi": ["ʈʂʏ"],
+ "chong": ["ʈʂoŋ"],
+ "chou": ["ʈʂou"],
+ "chu": ["ʈʂu"],
+ "chua": ["ʈʂua"],
+ "chuai": ["ʈʂuai"],
+ "chuan": ["ʈʂuan"],
+ "chuang": ["ʈʂuɑŋ"],
+ "chui": ["ʈʂuei"],
+ "chun": ["ʈʂun"],
+ "chuo": ["ʈʂuo"],
+ "ci": ["tsɪ"],
+ "cong": ["tsoŋ"],
+ "cou": ["tsou"],
+ "cu": ["tsu"],
+ "cuan": ["tsuan"],
+ "cui": ["tsuei"],
+ "cun": ["tsun"],
+ "cuo": ["tsuo"],
+ "da": ["da"],
+ "dai": ["dai"],
+ "dan": ["dan"],
+ "dang": ["dɑŋ"],
+ "dao": ["daʌ"],
+ "de": ["dø"],
+ "dei": ["dei"],
+ # "den": ["dœn"],
+ "deng": ["dɵŋ"],
+ "di": ["di"],
+ "dia": ["dia"],
+ "dian": ["diɛn"],
+ "diao": ["diaʌ"],
+ "die": ["die"],
+ "ding": ["dɨŋ"],
+ "diu": ["dio"],
+ "dong": ["doŋ"],
+ "dou": ["dou"],
+ "du": ["du"],
+ "duan": ["duan"],
+ "dui": ["duei"],
+ "dun": ["dun"],
+ "duo": ["duo"],
+ "e": ["ø"],
+ "ei": ["ei"],
+ "en": ["œn"],
+ # "ng": ["œn"],
+ # "eng": ["ɵŋ"],
+ "er": ["er"],
+ "fa": ["fa"],
+ "fan": ["fan"],
+ "fang": ["fɑŋ"],
+ "fei": ["fei"],
+ "fen": ["fœn"],
+ "feng": ["fɵŋ"],
+ "fo": ["fo"],
+ "fou": ["fou"],
+ "fu": ["fu"],
+ "ga": ["ga"],
+ "gai": ["gai"],
+ "gan": ["gan"],
+ "gang": ["gɑŋ"],
+ "gao": ["gaʌ"],
+ "ge": ["gø"],
+ "gei": ["gei"],
+ "gen": ["gœn"],
+ "geng": ["gɵŋ"],
+ "gong": ["goŋ"],
+ "gou": ["gou"],
+ "gu": ["gu"],
+ "gua": ["gua"],
+ "guai": ["guai"],
+ "guan": ["guan"],
+ "guang": ["guɑŋ"],
+ "gui": ["guei"],
+ "gun": ["gun"],
+ "guo": ["guo"],
+ "ha": ["xa"],
+ "hai": ["xai"],
+ "han": ["xan"],
+ "hang": ["xɑŋ"],
+ "hao": ["xaʌ"],
+ "he": ["xø"],
+ "hei": ["xei"],
+ "hen": ["xœn"],
+ "heng": ["xɵŋ"],
+ "hong": ["xoŋ"],
+ "hou": ["xou"],
+ "hu": ["xu"],
+ "hua": ["xua"],
+ "huai": ["xuai"],
+ "huan": ["xuan"],
+ "huang": ["xuɑŋ"],
+ "hui": ["xuei"],
+ "hun": ["xun"],
+ "huo": ["xuo"],
+ "ji": ["dʑi"],
+ "jia": ["dʑia"],
+ "jian": ["dʑiɛn"],
+ "jiang": ["dʑiɑŋ"],
+ "jiao": ["dʑiaʌ"],
+ "jie": ["dʑie"],
+ "jin": ["dʑin"],
+ "jing": ["dʑɨŋ"],
+ "jiong": ["dʑioŋ"],
+ "jiu": ["dʑio"],
+ "ju": ["dʑy"],
+ "juan": ["dʑyɛn"],
+ "jue": ["dʑye"],
+ "jun": ["dʑyn"],
+ "ka": ["ka"],
+ "kai": ["kai"],
+ "kan": ["kan"],
+ "kang": ["kɑŋ"],
+ "kao": ["kaʌ"],
+ "ke": ["kø"],
+ "kei": ["kei"],
+ "ken": ["kœn"],
+ "keng": ["kɵŋ"],
+ "kong": ["koŋ"],
+ "kou": ["kou"],
+ "ku": ["ku"],
+ "kua": ["kua"],
+ "kuai": ["kuai"],
+ "kuan": ["kuan"],
+ "kuang": ["kuɑŋ"],
+ "kui": ["kuei"],
+ "kun": ["kun"],
+ "kuo": ["kuo"],
+ "la": ["la"],
+ "lai": ["lai"],
+ "lan": ["lan"],
+ "lang": ["lɑŋ"],
+ "lao": ["laʌ"],
+ "le": ["lø"],
+ "lei": ["lei"],
+ "leng": ["lɵŋ"],
+ "li": ["li"],
+ "lia": ["lia"],
+ "lian": ["liɛn"],
+ "liang": ["liɑŋ"],
+ "liao": ["liaʌ"],
+ "lie": ["lie"],
+ "lin": ["lin"],
+ "ling": ["lɨŋ"],
+ "liu": ["lio"],
+ "lo": ["lo"],
+ "long": ["loŋ"],
+ "lou": ["lou"],
+ "lu": ["lu"],
+ "lv": ["ly"],
+ "luan": ["luan"],
+ "lve": ["lye"],
+ "lue": ["lue"],
+ "lun": ["lun"],
+ "luo": ["luo"],
+ "ma": ["ma"],
+ "mai": ["mai"],
+ "man": ["man"],
+ "mang": ["mɑŋ"],
+ "mao": ["maʌ"],
+ "me": ["mø"],
+ "mei": ["mei"],
+ "men": ["mœn"],
+ "meng": ["mɵŋ"],
+ "mi": ["mi"],
+ "mian": ["miɛn"],
+ "miao": ["miaʌ"],
+ "mie": ["mie"],
+ "min": ["min"],
+ "ming": ["mɨŋ"],
+ "miu": ["mio"],
+ "mo": ["mo"],
+ "mou": ["mou"],
+ "mu": ["mu"],
+ "na": ["na"],
+ "nai": ["nai"],
+ "nan": ["nan"],
+ "nang": ["nɑŋ"],
+ "nao": ["naʌ"],
+ "ne": ["nø"],
+ "nei": ["nei"],
+ "nen": ["nœn"],
+ "neng": ["nɵŋ"],
+ "ni": ["ni"],
+ "nia": ["nia"],
+ "nian": ["niɛn"],
+ "niang": ["niɑŋ"],
+ "niao": ["niaʌ"],
+ "nie": ["nie"],
+ "nin": ["nin"],
+ "ning": ["nɨŋ"],
+ "niu": ["nio"],
+ "nong": ["noŋ"],
+ "nou": ["nou"],
+ "nu": ["nu"],
+ "nv": ["ny"],
+ "nuan": ["nuan"],
+ "nve": ["nye"],
+ "nue": ["nye"],
+ "nuo": ["nuo"],
+ "o": ["o"],
+ "ou": ["ou"],
+ "pa": ["pa"],
+ "pai": ["pai"],
+ "pan": ["pan"],
+ "pang": ["pɑŋ"],
+ "pao": ["paʌ"],
+ "pe": ["pø"],
+ "pei": ["pei"],
+ "pen": ["pœn"],
+ "peng": ["pɵŋ"],
+ "pi": ["pi"],
+ "pian": ["piɛn"],
+ "piao": ["piaʌ"],
+ "pie": ["pie"],
+ "pin": ["pin"],
+ "ping": ["pɨŋ"],
+ "po": ["po"],
+ "pou": ["pou"],
+ "pu": ["pu"],
+ "qi": ["tɕi"],
+ "qia": ["tɕia"],
+ "qian": ["tɕiɛn"],
+ "qiang": ["tɕiɑŋ"],
+ "qiao": ["tɕiaʌ"],
+ "qie": ["tɕie"],
+ "qin": ["tɕin"],
+ "qing": ["tɕɨŋ"],
+ "qiong": ["tɕioŋ"],
+ "qiu": ["tɕio"],
+ "qu": ["tɕy"],
+ "quan": ["tɕyɛn"],
+ "que": ["tɕye"],
+ "qun": ["tɕyn"],
+ "ran": ["ʐan"],
+ "rang": ["ʐɑŋ"],
+ "rao": ["ʐaʌ"],
+ "re": ["ʐø"],
+ "ren": ["ʐœn"],
+ "reng": ["ʐɵŋ"],
+ "ri": ["ʐʏ"],
+ "rong": ["ʐoŋ"],
+ "rou": ["ʐou"],
+ "ru": ["ʐu"],
+ "rua": ["ʐua"],
+ "ruan": ["ʐuan"],
+ "rui": ["ʐuei"],
+ "run": ["ʐun"],
+ "ruo": ["ʐuo"],
+ "sa": ["sa"],
+ "sai": ["sai"],
+ "san": ["san"],
+ "sang": ["sɑŋ"],
+ "sao": ["saʌ"],
+ "se": ["sø"],
+ "sen": ["sœn"],
+ "seng": ["sɵŋ"],
+ "sha": ["ʂa"],
+ "shai": ["ʂai"],
+ "shan": ["ʂan"],
+ "shang": ["ʂɑŋ"],
+ "shao": ["ʂaʌ"],
+ "she": ["ʂø"],
+ "shei": ["ʂei"],
+ "shen": ["ʂœn"],
+ "sheng": ["ʂɵŋ"],
+ "shi": ["ʂʏ"],
+ "shou": ["ʂou"],
+ "shu": ["ʂu"],
+ "shua": ["ʂua"],
+ "shuai": ["ʂuai"],
+ "shuan": ["ʂuan"],
+ "shuang": ["ʂuɑŋ"],
+ "shui": ["ʂuei"],
+ "shun": ["ʂun"],
+ "shuo": ["ʂuo"],
+ "si": ["sɪ"],
+ "song": ["soŋ"],
+ "sou": ["sou"],
+ "su": ["su"],
+ "suan": ["suan"],
+ "sui": ["suei"],
+ "sun": ["sun"],
+ "suo": ["suo"],
+ "ta": ["ta"],
+ "tai": ["tai"],
+ "tan": ["tan"],
+ "tang": ["tɑŋ"],
+ "tao": ["taʌ"],
+ "te": ["tø"],
+ "tei": ["tei"],
+ "teng": ["tɵŋ"],
+ "ti": ["ti"],
+ "tian": ["tiɛn"],
+ "tiao": ["tiaʌ"],
+ "tie": ["tie"],
+ "ting": ["tɨŋ"],
+ "tong": ["toŋ"],
+ "tou": ["tou"],
+ "tu": ["tu"],
+ "tuan": ["tuan"],
+ "tui": ["tuei"],
+ "tun": ["tun"],
+ "tuo": ["tuo"],
+ "wa": ["wa"],
+ "wai": ["wai"],
+ "wan": ["wan"],
+ "wang": ["wɑŋ"],
+ "wei": ["wei"],
+ "wen": ["wœn"],
+ "weng": ["wɵŋ"],
+ "wo": ["wo"],
+ "wu": ["wu"],
+ "xi": ["ɕi"],
+ "xia": ["ɕia"],
+ "xian": ["ɕiɛn"],
+ "xiang": ["ɕiɑŋ"],
+ "xiao": ["ɕiaʌ"],
+ "xie": ["ɕie"],
+ "xin": ["ɕin"],
+ "xing": ["ɕɨŋ"],
+ "xiong": ["ɕioŋ"],
+ "xiu": ["ɕio"],
+ "xu": ["ɕy"],
+ "xuan": ["ɕyɛn"],
+ "xue": ["ɕye"],
+ "xun": ["ɕyn"],
+ "ya": ["ia"],
+ "yan": ["iɛn"],
+ "yang": ["iɑŋ"],
+ "yao": ["iaʌ"],
+ "ye": ["ie"],
+ "yi": ["i"],
+ "yin": ["in"],
+ "ying": ["ɨŋ"],
+ "yo": ["io"],
+ "yong": ["ioŋ"],
+ "you": ["io"],
+ "yu": ["y"],
+ "yuan": ["yɛn"],
+ "yue": ["ye"],
+ "yun": ["yn"],
+ "za": ["dza"],
+ "zai": ["dzai"],
+ "zan": ["dzan"],
+ "zang": ["dzɑŋ"],
+ "zao": ["dzaʌ"],
+ "ze": ["dzø"],
+ "zei": ["dzei"],
+ "zen": ["dzœn"],
+ "zeng": ["dzɵŋ"],
+ "zha": ["dʒa"],
+ "zhai": ["dʒai"],
+ "zhan": ["dʒan"],
+ "zhang": ["dʒɑŋ"],
+ "zhao": ["dʒaʌ"],
+ "zhe": ["dʒø"],
+ # "zhei": ["dʒei"], it doesn't exist
+ "zhen": ["dʒœn"],
+ "zheng": ["dʒɵŋ"],
+ "zhi": ["dʒʏ"],
+ "zhong": ["dʒoŋ"],
+ "zhou": ["dʒou"],
+ "zhu": ["dʒu"],
+ "zhua": ["dʒua"],
+ "zhuai": ["dʒuai"],
+ "zhuan": ["dʒuan"],
+ "zhuang": ["dʒuɑŋ"],
+ "zhui": ["dʒuei"],
+ "zhun": ["dʒun"],
+ "zhuo": ["dʒuo"],
+ "zi": ["dzɪ"],
+ "zong": ["dzoŋ"],
+ "zou": ["dzou"],
+ "zu": ["dzu"],
+ "zuan": ["dzuan"],
+ "zui": ["dzuei"],
+ "zun": ["dzun"],
+ "zuo": ["dzuo"],
+}
\ No newline at end of file
diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py
index be587211..adbd0d20 100644
--- a/TTS/tts/utils/synthesis.py
+++ b/TTS/tts/utils/synthesis.py
@@ -219,7 +219,7 @@ def synthesis(model,
ap (TTS.tts.utils.audio.AudioProcessor): audio processor to process
model outputs.
speaker_id (int): id of speaker
- style_wav (str): Uses for style embedding of GST.
+ style_wav (str | Dict[str, float]): Uses for style embedding of GST.
truncated (bool): keep model states after inference. It can be used
for continuous inference at long texts.
enable_eos_bos_chars (bool): enable special chars for end of sentence and start of sentence.
diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py
index 9771e691..4f4a38ea 100644
--- a/TTS/tts/utils/text/__init__.py
+++ b/TTS/tts/utils/text/__init__.py
@@ -8,6 +8,7 @@ from phonemizer.phonemize import phonemize
from TTS.tts.utils.text import cleaners
from TTS.tts.utils.text.symbols import (_bos, _eos, _punctuations,
make_symbols, phonemes, symbols)
+from TTS.tts.utils.chinese_mandarin.phonemizer import chinese_text_to_phonemes
# pylint: disable=unnecessary-comprehension
@@ -28,9 +29,23 @@ PHONEME_PUNCTUATION_PATTERN = r'['+_punctuations.replace(' ', '')+']+'
def text2phone(text, language):
+    '''Convert graphemes to phonemes. For most languages it calls
+ the phonemizer python library that calls espeak/espeak-ng. For chinese
+ mandarin, it calls pypinyin + custom function for phonemizing
+ Parameters:
+ text (str): text to phonemize
+ language (str): language of the text
+ Returns:
+        ph (str): phonemes as a string separated by "|"
+ ph = "ɪ|g|ˈ|z|æ|m|p|ə|l"
'''
- Convert graphemes to phonemes.
- '''
+
+    # TODO: generalize this into a per-language phonemizer dispatch
+ if language == "chinese-mandarin":
+ ph = chinese_text_to_phonemes(text)
+ return ph
+
+
seperator = phonemizer.separator.Separator(' |', '', '|')
#try:
punctuations = re.findall(PHONEME_PUNCTUATION_PATTERN, text)
diff --git a/TTS/tts/utils/text/cleaners.py b/TTS/tts/utils/text/cleaners.py
index 7c3f1017..49a25557 100644
--- a/TTS/tts/utils/text/cleaners.py
+++ b/TTS/tts/utils/text/cleaners.py
@@ -15,6 +15,8 @@ from unidecode import unidecode
from .number_norm import normalize_numbers
from .abbreviations import abbreviations_en, abbreviations_fr
from .time import expand_time_english
+from TTS.tts.utils.chinese_mandarin.numbers import replace_numbers_to_characters_in_text
+
# Regular expression matching whitespace:
_whitespace_re = re.compile(r'\s+')
@@ -122,6 +124,13 @@ def portuguese_cleaners(text):
text = collapse_whitespace(text)
return text
+def chinese_mandarin_cleaners(text: str) -> str:
+    '''Basic pipeline for Chinese Mandarin: replaces Arabic numerals with hanzi characters.'''
+ text = replace_numbers_to_characters_in_text(text)
+ return text
+
+
+
def phoneme_cleaners(text):
'''Pipeline for phonemes mode, including number and abbreviation expansion.'''
text = expand_numbers(text)
diff --git a/notebooks/Chinese_Mandarin_DDC_GST_Tacotron2_TTS_and_MultiBand_MelGAN_Example.ipynb b/notebooks/Chinese_Mandarin_DDC_GST_Tacotron2_TTS_and_MultiBand_MelGAN_Example.ipynb
new file mode 100644
index 00000000..709dbb8d
--- /dev/null
+++ b/notebooks/Chinese_Mandarin_DDC_GST_Tacotron2_TTS_and_MultiBand_MelGAN_Example.ipynb
@@ -0,0 +1,529 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "6LWsNd3_M3MP"
+ },
+ "source": [
+ "# Mozilla TTS on CPU Real-Time Speech Synthesis "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "FAqrSIWgLyP0"
+ },
+ "source": [
+ "We use Tacotron2 and MultiBand-Melgan models and LJSpeech dataset.\n",
+ "\n",
+ "Tacotron2 is trained using [Double Decoder Consistency](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/) (DDC) only for 130K steps (3 days) with a single GPU.\n",
+ "\n",
+ "MultiBand-Melgan is trained 1.45M steps with real spectrograms.\n",
+ "\n",
+ "Note that both model performances can be improved with more training."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "Ku-dA4DKoeXk"
+ },
+ "source": [
+ "### Download Models"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 162
+ },
+ "colab_type": "code",
+ "id": "jGIgnWhGsxU1",
+ "outputId": "88725e41-a8dc-4885-b3bf-cac939f38abe",
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "mkdir: cannot create directory 'data/': File exists\n",
+ "Downloading...\n",
+ "From: https://drive.google.com/uc?id=1vl9c-D3dW_E7pdhNpDFQLX-giJc0jOtV\n",
+ "To: /root/projects/speech/mozilla-TTS_dev/notebooks/data/tts_scale_stats.npy\n",
+ "100%|██████████████████████████████████████| 10.5k/10.5k [00:00<00:00, 18.1MB/s]\n"
+ ]
+ }
+ ],
+ "source": [
+ "! mkdir data/\n",
+ "! gdown --id 1RR9rZdV_FMm8yvtCHALtUbJf1nxbUiAw -O data/tts_model.pth.tar\n",
+ "! gdown --id 1daY1JHGXEozJ-MGYLiWEUmzEwEvM5xpz -O data/tts_config.json\n",
+ "! gdown --id 1vl9c-D3dW_E7pdhNpDFQLX-giJc0jOtV -O data/tts_scale_stats.npy"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 61,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 235
+ },
+ "colab_type": "code",
+ "id": "4dnpE0-kvTsu",
+ "outputId": "76377c6d-789c-4995-ba00-a21a6e1c401e",
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Downloading...\n",
+ "From: https://drive.google.com/uc?id=11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU\n",
+ "To: /root/projects/speech/mozilla-TTS_dev/notebooks/data/vocoder_scale_stats.npy\n",
+ "100%|██████████████████████████████████████| 10.5k/10.5k [00:00<00:00, 16.7MB/s]\n"
+ ]
+ }
+ ],
+ "source": [
+ "! gdown --id 1Ty5DZdOc0F7OTGj9oJThYbL5iVu_2G0K -O data/vocoder_model.pth.tar\n",
+ "! gdown --id 1Rd0R_nRCrbjEdpOwq6XwZAktvugiBvmu -O data/vocoder_config.json\n",
+ "! gdown --id 11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU -O data/vocoder_scale_stats.npy"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "Zlgi8fPdpRF0"
+ },
+ "source": [
+ "### Define TTS function"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 62,
+ "metadata": {
+ "colab": {},
+ "colab_type": "code",
+ "id": "f-Yc42nQZG5A"
+ },
+ "outputs": [],
+ "source": [
+ "def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True, style_wav=None):\n",
+ " t_1 = time.time()\n",
+ " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=style_wav,\n",
+ " truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars)\n",
+ " # mel_postnet_spec = ap.denormalize(mel_postnet_spec.T)\n",
+ " if not use_gl:\n",
+ " waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n",
+ " waveform = waveform.flatten()\n",
+ " if use_cuda:\n",
+ " waveform = waveform.cpu()\n",
+ " waveform = waveform.numpy()\n",
+ " rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n",
+ " tps = (time.time() - t_1) / len(waveform)\n",
+ " print(waveform.shape)\n",
+ " print(\" > Run-time: {}\".format(time.time() - t_1))\n",
+ " print(\" > Real-time factor: {}\".format(rtf))\n",
+ " print(\" > Time per step: {}\".format(tps))\n",
+ " IPython.display.display(IPython.display.Audio(waveform, rate=CONFIG.audio['sample_rate'])) \n",
+ " return alignment, mel_postnet_spec, stop_tokens, waveform"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "ZksegYQepkFg"
+ },
+ "source": [
+ "### Load Models"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 64,
+ "metadata": {
+ "colab": {},
+ "colab_type": "code",
+ "id": "oVa0kOamprgj"
+ },
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import torch\n",
+ "import time\n",
+ "import IPython\n",
+ "\n",
+ "from TTS.tts.utils.generic_utils import setup_model\n",
+ "from TTS.utils.io import load_config\n",
+ "from TTS.tts.utils.text.symbols import symbols, phonemes, make_symbols\n",
+ "from TTS.utils.audio import AudioProcessor\n",
+ "from TTS.tts.utils.synthesis import synthesis"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {
+ "colab": {},
+ "colab_type": "code",
+ "id": "EY-sHVO8IFSH"
+ },
+ "outputs": [],
+ "source": [
+ "# runtime settings\n",
+ "use_cuda = False"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 65,
+ "metadata": {
+ "colab": {},
+ "colab_type": "code",
+ "id": "_1aIUp2FpxOQ"
+ },
+ "outputs": [],
+ "source": [
+ "# model paths\n",
+ "TTS_MODEL = \"/tank/models/tts/mozilla-TTS/tacotron2-DCC/chinese_mandarin/mandarin_dca_attn_gst_dcc-February-12-2021_03+13PM-5dbb48d/checkpoint_17000.pth.tar\"\n",
+ "TTS_CONFIG = \"/tank/models/tts/mozilla-TTS/tacotron2-DCC/chinese_mandarin/mandarin_dca_attn_gst_dcc-February-12-2021_03+13PM-5dbb48d/config.json\"\n",
+ "\n",
+ "TTS_MODEL = \"data/tts_model.pth.tar\"\n",
+ "TTS_CONFIG = \"data/tts_config.json\"\n",
+ "\n",
+ "VOCODER_MODEL = \"/root/.local/share/tts/vocoder_models--en--ljspeech--mulitband-melgan/model_file.pth.tar\"\n",
+ "VOCODER_CONFIG = \"/root/.local/share/tts/vocoder_models--en--ljspeech--mulitband-melgan/config.json\"\n",
+ "\n",
+ "VOCODER_MODEL = \"data/vocoder_model.pth.tar\"\n",
+ "VOCODER_CONFIG = \"data/vocoder_config.json\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 66,
+ "metadata": {
+ "colab": {},
+ "colab_type": "code",
+ "id": "CpgmdBVQplbv"
+ },
+ "outputs": [],
+ "source": [
+ "# load configs\n",
+ "TTS_CONFIG = load_config(TTS_CONFIG)\n",
+ "VOCODER_CONFIG = load_config(VOCODER_CONFIG)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 67,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 471
+ },
+ "colab_type": "code",
+ "id": "zmrQxiozIUVE",
+ "outputId": "60c4daa0-4c5b-4a2e-fe0d-be437d003a49",
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " > Setting up Audio Processor...\n",
+ " | > sample_rate:22050\n",
+ " | > resample:False\n",
+ " | > num_mels:80\n",
+ " | > min_level_db:-100\n",
+ " | > frame_shift_ms:None\n",
+ " | > frame_length_ms:None\n",
+ " | > ref_level_db:0\n",
+ " | > fft_size:1024\n",
+ " | > power:1.5\n",
+ " | > preemphasis:0.0\n",
+ " | > griffin_lim_iters:60\n",
+ " | > signal_norm:True\n",
+ " | > symmetric_norm:True\n",
+ " | > mel_fmin:50.0\n",
+ " | > mel_fmax:7600.0\n",
+ " | > spec_gain:1.0\n",
+ " | > stft_pad_mode:reflect\n",
+ " | > max_norm:4.0\n",
+ " | > clip_norm:True\n",
+ " | > do_trim_silence:True\n",
+ " | > trim_db:60\n",
+ " | > do_sound_norm:False\n",
+ " | > stats_path:data/tts_scale_stats.npy\n",
+ " | > hop_length:256\n",
+ " | > win_length:1024\n"
+ ]
+ }
+ ],
+ "source": [
+ "# load the audio processor\n",
+ "TTS_CONFIG.audio['stats_path'] = 'data/tts_scale_stats.npy'\n",
+ "ap = AudioProcessor(**TTS_CONFIG.audio) "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 68,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 35
+ },
+ "colab_type": "code",
+ "id": "8fLoI4ipqMeS",
+ "outputId": "b789066e-e305-42ad-b3ca-eba8d9267382",
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " > Using model: tacotron2\n"
+ ]
+ }
+ ],
+ "source": [
+ "# LOAD TTS MODEL\n",
+ "# multi speaker \n",
+ "speaker_id = None\n",
+ "speakers = []\n",
+ "\n",
+ "# load the model (chinese_mandarin special characters/punctuations are in the tts_config.json)\n",
+ "if TTS_CONFIG.get(\"characters\"):\n",
+ " _characters = TTS_CONFIG[\"characters\"][\"characters\"]\n",
+ " _phonemes = TTS_CONFIG[\"characters\"][\"phonemes\"]\n",
+ " _punctuations = TTS_CONFIG[\"characters\"][\"punctuations\"]\n",
+ " _pad = TTS_CONFIG[\"characters\"][\"pad\"]\n",
+ " _eos = TTS_CONFIG[\"characters\"][\"eos\"]\n",
+ " _bos = TTS_CONFIG[\"characters\"][\"bos\"]\n",
+ " \n",
+ " symbols, phonemes = make_symbols(_characters, _phonemes, punctuations= _punctuations, pad=_pad, eos=_eos, bos=_bos )\n",
+ "\n",
+ "num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)\n",
+ "model = setup_model(num_chars, len(speakers), TTS_CONFIG)\n",
+ "\n",
+ "# load model state\n",
+ "cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))\n",
+ "\n",
+ "# load the model\n",
+ "model.load_state_dict(cp['model'])\n",
+ "if use_cuda:\n",
+ " model.cuda()\n",
+ "model.eval()\n",
+ "\n",
+ "# set model stepsize\n",
+ "if 'r' in cp:\n",
+ " model.decoder.set_r(cp['r'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 69,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 1000
+ },
+ "colab_type": "code",
+ "id": "zKoq0GgzqzhQ",
+ "outputId": "234efc61-f37a-40bc-95a3-b51896018ccb",
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " > Generator Model: multiband_melgan_generator\n",
+ " > Setting up Audio Processor...\n",
+ " | > sample_rate:22050\n",
+ " | > resample:False\n",
+ " | > num_mels:80\n",
+ " | > min_level_db:-100\n",
+ " | > frame_shift_ms:None\n",
+ " | > frame_length_ms:None\n",
+ " | > ref_level_db:0\n",
+ " | > fft_size:1024\n",
+ " | > power:None\n",
+ " | > preemphasis:0.0\n",
+ " | > griffin_lim_iters:None\n",
+ " | > signal_norm:True\n",
+ " | > symmetric_norm:True\n",
+ " | > mel_fmin:50.0\n",
+ " | > mel_fmax:7600.0\n",
+ " | > spec_gain:1.0\n",
+ " | > stft_pad_mode:reflect\n",
+ " | > max_norm:4.0\n",
+ " | > clip_norm:True\n",
+ " | > do_trim_silence:True\n",
+ " | > trim_db:60\n",
+ " | > do_sound_norm:False\n",
+ " | > stats_path:data/vocoder_scale_stats.npy\n",
+ " | > hop_length:256\n",
+ " | > win_length:1024\n",
+ "\n",
+ "Vocoder loaded\n"
+ ]
+ }
+ ],
+ "source": [
+ "from TTS.vocoder.utils.generic_utils import setup_generator\n",
+ "\n",
+ "# LOAD VOCODER MODEL\n",
+ "vocoder_model = setup_generator(VOCODER_CONFIG)\n",
+ "vocoder_model.load_state_dict(torch.load(VOCODER_MODEL, map_location=\"cpu\")[\"model\"])\n",
+ "vocoder_model.remove_weight_norm()\n",
+ "vocoder_model.inference_padding = 0\n",
+ "\n",
+ "\n",
+ "VOCODER_CONFIG.audio['stats_path'] = 'data/vocoder_scale_stats.npy'\n",
+ "ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio']) \n",
+ "if use_cuda:\n",
+ " vocoder_model.cuda()\n",
+ "vocoder_model.eval()\n",
+ "print(\"\\nVocoder loaded\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "Ws_YkPKsLgo-"
+ },
+ "source": [
+ "## Run Inference"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 70,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Here some test sentences for you to play with :\n",
+ "sentence = \"我从来不会说很标准的中文。\"\n",
+ "sentence = \"我喜欢听人工智能的博客。\"\n",
+ "sentence = \"我来自一个法国郊区的地方。\"\n",
+ "sentence = \"不比不知道,一比吓一跳!\"\n",
+ "sentence = \"台湾是一个真的很好玩的地方!\"\n",
+ "sentence = \"干一行,行一行,行行都行。\"\n",
+ "sentence = \"我要盖被子,好尴尬!\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 71,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# You can also play with the style_wav global style token. However, the lady speaking in the baker dataset\n",
+ "# has no emotion through all the sentences. It's hard to get some nice GST with this.\n",
+ "# That's also why adding \"!\" or \"?\" at the end of sentence change nothing. The dataset has no such prosody.\n",
+ "style_wav = {\"2\": 0.3, \"1\": -0.1}\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 78,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 134
+ },
+ "colab_type": "code",
+ "id": "FuWxZ9Ey5Puj",
+ "outputId": "9c06adad-5451-4393-89a1-a2e7dc39ab91",
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(50688,)\n",
+ " > Run-time: 1.5945854187011719\n",
+ " > Real-time factor: 0.6935317513786934\n",
+ " > Time per step: 3.145291761617468e-05\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "sentence = \"我喜欢听人工智能的博客。\"\n",
+ "style_wav = {\"2\": 0.2, \"7\": -0.1}\n",
+ "\n",
+ "align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, use_cuda, ap, use_gl=False, figures=True, style_wav= style_wav)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "accelerator": "GPU",
+ "colab": {
+ "collapsed_sections": [],
+ "name": "DDC-TTS_and_MultiBand-MelGAN_Example.ipynb",
+ "provenance": [],
+ "toc_visible": true
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.9"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/requirements.txt b/requirements.txt
index 7a0d9f76..659fe787 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,6 +6,8 @@ numba==0.48
librosa==0.7.2
phonemizer>=2.2.0
unidecode==0.4.20
+pypinyin
+jieba
tensorboardX
matplotlib
Pillow