mirror of https://github.com/coqui-ai/TTS.git
<add> Chinese mandarin implementation (tacotron2)
This commit is contained in:
parent
eb543c027e
commit
c4c7bc1b88
|
@ -75,6 +75,16 @@
|
|||
"contact":"erengolge@gmail.com"
|
||||
}
|
||||
}
|
||||
},
|
||||
"zh":{
|
||||
"baker":{
|
||||
"tacotron2-DDC-GST":{
|
||||
"model_file": "1RR9rZdV_FMm8yvtCHALtUbJf1nxbUiAw",
|
||||
"config_file": "1daY1JHGXEozJ-MGYLiWEUmzEwEvM5xpz",
|
||||
"stats_file": "1vl9c-D3dW_E7pdhNpDFQLX-giJc0jOtV",
|
||||
"commit": ""
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"vocoder_models":{
|
||||
|
|
|
@ -352,3 +352,19 @@ def _voxcel_x(root_path, meta_file, voxcel_idx):
|
|||
|
||||
with open(str(cache_to), 'r') as f:
|
||||
return [x.strip().split('|') for x in f.readlines()]
|
||||
|
||||
|
||||
|
||||
|
||||
# ======================================== Baker (chinese mandarin single speaker) ===========================================
|
||||
def baker(root_path, meta_file):
    """Normalize the Baker meta data file to TTS format.

    Args:
        root_path (str): path to the dataset root folder.
        meta_file (str): meta file name, relative to ``root_path``. Each
            line has the form ``<wav_name>|<transcript>``.

    Returns:
        list: rows of ``[text, wav_path, speaker_name]``.
    """
    txt_file = os.path.join(root_path, meta_file)
    items = []
    speaker_name = "baker"
    # The meta file contains Chinese text: force UTF-8 instead of relying
    # on the platform default encoding.
    with open(txt_file, 'r', encoding="utf-8") as ttf:
        for line in ttf:
            wav_name, text = line.rstrip('\n').split("|")
            wav_path = os.path.join(root_path, "clips_22", wav_name)
            items.append([text, wav_path, speaker_name])
    return items
|
||||
|
|
|
@ -0,0 +1,107 @@
|
|||
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Licensed under WTFPL or the Unlicense or CC0.
|
||||
# This uses Python 3, but it's easy to port to Python 2 by changing
|
||||
# strings to u'xx'.
|
||||
|
||||
import re
|
||||
import itertools
|
||||
|
||||
|
||||
def _num2chinese(num: str, big=False, simp=True, o=False, twoalt=False):
    """
    Converts numbers to Chinese representations.
    `num`   : the number to convert, as a string (may have a sign and a
              decimal part).
    `big`   : use financial characters.
    `simp`  : use simplified characters instead of traditional characters.
    `o`     : use 〇 for zero.
    `twoalt`: use 两/兩 for two when appropriate.
    Note that `o` and `twoalt` is ignored when `big` is used,
    and `twoalt` is ignored when `o` is used for formal representations.
    Raises ValueError for numbers >= 1e48 or in scientific notation.
    """
    # check num first
    nd = str(num)
    # The largest supported unit (载/載) covers up to 10**47, hence the 1e48 cap.
    if abs(float(nd)) >= 1e48:
        raise ValueError('number out of range')
    elif 'e' in nd:
        raise ValueError('scientific notation is not supported')
    # sign / decimal-point characters: 正 (plus), 负/負 (minus), 点/點 (point)
    c_symbol = '正负点' if simp else '正負點'
    if o:  # formal
        twoalt = False
    # Select digit and unit character sets for the requested style.
    if big:
        c_basic = '零壹贰叁肆伍陆柒捌玖' if simp else '零壹貳參肆伍陸柒捌玖'
        c_unit1 = '拾佰仟'
        c_twoalt = '贰' if simp else '貳'
    else:
        c_basic = '〇一二三四五六七八九' if o else '零一二三四五六七八九'
        c_unit1 = '十百千'
        if twoalt:
            c_twoalt = '两' if simp else '兩'
        else:
            c_twoalt = '二'
    # Large units for each group of four digits (myriad system): 万/萬, 亿/億, ...
    c_unit2 = '万亿兆京垓秭穰沟涧正载' if simp else '萬億兆京垓秭穰溝澗正載'
    # Reverse a list and collapse adjacent duplicate entries (so runs of
    # zeros produce a single zero character).
    revuniq = lambda l: ''.join(k for k, g in itertools.groupby(reversed(l)))
    nd = str(num)
    result = []
    # Leading sign, if any.
    if nd[0] == '+':
        result.append(c_symbol[0])
    elif nd[0] == '-':
        result.append(c_symbol[1])
    # Split into integer part and (optional) decimal remainder.
    if '.' in nd:
        integer, remainder = nd.lstrip('+-').split('.')
    else:
        integer, remainder = nd.lstrip('+-'), None
    if int(integer):
        # Split the integer part into 4-digit groups, least significant first.
        splitted = [integer[max(i - 4, 0):i]
                    for i in range(len(integer), 0, -4)]
        intresult = []
        for nu, unit in enumerate(splitted):
            # special cases
            if int(unit) == 0:  # 0000
                intresult.append(c_basic[0])
                continue
            elif nu > 0 and int(unit) == 2:  # 0002
                intresult.append(c_twoalt + c_unit2[nu - 1])
                continue
            # Convert one 4-digit group, building digit+unit pairs from the
            # least significant digit upwards.
            ulist = []
            unit = unit.zfill(4)
            for nc, ch in enumerate(reversed(unit)):
                if ch == '0':
                    if ulist:  # ???0
                        ulist.append(c_basic[0])
                elif nc == 0:
                    ulist.append(c_basic[int(ch)])
                elif nc == 1 and ch == '1' and unit[1] == '0':
                    # special case for tens: a leading 一 is dropped
                    # edit the 'elif' if you don't like
                    # 十四, 三千零十四, 三千三百一十四
                    ulist.append(c_unit1[0])
                elif nc > 1 and ch == '2':
                    ulist.append(c_twoalt + c_unit1[nc - 1])
                else:
                    ulist.append(c_basic[int(ch)] + c_unit1[nc - 1])
            ustr = revuniq(ulist)
            if nu == 0:
                intresult.append(ustr)
            else:
                # Append the large unit (万/亿/...) for this 4-digit group.
                intresult.append(ustr + c_unit2[nu - 1])
        # Join groups (most significant first) and trim stray zeros.
        result.append(revuniq(intresult).strip(c_basic[0]))
    else:
        result.append(c_basic[0])
    # Decimal part: read digit by digit after the point character.
    if remainder:
        result.append(c_symbol[2])
        result.append(''.join(c_basic[int(ch)] for ch in remainder))
    return ''.join(result)
|
||||
|
||||
|
||||
|
||||
|
||||
def _number_replace(match: re.Match) -> str:
    """``re.sub`` callback: render the matched digit run as Chinese numerals."""
    return _num2chinese(match.group())
|
||||
|
||||
|
||||
def replace_numbers_to_characters_in_text(text: str) -> str:
    """Replace every run of arabic digits in *text* with its Chinese reading."""
    return re.sub(r'[0-9]+', _number_replace, text)
|
|
@ -0,0 +1,41 @@
|
|||
from typing import List
|
||||
|
||||
import pypinyin
|
||||
|
||||
from .pinyinToPhonemes import PINYIN_DICT
|
||||
|
||||
|
||||
import jieba
|
||||
|
||||
|
||||
def _chinese_character_to_pinyin(text: str) -> List[str]:
    """Transliterate Chinese text into a flat list of numbered-tone (TONE3)
    pinyin syllables; neutral tones are written with a trailing ``5``."""
    per_char = pypinyin.pinyin(
        text, style=pypinyin.Style.TONE3, heteronym=False, neutral_tone_with_five=True
    )
    # pypinyin yields one sub-list per character/word; flatten them in order.
    flattened: List[str] = []
    for candidates in per_char:
        flattened.extend(candidates)
    return flattened
|
||||
|
||||
|
||||
def _chinese_pinyin_to_phoneme(pinyin: str) -> str:
    """Map one numbered-tone pinyin syllable (e.g. ``"ma1"``) to its phoneme
    transcription followed by the tone digit."""
    tone = pinyin[-1]
    base = pinyin[:-1]
    # Unknown syllables fall back to an empty phoneme string so that the
    # tone digit is still emitted.
    return PINYIN_DICT.get(base, [""])[0] + tone
|
||||
|
||||
|
||||
def chinese_text_to_phonemes(text: str) -> str:
    """Phonemize Chinese text: jieba word segmentation -> pinyin -> phonemes,
    returned as a single ``"|"``-separated string."""
    segmented = " ".join(jieba.cut(text, HMM=False))
    tokens: List[str] = _chinese_character_to_pinyin(segmented)

    phonemes: List[str] = []
    for token in tokens:
        # A trailing tone digit marks a pinyin syllable.
        # TODO transform to is_pinyin()
        if token[-1] in "12345":
            phonemes.extend(_chinese_pinyin_to_phoneme(token))
        else:
            # punctuation or other non-pinyin token: pass characters through
            phonemes.extend(token)

    return "|".join(phonemes)
|
|
@ -0,0 +1,420 @@
|
|||
|
||||
# Lookup table from toneless pinyin syllables to their phoneme transcription.
# Each value is a single-element list; consumed via
# ``from .pinyinToPhonemes import PINYIN_DICT`` in the mandarin phonemizer.
# Commented-out entries are syllables that do not occur in mandarin.
PINYIN_DICT = {
    "a": ["a"],
    "ai": ["ai"],
    "an": ["an"],
    "ang": ["ɑŋ"],
    "ao": ["aʌ"],
    "ba": ["ba"],
    "bai": ["bai"],
    "ban": ["ban"],
    "bang": ["bɑŋ"],
    "bao": ["baʌ"],
    # "be": ["be"], does not exist
    "bei": ["bɛi"],
    "ben": ["bœn"],
    "beng": ["bɵŋ"],
    "bi": ["bi"],
    "bian": ["biɛn"],
    "biao": ["biaʌ"],
    "bie": ["bie"],
    "bin": ["bin"],
    "bing": ["bɨŋ"],
    "bo": ["bo"],
    "bu": ["bu"],
    "ca": ["tsa"],
    "cai": ["tsai"],
    "can": ["tsan"],
    "cang": ["tsɑŋ"],
    "cao": ["tsaʌ"],
    "ce": ["tsø"],
    "cen": ["tsœn"],
    "ceng": ["tsɵŋ"],
    "cha": ["ʈʂa"],
    "chai": ["ʈʂai"],
    "chan": ["ʈʂan"],
    "chang": ["ʈʂɑŋ"],
    "chao": ["ʈʂaʌ"],
    "che": ["ʈʂø"],
    "chen": ["ʈʂœn"],
    "cheng": ["ʈʂɵŋ"],
    "chi": ["ʈʂʏ"],
    "chong": ["ʈʂoŋ"],
    "chou": ["ʈʂou"],
    "chu": ["ʈʂu"],
    "chua": ["ʈʂua"],
    "chuai": ["ʈʂuai"],
    "chuan": ["ʈʂuan"],
    "chuang": ["ʈʂuɑŋ"],
    "chui": ["ʈʂuei"],
    "chun": ["ʈʂun"],
    "chuo": ["ʈʂuo"],
    "ci": ["tsɪ"],
    "cong": ["tsoŋ"],
    "cou": ["tsou"],
    "cu": ["tsu"],
    "cuan": ["tsuan"],
    "cui": ["tsuei"],
    "cun": ["tsun"],
    "cuo": ["tsuo"],
    "da": ["da"],
    "dai": ["dai"],
    "dan": ["dan"],
    "dang": ["dɑŋ"],
    "dao": ["daʌ"],
    "de": ["dø"],
    "dei": ["dei"],
    # "den": ["dœn"],
    "deng": ["dɵŋ"],
    "di": ["di"],
    "dia": ["dia"],
    "dian": ["diɛn"],
    "diao": ["diaʌ"],
    "die": ["die"],
    "ding": ["dɨŋ"],
    "diu": ["dio"],
    "dong": ["doŋ"],
    "dou": ["dou"],
    "du": ["du"],
    "duan": ["duan"],
    "dui": ["duei"],
    "dun": ["dun"],
    "duo": ["duo"],
    "e": ["ø"],
    "ei": ["ei"],
    "en": ["œn"],
    # "ng": ["œn"],
    # "eng": ["ɵŋ"],
    "er": ["er"],
    "fa": ["fa"],
    "fan": ["fan"],
    "fang": ["fɑŋ"],
    "fei": ["fei"],
    "fen": ["fœn"],
    "feng": ["fɵŋ"],
    "fo": ["fo"],
    "fou": ["fou"],
    "fu": ["fu"],
    "ga": ["ga"],
    "gai": ["gai"],
    "gan": ["gan"],
    "gang": ["gɑŋ"],
    "gao": ["gaʌ"],
    "ge": ["gø"],
    "gei": ["gei"],
    "gen": ["gœn"],
    "geng": ["gɵŋ"],
    "gong": ["goŋ"],
    "gou": ["gou"],
    "gu": ["gu"],
    "gua": ["gua"],
    "guai": ["guai"],
    "guan": ["guan"],
    "guang": ["guɑŋ"],
    "gui": ["guei"],
    "gun": ["gun"],
    "guo": ["guo"],
    "ha": ["xa"],
    "hai": ["xai"],
    "han": ["xan"],
    "hang": ["xɑŋ"],
    "hao": ["xaʌ"],
    "he": ["xø"],
    "hei": ["xei"],
    "hen": ["xœn"],
    "heng": ["xɵŋ"],
    "hong": ["xoŋ"],
    "hou": ["xou"],
    "hu": ["xu"],
    "hua": ["xua"],
    "huai": ["xuai"],
    "huan": ["xuan"],
    "huang": ["xuɑŋ"],
    "hui": ["xuei"],
    "hun": ["xun"],
    "huo": ["xuo"],
    "ji": ["dʑi"],
    "jia": ["dʑia"],
    "jian": ["dʑiɛn"],
    "jiang": ["dʑiɑŋ"],
    "jiao": ["dʑiaʌ"],
    "jie": ["dʑie"],
    "jin": ["dʑin"],
    "jing": ["dʑɨŋ"],
    "jiong": ["dʑioŋ"],
    "jiu": ["dʑio"],
    "ju": ["dʑy"],
    "juan": ["dʑyɛn"],
    "jue": ["dʑye"],
    "jun": ["dʑyn"],
    "ka": ["ka"],
    "kai": ["kai"],
    "kan": ["kan"],
    "kang": ["kɑŋ"],
    "kao": ["kaʌ"],
    "ke": ["kø"],
    "kei": ["kei"],
    "ken": ["kœn"],
    "keng": ["kɵŋ"],
    "kong": ["koŋ"],
    "kou": ["kou"],
    "ku": ["ku"],
    "kua": ["kua"],
    "kuai": ["kuai"],
    "kuan": ["kuan"],
    "kuang": ["kuɑŋ"],
    "kui": ["kuei"],
    "kun": ["kun"],
    "kuo": ["kuo"],
    "la": ["la"],
    "lai": ["lai"],
    "lan": ["lan"],
    "lang": ["lɑŋ"],
    "lao": ["laʌ"],
    "le": ["lø"],
    "lei": ["lei"],
    "leng": ["lɵŋ"],
    "li": ["li"],
    "lia": ["lia"],
    "lian": ["liɛn"],
    "liang": ["liɑŋ"],
    "liao": ["liaʌ"],
    "lie": ["lie"],
    "lin": ["lin"],
    "ling": ["lɨŋ"],
    "liu": ["lio"],
    "lo": ["lo"],
    "long": ["loŋ"],
    "lou": ["lou"],
    "lu": ["lu"],
    "lv": ["ly"],
    "luan": ["luan"],
    "lve": ["lye"],
    "lue": ["lue"],
    "lun": ["lun"],
    "luo": ["luo"],
    "ma": ["ma"],
    "mai": ["mai"],
    "man": ["man"],
    "mang": ["mɑŋ"],
    "mao": ["maʌ"],
    "me": ["mø"],
    "mei": ["mei"],
    "men": ["mœn"],
    "meng": ["mɵŋ"],
    "mi": ["mi"],
    "mian": ["miɛn"],
    "miao": ["miaʌ"],
    "mie": ["mie"],
    "min": ["min"],
    "ming": ["mɨŋ"],
    "miu": ["mio"],
    "mo": ["mo"],
    "mou": ["mou"],
    "mu": ["mu"],
    "na": ["na"],
    "nai": ["nai"],
    "nan": ["nan"],
    "nang": ["nɑŋ"],
    "nao": ["naʌ"],
    "ne": ["nø"],
    "nei": ["nei"],
    "nen": ["nœn"],
    "neng": ["nɵŋ"],
    "ni": ["ni"],
    "nia": ["nia"],
    "nian": ["niɛn"],
    "niang": ["niɑŋ"],
    "niao": ["niaʌ"],
    "nie": ["nie"],
    "nin": ["nin"],
    "ning": ["nɨŋ"],
    "niu": ["nio"],
    "nong": ["noŋ"],
    "nou": ["nou"],
    "nu": ["nu"],
    "nv": ["ny"],
    "nuan": ["nuan"],
    "nve": ["nye"],
    "nue": ["nye"],
    "nuo": ["nuo"],
    "o": ["o"],
    "ou": ["ou"],
    "pa": ["pa"],
    "pai": ["pai"],
    "pan": ["pan"],
    "pang": ["pɑŋ"],
    "pao": ["paʌ"],
    "pe": ["pø"],
    "pei": ["pei"],
    "pen": ["pœn"],
    "peng": ["pɵŋ"],
    "pi": ["pi"],
    "pian": ["piɛn"],
    "piao": ["piaʌ"],
    "pie": ["pie"],
    "pin": ["pin"],
    "ping": ["pɨŋ"],
    "po": ["po"],
    "pou": ["pou"],
    "pu": ["pu"],
    "qi": ["tɕi"],
    "qia": ["tɕia"],
    "qian": ["tɕiɛn"],
    "qiang": ["tɕiɑŋ"],
    "qiao": ["tɕiaʌ"],
    "qie": ["tɕie"],
    "qin": ["tɕin"],
    "qing": ["tɕɨŋ"],
    "qiong": ["tɕioŋ"],
    "qiu": ["tɕio"],
    "qu": ["tɕy"],
    "quan": ["tɕyɛn"],
    "que": ["tɕye"],
    "qun": ["tɕyn"],
    "ran": ["ʐan"],
    "rang": ["ʐɑŋ"],
    "rao": ["ʐaʌ"],
    "re": ["ʐø"],
    "ren": ["ʐœn"],
    "reng": ["ʐɵŋ"],
    "ri": ["ʐʏ"],
    "rong": ["ʐoŋ"],
    "rou": ["ʐou"],
    "ru": ["ʐu"],
    "rua": ["ʐua"],
    "ruan": ["ʐuan"],
    "rui": ["ʐuei"],
    "run": ["ʐun"],
    "ruo": ["ʐuo"],
    "sa": ["sa"],
    "sai": ["sai"],
    "san": ["san"],
    "sang": ["sɑŋ"],
    "sao": ["saʌ"],
    "se": ["sø"],
    "sen": ["sœn"],
    "seng": ["sɵŋ"],
    "sha": ["ʂa"],
    "shai": ["ʂai"],
    "shan": ["ʂan"],
    "shang": ["ʂɑŋ"],
    "shao": ["ʂaʌ"],
    "she": ["ʂø"],
    "shei": ["ʂei"],
    "shen": ["ʂœn"],
    "sheng": ["ʂɵŋ"],
    "shi": ["ʂʏ"],
    "shou": ["ʂou"],
    "shu": ["ʂu"],
    "shua": ["ʂua"],
    "shuai": ["ʂuai"],
    "shuan": ["ʂuan"],
    "shuang": ["ʂuɑŋ"],
    "shui": ["ʂuei"],
    "shun": ["ʂun"],
    "shuo": ["ʂuo"],
    "si": ["sɪ"],
    "song": ["soŋ"],
    "sou": ["sou"],
    "su": ["su"],
    "suan": ["suan"],
    "sui": ["suei"],
    "sun": ["sun"],
    "suo": ["suo"],
    "ta": ["ta"],
    "tai": ["tai"],
    "tan": ["tan"],
    "tang": ["tɑŋ"],
    "tao": ["taʌ"],
    "te": ["tø"],
    "tei": ["tei"],
    "teng": ["tɵŋ"],
    "ti": ["ti"],
    "tian": ["tiɛn"],
    "tiao": ["tiaʌ"],
    "tie": ["tie"],
    "ting": ["tɨŋ"],
    "tong": ["toŋ"],
    "tou": ["tou"],
    "tu": ["tu"],
    "tuan": ["tuan"],
    "tui": ["tuei"],
    "tun": ["tun"],
    "tuo": ["tuo"],
    "wa": ["wa"],
    "wai": ["wai"],
    "wan": ["wan"],
    "wang": ["wɑŋ"],
    "wei": ["wei"],
    "wen": ["wœn"],
    "weng": ["wɵŋ"],
    "wo": ["wo"],
    "wu": ["wu"],
    "xi": ["ɕi"],
    "xia": ["ɕia"],
    "xian": ["ɕiɛn"],
    "xiang": ["ɕiɑŋ"],
    "xiao": ["ɕiaʌ"],
    "xie": ["ɕie"],
    "xin": ["ɕin"],
    "xing": ["ɕɨŋ"],
    "xiong": ["ɕioŋ"],
    "xiu": ["ɕio"],
    "xu": ["ɕy"],
    "xuan": ["ɕyɛn"],
    "xue": ["ɕye"],
    "xun": ["ɕyn"],
    "ya": ["ia"],
    "yan": ["iɛn"],
    "yang": ["iɑŋ"],
    "yao": ["iaʌ"],
    "ye": ["ie"],
    "yi": ["i"],
    "yin": ["in"],
    "ying": ["ɨŋ"],
    "yo": ["io"],
    "yong": ["ioŋ"],
    "you": ["io"],
    "yu": ["y"],
    "yuan": ["yɛn"],
    "yue": ["ye"],
    "yun": ["yn"],
    "za": ["dza"],
    "zai": ["dzai"],
    "zan": ["dzan"],
    "zang": ["dzɑŋ"],
    "zao": ["dzaʌ"],
    "ze": ["dzø"],
    "zei": ["dzei"],
    "zen": ["dzœn"],
    "zeng": ["dzɵŋ"],
    "zha": ["dʒa"],
    "zhai": ["dʒai"],
    "zhan": ["dʒan"],
    "zhang": ["dʒɑŋ"],
    "zhao": ["dʒaʌ"],
    "zhe": ["dʒø"],
    # "zhei": ["dʒei"], it doesn't exist
    "zhen": ["dʒœn"],
    "zheng": ["dʒɵŋ"],
    "zhi": ["dʒʏ"],
    "zhong": ["dʒoŋ"],
    "zhou": ["dʒou"],
    "zhu": ["dʒu"],
    "zhua": ["dʒua"],
    "zhuai": ["dʒuai"],
    "zhuan": ["dʒuan"],
    "zhuang": ["dʒuɑŋ"],
    "zhui": ["dʒuei"],
    "zhun": ["dʒun"],
    "zhuo": ["dʒuo"],
    "zi": ["dzɪ"],
    "zong": ["dzoŋ"],
    "zou": ["dzou"],
    "zu": ["dzu"],
    "zuan": ["dzuan"],
    "zui": ["dzuei"],
    "zun": ["dzun"],
    "zuo": ["dzuo"],
}
|
|
@ -219,6 +219,7 @@ def synthesis(model,
|
|||
ap (TTS.tts.utils.audio.AudioProcessor): audio processor to process
|
||||
model outputs.
|
||||
speaker_id (int): id of speaker
|
||||
style_wav (str | Dict[str, float]): Uses for style embedding of GST.
|
||||
style_wav (str): Uses for style embedding of GST.
|
||||
truncated (bool): keep model states after inference. It can be used
|
||||
for continuous inference at long texts.
|
||||
|
|
|
@ -8,6 +8,7 @@ from phonemizer.phonemize import phonemize
|
|||
from TTS.tts.utils.text import cleaners
|
||||
from TTS.tts.utils.text.symbols import (_bos, _eos, _punctuations,
|
||||
make_symbols, phonemes, symbols)
|
||||
from TTS.tts.utils.chinese_mandarin.phonemizer import chinese_text_to_phonemes
|
||||
|
||||
|
||||
# pylint: disable=unnecessary-comprehension
|
||||
|
@ -29,8 +30,23 @@ PHONEME_PUNCTUATION_PATTERN = r'['+_punctuations.replace(' ', '')+']+'
|
|||
|
||||
def text2phone(text, language):
|
||||
'''
|
||||
Convert graphemes to phonemes.
|
||||
Convert graphemes to phonemes. For most of the languages, it calls
|
||||
the phonemizer python library that calls espeak/espeak-ng. For chinese
|
||||
mandarin, it calls pypinyin + custom function for phonemizing
|
||||
Parameters:
|
||||
text (str): text to phonemize
|
||||
language (str): language of the text
|
||||
Returns:
|
||||
ph (str): phonemes as a string seperated by "|"
|
||||
ph = "ɪ|g|ˈ|z|æ|m|p|ə|l"
|
||||
'''
|
||||
|
||||
# TO REVIEW : How to have a good implementation for this?
|
||||
if language == "chinese-mandarin":
|
||||
ph = chinese_text_to_phonemes(text)
|
||||
return ph
|
||||
|
||||
|
||||
seperator = phonemizer.separator.Separator(' |', '', '|')
|
||||
#try:
|
||||
punctuations = re.findall(PHONEME_PUNCTUATION_PATTERN, text)
|
||||
|
|
|
@ -15,6 +15,8 @@ from unidecode import unidecode
|
|||
from .number_norm import normalize_numbers
|
||||
from .abbreviations import abbreviations_en, abbreviations_fr
|
||||
from .time import expand_time_english
|
||||
from TTS.tts.utils.chinese_mandarin.numbers import replace_numbers_to_characters_in_text
|
||||
|
||||
|
||||
# Regular expression matching whitespace:
|
||||
_whitespace_re = re.compile(r'\s+')
|
||||
|
@ -122,6 +124,13 @@ def portuguese_cleaners(text):
|
|||
text = collapse_whitespace(text)
|
||||
return text
|
||||
|
||||
def chinese_mandarin_cleaners(text: str) -> str:
    '''Basic pipeline for chinese (mandarin): spell out arabic numerals.'''
    return replace_numbers_to_characters_in_text(text)
|
||||
|
||||
|
||||
|
||||
def phoneme_cleaners(text):
|
||||
'''Pipeline for phonemes mode, including number and abbreviation expansion.'''
|
||||
text = expand_numbers(text)
|
||||
|
|
|
@ -122,6 +122,13 @@ class Synthesizer(object):
|
|||
speaker_embedding = self.init_speaker(speaker_idx)
|
||||
use_gl = self.vocoder_model is None
|
||||
|
||||
|
||||
# check if compute gst style
|
||||
gst_style_input = None
|
||||
if self.tts_config.use_gst:
|
||||
if self.tts_config.gst["gst_style_input"] not in ["", {}]:
|
||||
style_wav = self.tts_config.gst["gst_style_input"]
|
||||
|
||||
for sen in sens:
|
||||
# synthesize voice
|
||||
waveform, _, _, mel_postnet_spec, _, _ = synthesis(
|
||||
|
@ -131,7 +138,7 @@ class Synthesizer(object):
|
|||
self.use_cuda,
|
||||
self.ap,
|
||||
speaker_idx,
|
||||
None,
|
||||
gst_style_input,
|
||||
False,
|
||||
self.tts_config.enable_eos_bos_chars,
|
||||
use_gl,
|
||||
|
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue