mirror of https://github.com/coqui-ai/TTS.git
<add> Chinese mandarin implementation (tacotron2)
This commit is contained in:
parent
eb543c027e
commit
c4c7bc1b88
|
@ -75,6 +75,16 @@
|
||||||
"contact":"erengolge@gmail.com"
|
"contact":"erengolge@gmail.com"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
},
|
||||||
|
"zh":{
|
||||||
|
"baker":{
|
||||||
|
"tacotron2-DDC-GST":{
|
||||||
|
"model_file": "1RR9rZdV_FMm8yvtCHALtUbJf1nxbUiAw",
|
||||||
|
"config_file": "1daY1JHGXEozJ-MGYLiWEUmzEwEvM5xpz",
|
||||||
|
"stats_file": "1vl9c-D3dW_E7pdhNpDFQLX-giJc0jOtV",
|
||||||
|
"commit": ""
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"vocoder_models":{
|
"vocoder_models":{
|
||||||
|
|
|
@ -352,3 +352,19 @@ def _voxcel_x(root_path, meta_file, voxcel_idx):
|
||||||
|
|
||||||
with open(str(cache_to), 'r') as f:
|
with open(str(cache_to), 'r') as f:
|
||||||
return [x.strip().split('|') for x in f.readlines()]
|
return [x.strip().split('|') for x in f.readlines()]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# ======================================== Baker (chinese mandarin single speaker) ===========================================
|
||||||
|
def baker(root_path, meta_file):
    """Normalize the Baker meta data file to the TTS format.

    Every non-empty line of the meta file must have the form
    ``<wav_name>|<text>``.

    Args:
        root_path (str): dataset root. It is joined with ``meta_file`` to
            locate the meta data and with ``clips_22`` to build wav paths.
        meta_file (str): name of the meta data file, relative to ``root_path``.

    Returns:
        list: one ``[text, wav_path, speaker_name]`` entry per transcript
            line; ``speaker_name`` is always ``"baker"``.

    Raises:
        ValueError: if a non-empty line does not contain exactly one ``|``.
    """
    txt_file = os.path.join(root_path, meta_file)
    items = []
    speaker_name = "baker"
    # The transcripts are Chinese text: decode explicitly as UTF-8 instead of
    # relying on the platform's locale-dependent default encoding.
    with open(txt_file, 'r', encoding='utf-8') as ttf:
        for line in ttf:
            line = line.rstrip('\n')
            # Tolerate blank lines (e.g. a trailing empty line) instead of
            # failing on the tuple unpacking below.
            if not line.strip():
                continue
            wav_name, text = line.split("|")
            wav_path = os.path.join(root_path, "clips_22", wav_name)
            items.append([text, wav_path, speaker_name])
    return items
|
||||||
|
|
|
@ -0,0 +1,107 @@
|
||||||
|
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
# Licensed under WTFPL or the Unlicense or CC0.
|
||||||
|
# This uses Python 3, but it's easy to port to Python 2 by changing
|
||||||
|
# strings to u'xx'.
|
||||||
|
|
||||||
|
import re
|
||||||
|
import itertools
|
||||||
|
|
||||||
|
|
||||||
|
def _num2chinese(num: str, big=False, simp=True, o=False, twoalt=False):
    """Convert a decimal number to its Chinese character representation.

    Args:
        num: the number to convert. It is passed through ``str()``, so a
            numeric string or a number is accepted. An optional leading
            ``+``/``-`` sign and a decimal point are handled.
        big: use financial characters.
        simp: use simplified characters instead of traditional characters.
        o: use 〇 for zero.
        twoalt: use 两/兩 for two when appropriate.

    Note that `o` and `twoalt` are ignored when `big` is used,
    and `twoalt` is ignored when `o` is used for formal representations.

    Returns:
        str: the Chinese representation, e.g. ``"3014"`` -> ``"三千零十四"``.

    Raises:
        ValueError: if ``num`` is not a valid number, if its magnitude is
            1e48 or more, or if it is written in scientific notation.
    """
    # check num first
    nd = str(num)
    if abs(float(nd)) >= 1e48:
        raise ValueError('number out of range')
    if 'e' in nd.lower():
        # float() accepts both "1e5" and "1E5"; reject either spelling here
        # rather than failing later with an unrelated int() error.
        raise ValueError('scientific notation is not supported')

    c_symbol = '正负点' if simp else '正負點'
    if o:  # formal
        twoalt = False
    if big:
        c_basic = '零壹贰叁肆伍陆柒捌玖' if simp else '零壹貳參肆伍陸柒捌玖'
        c_unit1 = '拾佰仟'
        c_twoalt = '贰' if simp else '貳'
    else:
        c_basic = '〇一二三四五六七八九' if o else '零一二三四五六七八九'
        c_unit1 = '十百千'
        if twoalt:
            c_twoalt = '两' if simp else '兩'
        else:
            c_twoalt = '二'
    c_unit2 = '万亿兆京垓秭穰沟涧正载' if simp else '萬億兆京垓秭穰溝澗正載'

    def revuniq(parts):
        # Parts are collected least-significant first: reverse them, collapse
        # runs of identical consecutive parts (repeated zeros) and join.
        return ''.join(k for k, _ in itertools.groupby(reversed(parts)))

    result = []
    if nd[0] == '+':
        result.append(c_symbol[0])
    elif nd[0] == '-':
        result.append(c_symbol[1])
    if '.' in nd:
        integer, remainder = nd.lstrip('+-').split('.')
    else:
        integer, remainder = nd.lstrip('+-'), None
    if not integer:
        # ".5" is a valid float literal; treat the missing integer part as 0
        # instead of crashing in int('').
        integer = '0'

    if int(integer):
        # Groups of (up to) four digits, least-significant group first.
        splitted = [integer[max(i - 4, 0):i]
                    for i in range(len(integer), 0, -4)]
        intresult = []
        for nu, unit in enumerate(splitted):
            # special cases
            if int(unit) == 0:  # 0000
                intresult.append(c_basic[0])
                continue
            elif nu > 0 and int(unit) == 2:  # 0002
                intresult.append(c_twoalt + c_unit2[nu - 1])
                continue
            ulist = []
            unit = unit.zfill(4)
            for nc, ch in enumerate(reversed(unit)):
                if ch == '0':
                    if ulist:  # ???0
                        ulist.append(c_basic[0])
                elif nc == 0:
                    ulist.append(c_basic[int(ch)])
                elif nc == 1 and ch == '1' and unit[1] == '0':
                    # special case for tens
                    # edit the 'elif' if you don't like
                    # 十四, 三千零十四, 三千三百一十四
                    ulist.append(c_unit1[0])
                elif nc > 1 and ch == '2':
                    ulist.append(c_twoalt + c_unit1[nc - 1])
                else:
                    ulist.append(c_basic[int(ch)] + c_unit1[nc - 1])
            ustr = revuniq(ulist)
            if nu == 0:
                intresult.append(ustr)
            else:
                intresult.append(ustr + c_unit2[nu - 1])
        # Zeros produced by padding / empty groups are only meaningful between
        # non-zero parts, so strip them from both ends.
        result.append(revuniq(intresult).strip(c_basic[0]))
    else:
        result.append(c_basic[0])

    if remainder:
        # Fractional digits are read out one by one after the "point" symbol.
        result.append(c_symbol[2])
        result.append(''.join(c_basic[int(ch)] for ch in remainder))
    return ''.join(result)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def _number_replace(match: "re.Match") -> str:
    """Substitution callback: convert the matched digits to Chinese characters.

    The annotation is written as a string so that it is not evaluated when the
    function is defined; the module therefore still imports on interpreters
    whose ``re`` module does not expose ``Match``.

    Args:
        match: regex match object whose whole matched text is the number.

    Returns:
        str: the value returned by ``_num2chinese`` for the matched text.
    """
    match_str: str = match.group()
    return _num2chinese(match_str)
|
||||||
|
|
||||||
|
|
||||||
|
def replace_numbers_to_characters_in_text(text : str):
    """Replace every run of ASCII digits in `text` using `_number_replace`.

    Only characters matching ``[0-9]+`` are substituted; everything else in
    `text` is returned unchanged.
    """
    digit_run = re.compile(r'[0-9]+')
    return digit_run.sub(_number_replace, text)
|
|
@ -0,0 +1,41 @@
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
import pypinyin
|
||||||
|
|
||||||
|
from .pinyinToPhonemes import PINYIN_DICT
|
||||||
|
|
||||||
|
|
||||||
|
import jieba
|
||||||
|
|
||||||
|
|
||||||
|
def _chinese_character_to_pinyin(text: str) -> List[str]:
    """Run `text` through pypinyin and return the readings as one flat list.

    pypinyin is called with the TONE3 style, heteronyms disabled and the
    neutral tone written as 5. Its result is iterated as a list of lists,
    which is flattened here in order.
    """
    nested_readings = pypinyin.pinyin(
        text,
        style=pypinyin.Style.TONE3,
        heteronym=False,
        neutral_tone_with_five=True,
    )
    flat_readings: List[str] = []
    for readings in nested_readings:
        flat_readings.extend(readings)
    return flat_readings
|
||||||
|
|
||||||
|
|
||||||
|
def _chinese_pinyin_to_phoneme(pinyin: str) -> str:
    """Map one pinyin token to its phoneme string, keeping the last character.

    The last character of `pinyin` is treated as the tone marker and appended
    unchanged; the rest is looked up in PINYIN_DICT. A syllable missing from
    the table contributes an empty string, so only the tone is returned.
    """
    syllable, tone = pinyin[:-1], pinyin[-1]
    candidates = PINYIN_DICT.get(syllable, [""])
    return candidates[0] + tone
|
||||||
|
|
||||||
|
|
||||||
|
def chinese_text_to_phonemes(text: str) -> str:
    """Convert Chinese text to a "|"-separated string of phoneme symbols.

    The text is segmented with jieba (HMM disabled), the segments are joined
    with spaces and converted to pinyin tokens. Tokens ending in a digit 1-5
    are mapped to phonemes; any other token (punctuation or other text) is
    kept as is. In both cases the result is split into single characters.
    """
    segments = jieba.cut(text, HMM=False)
    spaced_text = " ".join(segments)
    tokens: List[str] = _chinese_character_to_pinyin(spaced_text)

    pieces: List[str] = []
    for token in tokens:
        if token[-1] not in "12345":  # punctuation or other
            pieces.extend(token)
            continue
        # TODO transform the digit test into is_pinyin()
        pieces.extend(_chinese_pinyin_to_phoneme(token))

    return "|".join(pieces)
|
|
@ -0,0 +1,420 @@
|
||||||
|
|
||||||
|
# Lookup table from a toneless pinyin syllable to a one-element list holding
# its phoneme transcription. Commented-out entries are syllables deliberately
# left out of the table.
PINYIN_DICT = {
    "a": ["a"],
    "ai": ["ai"],
    "an": ["an"],
    "ang": ["ɑŋ"],
    "ao": ["aʌ"],
    "ba": ["ba"],
    "bai": ["bai"],
    "ban": ["ban"],
    "bang": ["bɑŋ"],
    "bao": ["baʌ"],
    # "be": ["be"], doesn't exist
    # NOTE(review): every other "-ei" syllable in this table uses "ei";
    # confirm that "ɛi" is intended here.
    "bei": ["bɛi"],
    "ben": ["bœn"],
    "beng": ["bɵŋ"],
    "bi": ["bi"],
    "bian": ["biɛn"],
    "biao": ["biaʌ"],
    "bie": ["bie"],
    "bin": ["bin"],
    "bing": ["bɨŋ"],
    "bo": ["bo"],
    "bu": ["bu"],
    "ca": ["tsa"],
    "cai": ["tsai"],
    "can": ["tsan"],
    "cang": ["tsɑŋ"],
    "cao": ["tsaʌ"],
    "ce": ["tsø"],
    "cen": ["tsœn"],
    "ceng": ["tsɵŋ"],
    "cha": ["ʈʂa"],
    "chai": ["ʈʂai"],
    "chan": ["ʈʂan"],
    "chang": ["ʈʂɑŋ"],
    "chao": ["ʈʂaʌ"],
    "che": ["ʈʂø"],
    "chen": ["ʈʂœn"],
    "cheng": ["ʈʂɵŋ"],
    "chi": ["ʈʂʏ"],
    "chong": ["ʈʂoŋ"],
    "chou": ["ʈʂou"],
    "chu": ["ʈʂu"],
    "chua": ["ʈʂua"],
    "chuai": ["ʈʂuai"],
    "chuan": ["ʈʂuan"],
    "chuang": ["ʈʂuɑŋ"],
    "chui": ["ʈʂuei"],
    "chun": ["ʈʂun"],
    "chuo": ["ʈʂuo"],
    "ci": ["tsɪ"],
    "cong": ["tsoŋ"],
    "cou": ["tsou"],
    "cu": ["tsu"],
    "cuan": ["tsuan"],
    "cui": ["tsuei"],
    "cun": ["tsun"],
    "cuo": ["tsuo"],
    "da": ["da"],
    "dai": ["dai"],
    "dan": ["dan"],
    "dang": ["dɑŋ"],
    "dao": ["daʌ"],
    "de": ["dø"],
    "dei": ["dei"],
    # "den": ["dœn"],
    "deng": ["dɵŋ"],
    "di": ["di"],
    "dia": ["dia"],
    "dian": ["diɛn"],
    "diao": ["diaʌ"],
    "die": ["die"],
    "ding": ["dɨŋ"],
    "diu": ["dio"],
    "dong": ["doŋ"],
    "dou": ["dou"],
    "du": ["du"],
    "duan": ["duan"],
    "dui": ["duei"],
    "dun": ["dun"],
    "duo": ["duo"],
    "e": ["ø"],
    "ei": ["ei"],
    "en": ["œn"],
    # "ng": ["œn"],
    # "eng": ["ɵŋ"],
    "er": ["er"],
    "fa": ["fa"],
    "fan": ["fan"],
    "fang": ["fɑŋ"],
    "fei": ["fei"],
    "fen": ["fœn"],
    "feng": ["fɵŋ"],
    "fo": ["fo"],
    "fou": ["fou"],
    "fu": ["fu"],
    "ga": ["ga"],
    "gai": ["gai"],
    "gan": ["gan"],
    "gang": ["gɑŋ"],
    "gao": ["gaʌ"],
    "ge": ["gø"],
    "gei": ["gei"],
    "gen": ["gœn"],
    "geng": ["gɵŋ"],
    "gong": ["goŋ"],
    "gou": ["gou"],
    "gu": ["gu"],
    "gua": ["gua"],
    "guai": ["guai"],
    "guan": ["guan"],
    "guang": ["guɑŋ"],
    "gui": ["guei"],
    "gun": ["gun"],
    "guo": ["guo"],
    "ha": ["xa"],
    "hai": ["xai"],
    "han": ["xan"],
    "hang": ["xɑŋ"],
    "hao": ["xaʌ"],
    "he": ["xø"],
    "hei": ["xei"],
    "hen": ["xœn"],
    "heng": ["xɵŋ"],
    "hong": ["xoŋ"],
    "hou": ["xou"],
    "hu": ["xu"],
    "hua": ["xua"],
    "huai": ["xuai"],
    "huan": ["xuan"],
    "huang": ["xuɑŋ"],
    "hui": ["xuei"],
    "hun": ["xun"],
    "huo": ["xuo"],
    "ji": ["dʑi"],
    "jia": ["dʑia"],
    "jian": ["dʑiɛn"],
    "jiang": ["dʑiɑŋ"],
    "jiao": ["dʑiaʌ"],
    "jie": ["dʑie"],
    "jin": ["dʑin"],
    "jing": ["dʑɨŋ"],
    "jiong": ["dʑioŋ"],
    "jiu": ["dʑio"],
    "ju": ["dʑy"],
    "juan": ["dʑyɛn"],
    "jue": ["dʑye"],
    "jun": ["dʑyn"],
    "ka": ["ka"],
    "kai": ["kai"],
    "kan": ["kan"],
    "kang": ["kɑŋ"],
    "kao": ["kaʌ"],
    "ke": ["kø"],
    "kei": ["kei"],
    "ken": ["kœn"],
    "keng": ["kɵŋ"],
    "kong": ["koŋ"],
    "kou": ["kou"],
    "ku": ["ku"],
    "kua": ["kua"],
    "kuai": ["kuai"],
    "kuan": ["kuan"],
    "kuang": ["kuɑŋ"],
    "kui": ["kuei"],
    "kun": ["kun"],
    "kuo": ["kuo"],
    "la": ["la"],
    "lai": ["lai"],
    "lan": ["lan"],
    "lang": ["lɑŋ"],
    "lao": ["laʌ"],
    "le": ["lø"],
    "lei": ["lei"],
    "leng": ["lɵŋ"],
    "li": ["li"],
    "lia": ["lia"],
    "lian": ["liɛn"],
    "liang": ["liɑŋ"],
    "liao": ["liaʌ"],
    "lie": ["lie"],
    "lin": ["lin"],
    "ling": ["lɨŋ"],
    "liu": ["lio"],
    "lo": ["lo"],
    "long": ["loŋ"],
    "lou": ["lou"],
    "lu": ["lu"],
    "lv": ["ly"],
    "luan": ["luan"],
    "lve": ["lye"],
    # NOTE(review): "nue" maps to "nye" (same as "nve") but "lue" maps to
    # "lue" rather than "lye" (the value of "lve"); confirm this is intended.
    "lue": ["lue"],
    "lun": ["lun"],
    "luo": ["luo"],
    "ma": ["ma"],
    "mai": ["mai"],
    "man": ["man"],
    "mang": ["mɑŋ"],
    "mao": ["maʌ"],
    "me": ["mø"],
    "mei": ["mei"],
    "men": ["mœn"],
    "meng": ["mɵŋ"],
    "mi": ["mi"],
    "mian": ["miɛn"],
    "miao": ["miaʌ"],
    "mie": ["mie"],
    "min": ["min"],
    "ming": ["mɨŋ"],
    "miu": ["mio"],
    "mo": ["mo"],
    "mou": ["mou"],
    "mu": ["mu"],
    "na": ["na"],
    "nai": ["nai"],
    "nan": ["nan"],
    "nang": ["nɑŋ"],
    "nao": ["naʌ"],
    "ne": ["nø"],
    "nei": ["nei"],
    "nen": ["nœn"],
    "neng": ["nɵŋ"],
    "ni": ["ni"],
    "nia": ["nia"],
    "nian": ["niɛn"],
    "niang": ["niɑŋ"],
    "niao": ["niaʌ"],
    "nie": ["nie"],
    "nin": ["nin"],
    "ning": ["nɨŋ"],
    "niu": ["nio"],
    "nong": ["noŋ"],
    "nou": ["nou"],
    "nu": ["nu"],
    "nv": ["ny"],
    "nuan": ["nuan"],
    "nve": ["nye"],
    "nue": ["nye"],
    "nuo": ["nuo"],
    "o": ["o"],
    "ou": ["ou"],
    "pa": ["pa"],
    "pai": ["pai"],
    "pan": ["pan"],
    "pang": ["pɑŋ"],
    "pao": ["paʌ"],
    "pe": ["pø"],
    "pei": ["pei"],
    "pen": ["pœn"],
    "peng": ["pɵŋ"],
    "pi": ["pi"],
    "pian": ["piɛn"],
    "piao": ["piaʌ"],
    "pie": ["pie"],
    "pin": ["pin"],
    "ping": ["pɨŋ"],
    "po": ["po"],
    "pou": ["pou"],
    "pu": ["pu"],
    "qi": ["tɕi"],
    "qia": ["tɕia"],
    "qian": ["tɕiɛn"],
    "qiang": ["tɕiɑŋ"],
    "qiao": ["tɕiaʌ"],
    "qie": ["tɕie"],
    "qin": ["tɕin"],
    "qing": ["tɕɨŋ"],
    "qiong": ["tɕioŋ"],
    "qiu": ["tɕio"],
    "qu": ["tɕy"],
    "quan": ["tɕyɛn"],
    "que": ["tɕye"],
    "qun": ["tɕyn"],
    "ran": ["ʐan"],
    "rang": ["ʐɑŋ"],
    "rao": ["ʐaʌ"],
    "re": ["ʐø"],
    "ren": ["ʐœn"],
    "reng": ["ʐɵŋ"],
    "ri": ["ʐʏ"],
    "rong": ["ʐoŋ"],
    "rou": ["ʐou"],
    "ru": ["ʐu"],
    "rua": ["ʐua"],
    "ruan": ["ʐuan"],
    "rui": ["ʐuei"],
    "run": ["ʐun"],
    "ruo": ["ʐuo"],
    "sa": ["sa"],
    "sai": ["sai"],
    "san": ["san"],
    "sang": ["sɑŋ"],
    "sao": ["saʌ"],
    "se": ["sø"],
    "sen": ["sœn"],
    "seng": ["sɵŋ"],
    "sha": ["ʂa"],
    "shai": ["ʂai"],
    "shan": ["ʂan"],
    "shang": ["ʂɑŋ"],
    "shao": ["ʂaʌ"],
    "she": ["ʂø"],
    "shei": ["ʂei"],
    "shen": ["ʂœn"],
    "sheng": ["ʂɵŋ"],
    "shi": ["ʂʏ"],
    "shou": ["ʂou"],
    "shu": ["ʂu"],
    "shua": ["ʂua"],
    "shuai": ["ʂuai"],
    "shuan": ["ʂuan"],
    "shuang": ["ʂuɑŋ"],
    "shui": ["ʂuei"],
    "shun": ["ʂun"],
    "shuo": ["ʂuo"],
    "si": ["sɪ"],
    "song": ["soŋ"],
    "sou": ["sou"],
    "su": ["su"],
    "suan": ["suan"],
    "sui": ["suei"],
    "sun": ["sun"],
    "suo": ["suo"],
    "ta": ["ta"],
    "tai": ["tai"],
    "tan": ["tan"],
    "tang": ["tɑŋ"],
    "tao": ["taʌ"],
    "te": ["tø"],
    "tei": ["tei"],
    "teng": ["tɵŋ"],
    "ti": ["ti"],
    "tian": ["tiɛn"],
    "tiao": ["tiaʌ"],
    "tie": ["tie"],
    "ting": ["tɨŋ"],
    "tong": ["toŋ"],
    "tou": ["tou"],
    "tu": ["tu"],
    "tuan": ["tuan"],
    "tui": ["tuei"],
    "tun": ["tun"],
    "tuo": ["tuo"],
    "wa": ["wa"],
    "wai": ["wai"],
    "wan": ["wan"],
    "wang": ["wɑŋ"],
    "wei": ["wei"],
    "wen": ["wœn"],
    "weng": ["wɵŋ"],
    "wo": ["wo"],
    "wu": ["wu"],
    "xi": ["ɕi"],
    "xia": ["ɕia"],
    "xian": ["ɕiɛn"],
    "xiang": ["ɕiɑŋ"],
    "xiao": ["ɕiaʌ"],
    "xie": ["ɕie"],
    "xin": ["ɕin"],
    "xing": ["ɕɨŋ"],
    "xiong": ["ɕioŋ"],
    "xiu": ["ɕio"],
    "xu": ["ɕy"],
    "xuan": ["ɕyɛn"],
    "xue": ["ɕye"],
    "xun": ["ɕyn"],
    "ya": ["ia"],
    "yan": ["iɛn"],
    "yang": ["iɑŋ"],
    "yao": ["iaʌ"],
    "ye": ["ie"],
    "yi": ["i"],
    "yin": ["in"],
    "ying": ["ɨŋ"],
    "yo": ["io"],
    "yong": ["ioŋ"],
    "you": ["io"],
    "yu": ["y"],
    "yuan": ["yɛn"],
    "yue": ["ye"],
    "yun": ["yn"],
    "za": ["dza"],
    "zai": ["dzai"],
    "zan": ["dzan"],
    "zang": ["dzɑŋ"],
    "zao": ["dzaʌ"],
    "ze": ["dzø"],
    "zei": ["dzei"],
    "zen": ["dzœn"],
    "zeng": ["dzɵŋ"],
    "zha": ["dʒa"],
    "zhai": ["dʒai"],
    "zhan": ["dʒan"],
    "zhang": ["dʒɑŋ"],
    "zhao": ["dʒaʌ"],
    "zhe": ["dʒø"],
    # "zhei": ["dʒei"], it doesn't exist
    "zhen": ["dʒœn"],
    "zheng": ["dʒɵŋ"],
    "zhi": ["dʒʏ"],
    "zhong": ["dʒoŋ"],
    "zhou": ["dʒou"],
    "zhu": ["dʒu"],
    "zhua": ["dʒua"],
    "zhuai": ["dʒuai"],
    "zhuan": ["dʒuan"],
    "zhuang": ["dʒuɑŋ"],
    "zhui": ["dʒuei"],
    "zhun": ["dʒun"],
    "zhuo": ["dʒuo"],
    "zi": ["dzɪ"],
    "zong": ["dzoŋ"],
    "zou": ["dzou"],
    "zu": ["dzu"],
    "zuan": ["dzuan"],
    "zui": ["dzuei"],
    "zun": ["dzun"],
    "zuo": ["dzuo"],
}
|
|
@ -219,6 +219,7 @@ def synthesis(model,
|
||||||
ap (TTS.tts.utils.audio.AudioProcessor): audio processor to process
|
ap (TTS.tts.utils.audio.AudioProcessor): audio processor to process
|
||||||
model outputs.
|
model outputs.
|
||||||
speaker_id (int): id of speaker
|
speaker_id (int): id of speaker
|
||||||
|
style_wav (str | Dict[str, float]): Uses for style embedding of GST.
|
||||||
style_wav (str): Uses for style embedding of GST.
|
style_wav (str): Uses for style embedding of GST.
|
||||||
truncated (bool): keep model states after inference. It can be used
|
truncated (bool): keep model states after inference. It can be used
|
||||||
for continuous inference at long texts.
|
for continuous inference at long texts.
|
||||||
|
|
|
@ -8,6 +8,7 @@ from phonemizer.phonemize import phonemize
|
||||||
from TTS.tts.utils.text import cleaners
|
from TTS.tts.utils.text import cleaners
|
||||||
from TTS.tts.utils.text.symbols import (_bos, _eos, _punctuations,
|
from TTS.tts.utils.text.symbols import (_bos, _eos, _punctuations,
|
||||||
make_symbols, phonemes, symbols)
|
make_symbols, phonemes, symbols)
|
||||||
|
from TTS.tts.utils.chinese_mandarin.phonemizer import chinese_text_to_phonemes
|
||||||
|
|
||||||
|
|
||||||
# pylint: disable=unnecessary-comprehension
|
# pylint: disable=unnecessary-comprehension
|
||||||
|
@ -29,8 +30,23 @@ PHONEME_PUNCTUATION_PATTERN = r'['+_punctuations.replace(' ', '')+']+'
|
||||||
|
|
||||||
def text2phone(text, language):
|
def text2phone(text, language):
|
||||||
'''
|
'''
|
||||||
Convert graphemes to phonemes.
|
Convert graphemes to phonemes. For most of the languages, it calls
|
||||||
|
the phonemizer python library that calls espeak/espeak-ng. For chinese
|
||||||
|
mandarin, it calls pypinyin + custom function for phonemizing
|
||||||
|
Parameters:
|
||||||
|
text (str): text to phonemize
|
||||||
|
language (str): language of the text
|
||||||
|
Returns:
|
||||||
|
ph (str): phonemes as a string separated by "|"
|
||||||
|
ph = "ɪ|g|ˈ|z|æ|m|p|ə|l"
|
||||||
'''
|
'''
|
||||||
|
|
||||||
|
# TO REVIEW : How to have a good implementation for this?
|
||||||
|
if language == "chinese-mandarin":
|
||||||
|
ph = chinese_text_to_phonemes(text)
|
||||||
|
return ph
|
||||||
|
|
||||||
|
|
||||||
seperator = phonemizer.separator.Separator(' |', '', '|')
|
seperator = phonemizer.separator.Separator(' |', '', '|')
|
||||||
#try:
|
#try:
|
||||||
punctuations = re.findall(PHONEME_PUNCTUATION_PATTERN, text)
|
punctuations = re.findall(PHONEME_PUNCTUATION_PATTERN, text)
|
||||||
|
|
|
@ -15,6 +15,8 @@ from unidecode import unidecode
|
||||||
from .number_norm import normalize_numbers
|
from .number_norm import normalize_numbers
|
||||||
from .abbreviations import abbreviations_en, abbreviations_fr
|
from .abbreviations import abbreviations_en, abbreviations_fr
|
||||||
from .time import expand_time_english
|
from .time import expand_time_english
|
||||||
|
from TTS.tts.utils.chinese_mandarin.numbers import replace_numbers_to_characters_in_text
|
||||||
|
|
||||||
|
|
||||||
# Regular expression matching whitespace:
|
# Regular expression matching whitespace:
|
||||||
_whitespace_re = re.compile(r'\s+')
|
_whitespace_re = re.compile(r'\s+')
|
||||||
|
@ -122,6 +124,13 @@ def portuguese_cleaners(text):
|
||||||
text = collapse_whitespace(text)
|
text = collapse_whitespace(text)
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
def chinese_mandarin_cleaners(text: str) -> str:
    '''Basic pipeline for Chinese Mandarin: passes the text through
    replace_numbers_to_characters_in_text and returns the result.'''
    return replace_numbers_to_characters_in_text(text)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def phoneme_cleaners(text):
|
def phoneme_cleaners(text):
|
||||||
'''Pipeline for phonemes mode, including number and abbreviation expansion.'''
|
'''Pipeline for phonemes mode, including number and abbreviation expansion.'''
|
||||||
text = expand_numbers(text)
|
text = expand_numbers(text)
|
||||||
|
|
|
@ -122,6 +122,13 @@ class Synthesizer(object):
|
||||||
speaker_embedding = self.init_speaker(speaker_idx)
|
speaker_embedding = self.init_speaker(speaker_idx)
|
||||||
use_gl = self.vocoder_model is None
|
use_gl = self.vocoder_model is None
|
||||||
|
|
||||||
|
|
||||||
|
# check if compute gst style
|
||||||
|
gst_style_input = None
|
||||||
|
if self.tts_config.use_gst:
|
||||||
|
if self.tts_config.gst["gst_style_input"] not in ["", {}]:
|
||||||
|
style_wav = self.tts_config.gst["gst_style_input"]
|
||||||
|
|
||||||
for sen in sens:
|
for sen in sens:
|
||||||
# synthesize voice
|
# synthesize voice
|
||||||
waveform, _, _, mel_postnet_spec, _, _ = synthesis(
|
waveform, _, _, mel_postnet_spec, _, _ = synthesis(
|
||||||
|
@ -131,7 +138,7 @@ class Synthesizer(object):
|
||||||
self.use_cuda,
|
self.use_cuda,
|
||||||
self.ap,
|
self.ap,
|
||||||
speaker_idx,
|
speaker_idx,
|
||||||
None,
|
gst_style_input,
|
||||||
False,
|
False,
|
||||||
self.tts_config.enable_eos_bos_chars,
|
self.tts_config.enable_eos_bos_chars,
|
||||||
use_gl,
|
use_gl,
|
||||||
|
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue