Merge pull request #654 from kirianguiller/chinese-implementation

Chinese implementation (merge into dev)
This commit is contained in:
Eren Gölge 2021-02-18 17:15:32 +01:00 committed by GitHub
commit e4f81d6856
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 1182 additions and 3 deletions

View File

@ -75,6 +75,16 @@
"contact":"erengolge@gmail.com" "contact":"erengolge@gmail.com"
} }
} }
},
"zh":{
"baker":{
"tacotron2-DDC-GST":{
"model_file": "1RR9rZdV_FMm8yvtCHALtUbJf1nxbUiAw",
"config_file": "1daY1JHGXEozJ-MGYLiWEUmzEwEvM5xpz",
"stats_file": "1vl9c-D3dW_E7pdhNpDFQLX-giJc0jOtV",
"commit": ""
}
}
    }
},
"vocoder_models":{

View File

@ -3,6 +3,7 @@ from glob import glob
import re
import sys
from pathlib import Path
from typing import List, Tuple
from tqdm import tqdm
@ -368,3 +369,24 @@ def _voxcel_x(root_path, meta_file, voxcel_idx):
    with open(str(cache_to), 'r') as f:
        return [x.strip().split('|') for x in f.readlines()]

def baker(root_path: str, meta_file: str) -> List[List[str]]:
    """Normalizes the Baker metadata file to the TTS format.

    Args:
        root_path (str): path to the Baker dataset
        meta_file (str): name of the metadata file listing the wav files to select and the transcript of each sentence

    Returns:
        List[List[str]]: list of [text, wav_path, speaker_name] entries, one per sentence
    """
    txt_file = os.path.join(root_path, meta_file)
    items = []
    speaker_name = "baker"
    with open(txt_file, 'r') as ttf:
        for line in ttf:
            wav_name, text = line.rstrip('\n').split("|")
            wav_path = os.path.join(root_path, "clips_22", wav_name)
            items.append([text, wav_path, speaker_name])
    return items
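
A usage sketch for the new formatter; the dataset root and metadata lines are hypothetical, but the "wav_name|transcript" layout is exactly what baker() splits on:

# metadata file, one utterance per line:
#   000001.wav|卡尔普陪外孙玩滑梯。
#   000002.wav|假语村言别再拥抱我。
items = baker("/data/BZNSYP", "metadata.csv")
print(items[0])
# ['卡尔普陪外孙玩滑梯。', '/data/BZNSYP/clips_22/000001.wav', 'baker']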

View File

@ -0,0 +1,131 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Licensed under WTFPL or the Unlicense or CC0.
# This uses Python 3, but it's easy to port to Python 2 by changing
# strings to u'xx'.
import re
import itertools

def _num2chinese(num: str, big=False, simp=True, o=False, twoalt=False) -> str:
    """Convert Arabic numerals (0-9) into Chinese hanzi numbers (〇 -> 九).

    Args:
        num (str): Arabic number to convert
        big (bool, optional): use financial characters. Defaults to False.
        simp (bool, optional): use simplified characters instead of traditional ones. Defaults to True.
        o (bool, optional): use 〇 for 'zero'. Defaults to False.
        twoalt (bool, optional): use 两/兩 for 'two' when appropriate. Defaults to False.

    Raises:
        ValueError: if the number is 1e48 or larger
        ValueError: if the number uses an 'e' exponent (scientific notation)

    Returns:
        str: the number converted to hanzi characters
    """
# check num first
nd = str(num)
if abs(float(nd)) >= 1e48:
raise ValueError('number out of range')
elif 'e' in nd:
raise ValueError('scientific notation is not supported')
    c_symbol = '正负点' if simp else '正負點'
    if o:  # formal
        twoalt = False
    if big:
        c_basic = '零壹贰叁肆伍陆柒捌玖' if simp else '零壹貳參肆伍陸柒捌玖'
        c_unit1 = '拾佰仟'
        c_twoalt = '贰' if simp else '貳'
    else:
        c_basic = '〇一二三四五六七八九' if o else '零一二三四五六七八九'
        c_unit1 = '十百千'
        if twoalt:
            c_twoalt = '两' if simp else '兩'
        else:
            c_twoalt = '二'
    c_unit2 = '万亿兆京垓秭穰沟涧正载' if simp else '萬億兆京垓秭穰溝澗正載'
revuniq = lambda l: ''.join(k for k, g in itertools.groupby(reversed(l)))
nd = str(num)
result = []
if nd[0] == '+':
result.append(c_symbol[0])
elif nd[0] == '-':
result.append(c_symbol[1])
if '.' in nd:
integer, remainder = nd.lstrip('+-').split('.')
else:
integer, remainder = nd.lstrip('+-'), None
if int(integer):
splitted = [integer[max(i - 4, 0):i]
for i in range(len(integer), 0, -4)]
intresult = []
for nu, unit in enumerate(splitted):
# special cases
if int(unit) == 0: # 0000
intresult.append(c_basic[0])
continue
elif nu > 0 and int(unit) == 2: # 0002
intresult.append(c_twoalt + c_unit2[nu - 1])
continue
ulist = []
unit = unit.zfill(4)
for nc, ch in enumerate(reversed(unit)):
if ch == '0':
if ulist: # ???0
ulist.append(c_basic[0])
elif nc == 0:
ulist.append(c_basic[int(ch)])
elif nc == 1 and ch == '1' and unit[1] == '0':
# special case for tens
# edit the 'elif' if you don't like
# 十四, 三千零十四, 三千三百一十四
ulist.append(c_unit1[0])
elif nc > 1 and ch == '2':
ulist.append(c_twoalt + c_unit1[nc - 1])
else:
ulist.append(c_basic[int(ch)] + c_unit1[nc - 1])
ustr = revuniq(ulist)
if nu == 0:
intresult.append(ustr)
else:
intresult.append(ustr + c_unit2[nu - 1])
result.append(revuniq(intresult).strip(c_basic[0]))
else:
result.append(c_basic[0])
if remainder:
result.append(c_symbol[2])
result.append(''.join(c_basic[int(ch)] for ch in remainder))
return ''.join(result)

def _number_replace(match) -> str:
    """Replacement callback for re.sub: converts the matched Arabic-number string to Chinese characters.

    Args:
        match (re.Match): regex match containing the digits

    Returns:
        str: the matched number rewritten as Chinese characters
    """
    match_str: str = match.group()
    return _num2chinese(match_str)

def replace_numbers_to_characters_in_text(text: str) -> str:
    """Replace all Arabic numbers in a text with their Chinese-character equivalents (simplified).

    Args:
        text (str): input text to transform

    Returns:
        str: transformed text
    """
    text = re.sub(r'[0-9]+', _number_replace, text)
    return text
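
Tracing the code above gives deterministic conversions; a few spot checks:

print(_num2chinese("1182"))  # 一千一百八十二
print(_num2chinese("205"))   # 二百零五 (interior zeros collapse to a single 零)
print(replace_numbers_to_characters_in_text("共1182行"))  # 共一千一百八十二行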

View File

@ -0,0 +1,41 @@
from typing import List
import pypinyin
from .pinyinToPhonemes import PINYIN_DICT
import jieba
def _chinese_character_to_pinyin(text: str) -> List[str]:
pinyins = pypinyin.pinyin(
text, style=pypinyin.Style.TONE3, heteronym=False, neutral_tone_with_five=True
)
pinyins_flat_list = [item for sublist in pinyins for item in sublist]
return pinyins_flat_list
def _chinese_pinyin_to_phoneme(pinyin: str) -> str:
segment = pinyin[:-1]
tone = pinyin[-1]
phoneme = PINYIN_DICT.get(segment, [""])[0]
return phoneme + tone
def chinese_text_to_phonemes(text: str) -> str:
tokenized_text = jieba.cut(text, HMM=False)
tokenized_text = " ".join(tokenized_text)
pinyined_text: List[str] = _chinese_character_to_pinyin(tokenized_text)
results: List[str] = []
for token in pinyined_text:
if token[-1] in "12345": # TODO transform to is_pinyin()
pinyin_phonemes = _chinese_pinyin_to_phoneme(token)
results += list(pinyin_phonemes)
        else:  # is punctuation or other
results += list(token)
return "|".join(results)

View File

@ -0,0 +1,420 @@
PINYIN_DICT = {
"a": ["a"],
"ai": ["ai"],
"an": ["an"],
"ang": ["ɑŋ"],
"ao": [""],
"ba": ["ba"],
"bai": ["bai"],
"ban": ["ban"],
"bang": ["bɑŋ"],
"bao": ["baʌ"],
# "be": ["be"], doesnt exist
"bei": ["bɛi"],
"ben": ["bœn"],
"beng": ["bɵŋ"],
"bi": ["bi"],
"bian": ["biɛn"],
"biao": ["biaʌ"],
"bie": ["bie"],
"bin": ["bin"],
"bing": ["bɨŋ"],
"bo": ["bo"],
"bu": ["bu"],
"ca": ["tsa"],
"cai": ["tsai"],
"can": ["tsan"],
"cang": ["tsɑŋ"],
"cao": ["tsaʌ"],
"ce": ["tsø"],
"cen": ["tsœn"],
"ceng": ["tsɵŋ"],
"cha": ["ʈʂa"],
"chai": ["ʈʂai"],
"chan": ["ʈʂan"],
"chang": ["ʈʂɑŋ"],
"chao": ["ʈʂaʌ"],
"che": ["ʈʂø"],
"chen": ["ʈʂœn"],
"cheng": ["ʈʂɵŋ"],
"chi": ["ʈʂʏ"],
"chong": ["ʈʂoŋ"],
"chou": ["ʈʂou"],
"chu": ["ʈʂu"],
"chua": ["ʈʂua"],
"chuai": ["ʈʂuai"],
"chuan": ["ʈʂuan"],
"chuang": ["ʈʂuɑŋ"],
"chui": ["ʈʂuei"],
"chun": ["ʈʂun"],
"chuo": ["ʈʂuo"],
"ci": ["tsɪ"],
"cong": ["tsoŋ"],
"cou": ["tsou"],
"cu": ["tsu"],
"cuan": ["tsuan"],
"cui": ["tsuei"],
"cun": ["tsun"],
"cuo": ["tsuo"],
"da": ["da"],
"dai": ["dai"],
"dan": ["dan"],
"dang": ["dɑŋ"],
"dao": ["daʌ"],
"de": [""],
"dei": ["dei"],
# "den": ["dœn"],
"deng": ["dɵŋ"],
"di": ["di"],
"dia": ["dia"],
"dian": ["diɛn"],
"diao": ["diaʌ"],
"die": ["die"],
"ding": ["dɨŋ"],
"diu": ["dio"],
"dong": ["doŋ"],
"dou": ["dou"],
"du": ["du"],
"duan": ["duan"],
"dui": ["duei"],
"dun": ["dun"],
"duo": ["duo"],
"e": ["ø"],
"ei": ["ei"],
"en": ["œn"],
# "ng": ["œn"],
# "eng": ["ɵŋ"],
"er": ["er"],
"fa": ["fa"],
"fan": ["fan"],
"fang": ["fɑŋ"],
"fei": ["fei"],
"fen": ["fœn"],
"feng": ["fɵŋ"],
"fo": ["fo"],
"fou": ["fou"],
"fu": ["fu"],
"ga": ["ga"],
"gai": ["gai"],
"gan": ["gan"],
"gang": ["gɑŋ"],
"gao": ["gaʌ"],
"ge": [""],
"gei": ["gei"],
"gen": ["gœn"],
"geng": ["gɵŋ"],
"gong": ["goŋ"],
"gou": ["gou"],
"gu": ["gu"],
"gua": ["gua"],
"guai": ["guai"],
"guan": ["guan"],
"guang": ["guɑŋ"],
"gui": ["guei"],
"gun": ["gun"],
"guo": ["guo"],
"ha": ["xa"],
"hai": ["xai"],
"han": ["xan"],
"hang": ["xɑŋ"],
"hao": ["xaʌ"],
"he": [""],
"hei": ["xei"],
"hen": ["xœn"],
"heng": ["xɵŋ"],
"hong": ["xoŋ"],
"hou": ["xou"],
"hu": ["xu"],
"hua": ["xua"],
"huai": ["xuai"],
"huan": ["xuan"],
"huang": ["xuɑŋ"],
"hui": ["xuei"],
"hun": ["xun"],
"huo": ["xuo"],
"ji": ["dʑi"],
"jia": ["dʑia"],
"jian": ["dʑiɛn"],
"jiang": ["dʑiɑŋ"],
"jiao": ["dʑiaʌ"],
"jie": ["dʑie"],
"jin": ["dʑin"],
"jing": ["dʑɨŋ"],
"jiong": ["dʑioŋ"],
"jiu": ["dʑio"],
"ju": ["dʑy"],
"juan": ["dʑyɛn"],
"jue": ["dʑye"],
"jun": ["dʑyn"],
"ka": ["ka"],
"kai": ["kai"],
"kan": ["kan"],
"kang": ["kɑŋ"],
"kao": ["kaʌ"],
"ke": [""],
"kei": ["kei"],
"ken": ["kœn"],
"keng": ["kɵŋ"],
"kong": ["koŋ"],
"kou": ["kou"],
"ku": ["ku"],
"kua": ["kua"],
"kuai": ["kuai"],
"kuan": ["kuan"],
"kuang": ["kuɑŋ"],
"kui": ["kuei"],
"kun": ["kun"],
"kuo": ["kuo"],
"la": ["la"],
"lai": ["lai"],
"lan": ["lan"],
"lang": ["lɑŋ"],
"lao": ["laʌ"],
"le": [""],
"lei": ["lei"],
"leng": ["lɵŋ"],
"li": ["li"],
"lia": ["lia"],
"lian": ["liɛn"],
"liang": ["liɑŋ"],
"liao": ["liaʌ"],
"lie": ["lie"],
"lin": ["lin"],
"ling": ["lɨŋ"],
"liu": ["lio"],
"lo": ["lo"],
"long": ["loŋ"],
"lou": ["lou"],
"lu": ["lu"],
"lv": ["ly"],
"luan": ["luan"],
"lve": ["lye"],
"lue": ["lue"],
"lun": ["lun"],
"luo": ["luo"],
"ma": ["ma"],
"mai": ["mai"],
"man": ["man"],
"mang": ["mɑŋ"],
"mao": ["maʌ"],
"me": [""],
"mei": ["mei"],
"men": ["mœn"],
"meng": ["mɵŋ"],
"mi": ["mi"],
"mian": ["miɛn"],
"miao": ["miaʌ"],
"mie": ["mie"],
"min": ["min"],
"ming": ["mɨŋ"],
"miu": ["mio"],
"mo": ["mo"],
"mou": ["mou"],
"mu": ["mu"],
"na": ["na"],
"nai": ["nai"],
"nan": ["nan"],
"nang": ["nɑŋ"],
"nao": ["naʌ"],
"ne": [""],
"nei": ["nei"],
"nen": ["nœn"],
"neng": ["nɵŋ"],
"ni": ["ni"],
"nia": ["nia"],
"nian": ["niɛn"],
"niang": ["niɑŋ"],
"niao": ["niaʌ"],
"nie": ["nie"],
"nin": ["nin"],
"ning": ["nɨŋ"],
"niu": ["nio"],
"nong": ["noŋ"],
"nou": ["nou"],
"nu": ["nu"],
"nv": ["ny"],
"nuan": ["nuan"],
"nve": ["nye"],
"nue": ["nye"],
"nuo": ["nuo"],
"o": ["o"],
"ou": ["ou"],
"pa": ["pa"],
"pai": ["pai"],
"pan": ["pan"],
"pang": ["pɑŋ"],
"pao": ["paʌ"],
"pe": [""],
"pei": ["pei"],
"pen": ["pœn"],
"peng": ["pɵŋ"],
"pi": ["pi"],
"pian": ["piɛn"],
"piao": ["piaʌ"],
"pie": ["pie"],
"pin": ["pin"],
"ping": ["pɨŋ"],
"po": ["po"],
"pou": ["pou"],
"pu": ["pu"],
"qi": ["tɕi"],
"qia": ["tɕia"],
"qian": ["tɕiɛn"],
"qiang": ["tɕiɑŋ"],
"qiao": ["tɕiaʌ"],
"qie": ["tɕie"],
"qin": ["tɕin"],
"qing": ["tɕɨŋ"],
"qiong": ["tɕioŋ"],
"qiu": ["tɕio"],
"qu": ["tɕy"],
"quan": ["tɕyɛn"],
"que": ["tɕye"],
"qun": ["tɕyn"],
"ran": ["ʐan"],
"rang": ["ʐɑŋ"],
"rao": ["ʐaʌ"],
"re": ["ʐø"],
"ren": ["ʐœn"],
"reng": ["ʐɵŋ"],
"ri": ["ʐʏ"],
"rong": ["ʐoŋ"],
"rou": ["ʐou"],
"ru": ["ʐu"],
"rua": ["ʐua"],
"ruan": ["ʐuan"],
"rui": ["ʐuei"],
"run": ["ʐun"],
"ruo": ["ʐuo"],
"sa": ["sa"],
"sai": ["sai"],
"san": ["san"],
"sang": ["sɑŋ"],
"sao": ["saʌ"],
"se": [""],
"sen": ["sœn"],
"seng": ["sɵŋ"],
"sha": ["ʂa"],
"shai": ["ʂai"],
"shan": ["ʂan"],
"shang": ["ʂɑŋ"],
"shao": ["ʂaʌ"],
"she": ["ʂø"],
"shei": ["ʂei"],
"shen": ["ʂœn"],
"sheng": ["ʂɵŋ"],
"shi": ["ʂʏ"],
"shou": ["ʂou"],
"shu": ["ʂu"],
"shua": ["ʂua"],
"shuai": ["ʂuai"],
"shuan": ["ʂuan"],
"shuang": ["ʂuɑŋ"],
"shui": ["ʂuei"],
"shun": ["ʂun"],
"shuo": ["ʂuo"],
"si": ["sɪ"],
"song": ["soŋ"],
"sou": ["sou"],
"su": ["su"],
"suan": ["suan"],
"sui": ["suei"],
"sun": ["sun"],
"suo": ["suo"],
"ta": ["ta"],
"tai": ["tai"],
"tan": ["tan"],
"tang": ["tɑŋ"],
"tao": ["taʌ"],
"te": [""],
"tei": ["tei"],
"teng": ["tɵŋ"],
"ti": ["ti"],
"tian": ["tiɛn"],
"tiao": ["tiaʌ"],
"tie": ["tie"],
"ting": ["tɨŋ"],
"tong": ["toŋ"],
"tou": ["tou"],
"tu": ["tu"],
"tuan": ["tuan"],
"tui": ["tuei"],
"tun": ["tun"],
"tuo": ["tuo"],
"wa": ["wa"],
"wai": ["wai"],
"wan": ["wan"],
"wang": ["wɑŋ"],
"wei": ["wei"],
"wen": ["wœn"],
"weng": ["wɵŋ"],
"wo": ["wo"],
"wu": ["wu"],
"xi": ["ɕi"],
"xia": ["ɕia"],
"xian": ["ɕiɛn"],
"xiang": ["ɕiɑŋ"],
"xiao": ["ɕiaʌ"],
"xie": ["ɕie"],
"xin": ["ɕin"],
"xing": ["ɕɨŋ"],
"xiong": ["ɕioŋ"],
"xiu": ["ɕio"],
"xu": ["ɕy"],
"xuan": ["ɕyɛn"],
"xue": ["ɕye"],
"xun": ["ɕyn"],
"ya": ["ia"],
"yan": ["iɛn"],
"yang": ["iɑŋ"],
"yao": ["iaʌ"],
"ye": ["ie"],
"yi": ["i"],
"yin": ["in"],
"ying": ["ɨŋ"],
"yo": ["io"],
"yong": ["ioŋ"],
"you": ["io"],
"yu": ["y"],
"yuan": ["yɛn"],
"yue": ["ye"],
"yun": ["yn"],
"za": ["dza"],
"zai": ["dzai"],
"zan": ["dzan"],
"zang": ["dzɑŋ"],
"zao": ["dzaʌ"],
"ze": ["dzø"],
"zei": ["dzei"],
"zen": ["dzœn"],
"zeng": ["dzɵŋ"],
"zha": ["dʒa"],
"zhai": ["dʒai"],
"zhan": ["dʒan"],
"zhang": ["ɑŋ"],
"zhao": ["dʒaʌ"],
"zhe": ["dʒø"],
# "zhei": ["dʒei"], it doesn't exist
"zhen": ["dʒœn"],
"zheng": ["dʒɵŋ"],
"zhi": ["ʏ"],
"zhong": ["dʒoŋ"],
"zhou": ["dʒou"],
"zhu": ["dʒu"],
"zhua": ["dʒua"],
"zhuai": ["dʒuai"],
"zhuan": ["dʒuan"],
"zhuang": ["dʒuɑŋ"],
"zhui": ["dʒuei"],
"zhun": ["dʒun"],
"zhuo": ["dʒuo"],
"zi": ["dzɪ"],
"zong": ["dzoŋ"],
"zou": ["dzou"],
"zu": ["dzu"],
"zuan": ["dzuan"],
"zui": ["dzuei"],
"zun": ["dzun"],
"zuo": ["dzuo"],
}
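
A spot check tying this table to the helper defined in the phonemizer module above; "shi4" splits into the segment "shi" and the tone "4":

print(PINYIN_DICT["shi"][0])               # ʂʏ
print(_chinese_pinyin_to_phoneme("shi4"))  # ʂʏ4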

View File

@ -219,7 +219,7 @@ def synthesis(model,
        ap (TTS.tts.utils.audio.AudioProcessor): audio processor to process
            model outputs.
        speaker_id (int): id of speaker
        style_wav (str | Dict[str, float]): Used for the style embedding of GST.
        truncated (bool): keep model states after inference. It can be used
            for continuous inference at long texts.
        enable_eos_bos_chars (bool): enable special chars for end of sentence and start of sentence.
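
The widened type reflects that GST style conditioning accepts either a reference wav path or a dict of style-token weights. The dict below is an illustrative assumption (token index as string -> weight), not a value taken from this PR:

style_wav = "path/to/reference.wav"   # style copied from a reference clip
style_wav = {"0": 0.15, "5": -0.15}   # explicit style-token weights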

View File

@ -8,6 +8,7 @@ from phonemizer.phonemize import phonemize
from TTS.tts.utils.text import cleaners
from TTS.tts.utils.text.symbols import (_bos, _eos, _punctuations,
                                        make_symbols, phonemes, symbols)
from TTS.tts.utils.chinese_mandarin.phonemizer import chinese_text_to_phonemes

# pylint: disable=unnecessary-comprehension
@ -28,9 +29,23 @@ PHONEME_PUNCTUATION_PATTERN = r'['+_punctuations.replace(' ', '')+']+'
def text2phone(text, language):
    '''Convert graphemes to phonemes. For most languages it calls
    the phonemizer python library, which wraps espeak/espeak-ng. For Chinese
    mandarin it calls pypinyin plus a custom phonemizing function.

    Parameters:
        text (str): text to phonemize
        language (str): language of the text

    Returns:
        ph (str): phonemes as a string separated by "|"
            ph = "ɪ|g|ˈ|z|æ|m|p|ə|l"
    '''

    # TO REVIEW : How to have a good implementation for this?
    if language == "chinese-mandarin":
        ph = chinese_text_to_phonemes(text)
        return ph
    seperator = phonemizer.separator.Separator(' |', '', '|')
    #try:
    punctuations = re.findall(PHONEME_PUNCTUATION_PATTERN, text)
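
With the new branch, a "chinese-mandarin" call never reaches the phonemizer library; indicatively (same caveats about library versions as above):

ph = text2phone("中文", "chinese-mandarin")
# same "|"-separated output as chinese_text_to_phonemes, e.g. "d|ʒ|o|ŋ|1|w|œ|n|2"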

View File

@ -15,6 +15,8 @@ from unidecode import unidecode
from .number_norm import normalize_numbers
from .abbreviations import abbreviations_en, abbreviations_fr
from .time import expand_time_english
from TTS.tts.utils.chinese_mandarin.numbers import replace_numbers_to_characters_in_text

# Regular expression matching whitespace:
_whitespace_re = re.compile(r'\s+')
@ -122,6 +124,13 @@ def portuguese_cleaners(text):
    text = collapse_whitespace(text)
    return text

def chinese_mandarin_cleaners(text: str) -> str:
    '''Basic pipeline for Chinese mandarin: expands Arabic numbers into hanzi.'''
    text = replace_numbers_to_characters_in_text(text)
    return text

def phoneme_cleaners(text):
    '''Pipeline for phonemes mode, including number and abbreviation expansion.'''
    text = expand_numbers(text)
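
A minimal check of the new cleaner above, deterministic given the numbers module added in this PR:

print(chinese_mandarin_cleaners("共1182行"))  # 共一千一百八十二行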

File diff suppressed because one or more lines are too long

View File

@ -6,6 +6,8 @@ numba==0.48
librosa==0.7.2
phonemizer>=2.2.0
unidecode==0.4.20
pypinyin
jieba
tensorboardX
matplotlib
Pillow