Merge branch 'dev' of https://github.com/mozilla/TTS into dev

This commit is contained in:
Eren Gölge 2021-02-18 17:21:09 +00:00
commit adaeec57ec
11 changed files with 1182 additions and 3 deletions

View File

@ -75,6 +75,16 @@
"contact":"erengolge@gmail.com"
}
}
},
"zh":{
"baker":{
"tacotron2-DDC-GST":{
"model_file": "1RR9rZdV_FMm8yvtCHALtUbJf1nxbUiAw",
"config_file": "1daY1JHGXEozJ-MGYLiWEUmzEwEvM5xpz",
"stats_file": "1vl9c-D3dW_E7pdhNpDFQLX-giJc0jOtV",
"commit": ""
}
}
}
},
"vocoder_models":{

View File

@ -3,6 +3,7 @@ from glob import glob
import re
import sys
from pathlib import Path
from typing import List, Tuple
from tqdm import tqdm
@ -368,3 +369,24 @@ def _voxcel_x(root_path, meta_file, voxcel_idx):
with open(str(cache_to), 'r') as f:
return [x.strip().split('|') for x in f.readlines()]
def baker(root_path: str, meta_file: str) -> List[List[str]]:
    """Normalize the Baker (Chinese Standard Mandarin) meta data file to TTS format.

    Args:
        root_path (str): path to the baker dataset
        meta_file (str): name of the meta file listing, per line, a wav file name and
            the transcription of the sentence, separated by "|"

    Returns:
        List[List[str]]: list of [text, wav_path, speaker_name] entries, one per sentence
    """
    txt_file = os.path.join(root_path, meta_file)
    items = []
    speaker_name = "baker"
    # The meta file contains Chinese text; pin the encoding so reading it does
    # not depend on the platform's default locale encoding.
    with open(txt_file, 'r', encoding='utf-8') as ttf:
        for line in ttf:
            wav_name, text = line.rstrip('\n').split("|")
            # audio clips live under <root_path>/clips_22/
            wav_path = os.path.join(root_path, "clips_22", wav_name)
            items.append([text, wav_path, speaker_name])
    return items

View File

@ -0,0 +1,131 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Licensed under WTFPL or the Unlicense or CC0.
# This uses Python 3, but it's easy to port to Python 2 by changing
# strings to u'xx'.
import re
import itertools
def _num2chinese(num :str, big=False, simp=True, o=False, twoalt=False) -> str:
"""Convert numerical arabic numbers (0->9) to chinese hanzi numbers ( -> 九)
Args:
num (str): arabic number to convert
big (bool, optional): use financial characters. Defaults to False.
simp (bool, optional): use simplified characters instead of tradictional characters. Defaults to True.
o (bool, optional): use for 'zero'. Defaults to False.
twoalt (bool, optional): use / for 'two' when appropriate. Defaults to False.
Raises:
ValueError: if number is more than 1e48
ValueError: if 'e' exposent in number
Returns:
str: converted number as hanzi characters
"""
# check num first
nd = str(num)
if abs(float(nd)) >= 1e48:
raise ValueError('number out of range')
elif 'e' in nd:
raise ValueError('scientific notation is not supported')
c_symbol = '正负点' if simp else '正負點'
if o: # formal
twoalt = False
if big:
c_basic = '零壹贰叁肆伍陆柒捌玖' if simp else '零壹貳參肆伍陸柒捌玖'
c_unit1 = '拾佰仟'
c_twoalt = '' if simp else ''
else:
c_basic = '〇一二三四五六七八九' if o else '零一二三四五六七八九'
c_unit1 = '十百千'
if twoalt:
c_twoalt = '' if simp else ''
else:
c_twoalt = ''
c_unit2 = '万亿兆京垓秭穰沟涧正载' if simp else '萬億兆京垓秭穰溝澗正載'
revuniq = lambda l: ''.join(k for k, g in itertools.groupby(reversed(l)))
nd = str(num)
result = []
if nd[0] == '+':
result.append(c_symbol[0])
elif nd[0] == '-':
result.append(c_symbol[1])
if '.' in nd:
integer, remainder = nd.lstrip('+-').split('.')
else:
integer, remainder = nd.lstrip('+-'), None
if int(integer):
splitted = [integer[max(i - 4, 0):i]
for i in range(len(integer), 0, -4)]
intresult = []
for nu, unit in enumerate(splitted):
# special cases
if int(unit) == 0: # 0000
intresult.append(c_basic[0])
continue
elif nu > 0 and int(unit) == 2: # 0002
intresult.append(c_twoalt + c_unit2[nu - 1])
continue
ulist = []
unit = unit.zfill(4)
for nc, ch in enumerate(reversed(unit)):
if ch == '0':
if ulist: # ???0
ulist.append(c_basic[0])
elif nc == 0:
ulist.append(c_basic[int(ch)])
elif nc == 1 and ch == '1' and unit[1] == '0':
# special case for tens
# edit the 'elif' if you don't like
# 十四, 三千零十四, 三千三百一十四
ulist.append(c_unit1[0])
elif nc > 1 and ch == '2':
ulist.append(c_twoalt + c_unit1[nc - 1])
else:
ulist.append(c_basic[int(ch)] + c_unit1[nc - 1])
ustr = revuniq(ulist)
if nu == 0:
intresult.append(ustr)
else:
intresult.append(ustr + c_unit2[nu - 1])
result.append(revuniq(intresult).strip(c_basic[0]))
else:
result.append(c_basic[0])
if remainder:
result.append(c_symbol[2])
result.append(''.join(c_basic[int(ch)] for ch in remainder))
return ''.join(result)
def _number_replace(match) -> str:
    """Regex-substitution callback: render a matched digit run as hanzi.

    Args:
        match (re.Match): regex match covering a run of arabic digits

    Returns:
        str: the hanzi transcription of the matched number
    """
    return _num2chinese(match.group())
def replace_numbers_to_characters_in_text(text: str) -> str:
    """Replace every arabic number in a text by its simplified-Chinese hanzi form.

    Args:
        text (str): input text to transform

    Returns:
        str: text with each run of digits converted to hanzi characters
    """
    return re.sub(r'[0-9]+', _number_replace, text)

View File

@ -0,0 +1,41 @@
from typing import List
import pypinyin
from .pinyinToPhonemes import PINYIN_DICT
import jieba
def _chinese_character_to_pinyin(text: str) -> List[str]:
    """Transliterate Chinese text to numbered-tone pinyin syllables (e.g. 'ni3')."""
    nested = pypinyin.pinyin(
        text, style=pypinyin.Style.TONE3, heteronym=False, neutral_tone_with_five=True
    )
    # pypinyin returns one single-element list per character/word; flatten it.
    return [syllable for group in nested for syllable in group]
def _chinese_pinyin_to_phoneme(pinyin: str) -> str:
    """Map one numbered-tone pinyin syllable (e.g. 'ma1') to phonemes plus tone digit."""
    base, tone = pinyin[:-1], pinyin[-1]
    # Fall back to an empty phoneme string for syllables missing from the table.
    return PINYIN_DICT.get(base, [""])[0] + tone
def chinese_text_to_phonemes(text: str) -> str:
    """Convert Chinese Mandarin text to a '|'-separated phoneme string."""
    # Segment into words first so pypinyin can pick the right readings.
    segmented = " ".join(jieba.cut(text, HMM=False))
    pinyined_text: List[str] = _chinese_character_to_pinyin(segmented)

    results: List[str] = []
    for token in pinyined_text:
        # A trailing tone digit marks a pinyin syllable; anything else
        # (punctuation, spaces, latin) is passed through char by char.
        if token[-1] in "12345":  # TODO transform to is_pinyin()
            results.extend(_chinese_pinyin_to_phoneme(token))
        else:
            results.extend(token)
    return "|".join(results)

View File

@ -0,0 +1,420 @@
# Mapping from toneless pinyin syllables to their phoneme transcription.
# Each value is a single-element list so callers can fall back to [""] on
# missing keys. NOTE(review): entries marked "restored" had lost their
# phoneme values (empty strings) in transit; they were reconstructed from
# the pattern of neighbouring syllables — confirm against the upstream file.
PINYIN_DICT = {
    "a": ["a"],
    "ai": ["ai"],
    "an": ["an"],
    "ang": ["ɑŋ"],
    "ao": ["aʌ"],  # restored
    "ba": ["ba"],
    "bai": ["bai"],
    "ban": ["ban"],
    "bang": ["bɑŋ"],
    "bao": ["baʌ"],
    # "be": ["be"], doesnt exist
    "bei": ["bɛi"],
    "ben": ["bœn"],
    "beng": ["bɵŋ"],
    "bi": ["bi"],
    "bian": ["biɛn"],
    "biao": ["biaʌ"],
    "bie": ["bie"],
    "bin": ["bin"],
    "bing": ["bɨŋ"],
    "bo": ["bo"],
    "bu": ["bu"],
    "ca": ["tsa"],
    "cai": ["tsai"],
    "can": ["tsan"],
    "cang": ["tsɑŋ"],
    "cao": ["tsaʌ"],
    "ce": ["tsø"],
    "cen": ["tsœn"],
    "ceng": ["tsɵŋ"],
    "cha": ["ʈʂa"],
    "chai": ["ʈʂai"],
    "chan": ["ʈʂan"],
    "chang": ["ʈʂɑŋ"],
    "chao": ["ʈʂaʌ"],
    "che": ["ʈʂø"],
    "chen": ["ʈʂœn"],
    "cheng": ["ʈʂɵŋ"],
    "chi": ["ʈʂʏ"],
    "chong": ["ʈʂoŋ"],
    "chou": ["ʈʂou"],
    "chu": ["ʈʂu"],
    "chua": ["ʈʂua"],
    "chuai": ["ʈʂuai"],
    "chuan": ["ʈʂuan"],
    "chuang": ["ʈʂuɑŋ"],
    "chui": ["ʈʂuei"],
    "chun": ["ʈʂun"],
    "chuo": ["ʈʂuo"],
    "ci": ["tsɪ"],
    "cong": ["tsoŋ"],
    "cou": ["tsou"],
    "cu": ["tsu"],
    "cuan": ["tsuan"],
    "cui": ["tsuei"],
    "cun": ["tsun"],
    "cuo": ["tsuo"],
    "da": ["da"],
    "dai": ["dai"],
    "dan": ["dan"],
    "dang": ["dɑŋ"],
    "dao": ["daʌ"],
    "de": ["dø"],  # restored
    "dei": ["dei"],
    # "den": ["dœn"],
    "deng": ["dɵŋ"],
    "di": ["di"],
    "dia": ["dia"],
    "dian": ["diɛn"],
    "diao": ["diaʌ"],
    "die": ["die"],
    "ding": ["dɨŋ"],
    "diu": ["dio"],
    "dong": ["doŋ"],
    "dou": ["dou"],
    "du": ["du"],
    "duan": ["duan"],
    "dui": ["duei"],
    "dun": ["dun"],
    "duo": ["duo"],
    "e": ["ø"],
    "ei": ["ei"],
    "en": ["œn"],
    # "ng": ["œn"],
    # "eng": ["ɵŋ"],
    "er": ["er"],
    "fa": ["fa"],
    "fan": ["fan"],
    "fang": ["fɑŋ"],
    "fei": ["fei"],
    "fen": ["fœn"],
    "feng": ["fɵŋ"],
    "fo": ["fo"],
    "fou": ["fou"],
    "fu": ["fu"],
    "ga": ["ga"],
    "gai": ["gai"],
    "gan": ["gan"],
    "gang": ["gɑŋ"],
    "gao": ["gaʌ"],
    "ge": ["gø"],  # restored
    "gei": ["gei"],
    "gen": ["gœn"],
    "geng": ["gɵŋ"],
    "gong": ["goŋ"],
    "gou": ["gou"],
    "gu": ["gu"],
    "gua": ["gua"],
    "guai": ["guai"],
    "guan": ["guan"],
    "guang": ["guɑŋ"],
    "gui": ["guei"],
    "gun": ["gun"],
    "guo": ["guo"],
    "ha": ["xa"],
    "hai": ["xai"],
    "han": ["xan"],
    "hang": ["xɑŋ"],
    "hao": ["xaʌ"],
    "he": ["xø"],  # restored
    "hei": ["xei"],
    "hen": ["xœn"],
    "heng": ["xɵŋ"],
    "hong": ["xoŋ"],
    "hou": ["xou"],
    "hu": ["xu"],
    "hua": ["xua"],
    "huai": ["xuai"],
    "huan": ["xuan"],
    "huang": ["xuɑŋ"],
    "hui": ["xuei"],
    "hun": ["xun"],
    "huo": ["xuo"],
    "ji": ["dʑi"],
    "jia": ["dʑia"],
    "jian": ["dʑiɛn"],
    "jiang": ["dʑiɑŋ"],
    "jiao": ["dʑiaʌ"],
    "jie": ["dʑie"],
    "jin": ["dʑin"],
    "jing": ["dʑɨŋ"],
    "jiong": ["dʑioŋ"],
    "jiu": ["dʑio"],
    "ju": ["dʑy"],
    "juan": ["dʑyɛn"],
    "jue": ["dʑye"],
    "jun": ["dʑyn"],
    "ka": ["ka"],
    "kai": ["kai"],
    "kan": ["kan"],
    "kang": ["kɑŋ"],
    "kao": ["kaʌ"],
    "ke": ["kø"],  # restored
    "kei": ["kei"],
    "ken": ["kœn"],
    "keng": ["kɵŋ"],
    "kong": ["koŋ"],
    "kou": ["kou"],
    "ku": ["ku"],
    "kua": ["kua"],
    "kuai": ["kuai"],
    "kuan": ["kuan"],
    "kuang": ["kuɑŋ"],
    "kui": ["kuei"],
    "kun": ["kun"],
    "kuo": ["kuo"],
    "la": ["la"],
    "lai": ["lai"],
    "lan": ["lan"],
    "lang": ["lɑŋ"],
    "lao": ["laʌ"],
    "le": ["lø"],  # restored
    "lei": ["lei"],
    "leng": ["lɵŋ"],
    "li": ["li"],
    "lia": ["lia"],
    "lian": ["liɛn"],
    "liang": ["liɑŋ"],
    "liao": ["liaʌ"],
    "lie": ["lie"],
    "lin": ["lin"],
    "ling": ["lɨŋ"],
    "liu": ["lio"],
    "lo": ["lo"],
    "long": ["loŋ"],
    "lou": ["lou"],
    "lu": ["lu"],
    "lv": ["ly"],
    "luan": ["luan"],
    "lve": ["lye"],
    "lue": ["lue"],
    "lun": ["lun"],
    "luo": ["luo"],
    "ma": ["ma"],
    "mai": ["mai"],
    "man": ["man"],
    "mang": ["mɑŋ"],
    "mao": ["maʌ"],
    "me": ["mø"],  # restored
    "mei": ["mei"],
    "men": ["mœn"],
    "meng": ["mɵŋ"],
    "mi": ["mi"],
    "mian": ["miɛn"],
    "miao": ["miaʌ"],
    "mie": ["mie"],
    "min": ["min"],
    "ming": ["mɨŋ"],
    "miu": ["mio"],
    "mo": ["mo"],
    "mou": ["mou"],
    "mu": ["mu"],
    "na": ["na"],
    "nai": ["nai"],
    "nan": ["nan"],
    "nang": ["nɑŋ"],
    "nao": ["naʌ"],
    "ne": ["nø"],  # restored
    "nei": ["nei"],
    "nen": ["nœn"],
    "neng": ["nɵŋ"],
    "ni": ["ni"],
    "nia": ["nia"],
    "nian": ["niɛn"],
    "niang": ["niɑŋ"],
    "niao": ["niaʌ"],
    "nie": ["nie"],
    "nin": ["nin"],
    "ning": ["nɨŋ"],
    "niu": ["nio"],
    "nong": ["noŋ"],
    "nou": ["nou"],
    "nu": ["nu"],
    "nv": ["ny"],
    "nuan": ["nuan"],
    "nve": ["nye"],
    "nue": ["nye"],
    "nuo": ["nuo"],
    "o": ["o"],
    "ou": ["ou"],
    "pa": ["pa"],
    "pai": ["pai"],
    "pan": ["pan"],
    "pang": ["pɑŋ"],
    "pao": ["paʌ"],
    "pe": ["pø"],  # restored
    "pei": ["pei"],
    "pen": ["pœn"],
    "peng": ["pɵŋ"],
    "pi": ["pi"],
    "pian": ["piɛn"],
    "piao": ["piaʌ"],
    "pie": ["pie"],
    "pin": ["pin"],
    "ping": ["pɨŋ"],
    "po": ["po"],
    "pou": ["pou"],
    "pu": ["pu"],
    "qi": ["tɕi"],
    "qia": ["tɕia"],
    "qian": ["tɕiɛn"],
    "qiang": ["tɕiɑŋ"],
    "qiao": ["tɕiaʌ"],
    "qie": ["tɕie"],
    "qin": ["tɕin"],
    "qing": ["tɕɨŋ"],
    "qiong": ["tɕioŋ"],
    "qiu": ["tɕio"],
    "qu": ["tɕy"],
    "quan": ["tɕyɛn"],
    "que": ["tɕye"],
    "qun": ["tɕyn"],
    "ran": ["ʐan"],
    "rang": ["ʐɑŋ"],
    "rao": ["ʐaʌ"],
    "re": ["ʐø"],
    "ren": ["ʐœn"],
    "reng": ["ʐɵŋ"],
    "ri": ["ʐʏ"],
    "rong": ["ʐoŋ"],
    "rou": ["ʐou"],
    "ru": ["ʐu"],
    "rua": ["ʐua"],
    "ruan": ["ʐuan"],
    "rui": ["ʐuei"],
    "run": ["ʐun"],
    "ruo": ["ʐuo"],
    "sa": ["sa"],
    "sai": ["sai"],
    "san": ["san"],
    "sang": ["sɑŋ"],
    "sao": ["saʌ"],
    "se": ["sø"],  # restored
    "sen": ["sœn"],
    "seng": ["sɵŋ"],
    "sha": ["ʂa"],
    "shai": ["ʂai"],
    "shan": ["ʂan"],
    "shang": ["ʂɑŋ"],
    "shao": ["ʂaʌ"],
    "she": ["ʂø"],
    "shei": ["ʂei"],
    "shen": ["ʂœn"],
    "sheng": ["ʂɵŋ"],
    "shi": ["ʂʏ"],
    "shou": ["ʂou"],
    "shu": ["ʂu"],
    "shua": ["ʂua"],
    "shuai": ["ʂuai"],
    "shuan": ["ʂuan"],
    "shuang": ["ʂuɑŋ"],
    "shui": ["ʂuei"],
    "shun": ["ʂun"],
    "shuo": ["ʂuo"],
    "si": ["sɪ"],
    "song": ["soŋ"],
    "sou": ["sou"],
    "su": ["su"],
    "suan": ["suan"],
    "sui": ["suei"],
    "sun": ["sun"],
    "suo": ["suo"],
    "ta": ["ta"],
    "tai": ["tai"],
    "tan": ["tan"],
    "tang": ["tɑŋ"],
    "tao": ["taʌ"],
    "te": ["tø"],  # restored
    "tei": ["tei"],
    "teng": ["tɵŋ"],
    "ti": ["ti"],
    "tian": ["tiɛn"],
    "tiao": ["tiaʌ"],
    "tie": ["tie"],
    "ting": ["tɨŋ"],
    "tong": ["toŋ"],
    "tou": ["tou"],
    "tu": ["tu"],
    "tuan": ["tuan"],
    "tui": ["tuei"],
    "tun": ["tun"],
    "tuo": ["tuo"],
    "wa": ["wa"],
    "wai": ["wai"],
    "wan": ["wan"],
    "wang": ["wɑŋ"],
    "wei": ["wei"],
    "wen": ["wœn"],
    "weng": ["wɵŋ"],
    "wo": ["wo"],
    "wu": ["wu"],
    "xi": ["ɕi"],
    "xia": ["ɕia"],
    "xian": ["ɕiɛn"],
    "xiang": ["ɕiɑŋ"],
    "xiao": ["ɕiaʌ"],
    "xie": ["ɕie"],
    "xin": ["ɕin"],
    "xing": ["ɕɨŋ"],
    "xiong": ["ɕioŋ"],
    "xiu": ["ɕio"],
    "xu": ["ɕy"],
    "xuan": ["ɕyɛn"],
    "xue": ["ɕye"],
    "xun": ["ɕyn"],
    "ya": ["ia"],
    "yan": ["iɛn"],
    "yang": ["iɑŋ"],
    "yao": ["iaʌ"],
    "ye": ["ie"],
    "yi": ["i"],
    "yin": ["in"],
    "ying": ["ɨŋ"],
    "yo": ["io"],
    "yong": ["ioŋ"],
    "you": ["io"],
    "yu": ["y"],
    "yuan": ["yɛn"],
    "yue": ["ye"],
    "yun": ["yn"],
    "za": ["dza"],
    "zai": ["dzai"],
    "zan": ["dzan"],
    "zang": ["dzɑŋ"],
    "zao": ["dzaʌ"],
    "ze": ["dzø"],
    "zei": ["dzei"],
    "zen": ["dzœn"],
    "zeng": ["dzɵŋ"],
    "zha": ["dʒa"],
    "zhai": ["dʒai"],
    "zhan": ["dʒan"],
    "zhang": ["dʒɑŋ"],  # restored onset
    "zhao": ["dʒaʌ"],
    "zhe": ["dʒø"],
    # "zhei": ["dʒei"], it doesn't exist
    "zhen": ["dʒœn"],
    "zheng": ["dʒɵŋ"],
    "zhi": ["dʒʏ"],  # restored onset
    "zhong": ["dʒoŋ"],
    "zhou": ["dʒou"],
    "zhu": ["dʒu"],
    "zhua": ["dʒua"],
    "zhuai": ["dʒuai"],
    "zhuan": ["dʒuan"],
    "zhuang": ["dʒuɑŋ"],
    "zhui": ["dʒuei"],
    "zhun": ["dʒun"],
    "zhuo": ["dʒuo"],
    "zi": ["dzɪ"],
    "zong": ["dzoŋ"],
    "zou": ["dzou"],
    "zu": ["dzu"],
    "zuan": ["dzuan"],
    "zui": ["dzuei"],
    "zun": ["dzun"],
    "zuo": ["dzuo"],
}

View File

@ -219,7 +219,7 @@ def synthesis(model,
ap (TTS.tts.utils.audio.AudioProcessor): audio processor to process
model outputs.
speaker_id (int): id of speaker
style_wav (str): Uses for style embedding of GST.
style_wav (str | Dict[str, float]): Uses for style embedding of GST.
truncated (bool): keep model states after inference. It can be used
for continuous inference at long texts.
enable_eos_bos_chars (bool): enable special chars for end of sentence and start of sentence.

View File

@ -8,6 +8,7 @@ from phonemizer.phonemize import phonemize
from TTS.tts.utils.text import cleaners
from TTS.tts.utils.text.symbols import (_bos, _eos, _punctuations,
make_symbols, phonemes, symbols)
from TTS.tts.utils.chinese_mandarin.phonemizer import chinese_text_to_phonemes
# pylint: disable=unnecessary-comprehension
@ -28,9 +29,23 @@ PHONEME_PUNCTUATION_PATTERN = r'['+_punctuations.replace(' ', '')+']+'
def text2phone(text, language):
'''Convert graphemes to phonemes. For most of the languages, it calls
the phonemizer python library that calls espeak/espeak-ng. For chinese
mandarin, it calls pypinyin + custom function for phonemizing
Parameters:
text (str): text to phonemize
language (str): language of the text
Returns:
ph (str): phonemes as a string seperated by "|"
ph = "ɪ|g|ˈ|z|æ|m|p|ə|l"
'''
Convert graphemes to phonemes.
'''
# TO REVIEW : How to have a good implementation for this?
if language == "chinese-mandarin":
ph = chinese_text_to_phonemes(text)
return ph
seperator = phonemizer.separator.Separator(' |', '', '|')
#try:
punctuations = re.findall(PHONEME_PUNCTUATION_PATTERN, text)

View File

@ -15,6 +15,8 @@ from unidecode import unidecode
from .number_norm import normalize_numbers
from .abbreviations import abbreviations_en, abbreviations_fr
from .time import expand_time_english
from TTS.tts.utils.chinese_mandarin.numbers import replace_numbers_to_characters_in_text
# Regular expression matching whitespace:
_whitespace_re = re.compile(r'\s+')
@ -122,6 +124,13 @@ def portuguese_cleaners(text):
text = collapse_whitespace(text)
return text
def chinese_mandarin_cleaners(text: str) -> str:
    '''Basic pipeline for Chinese Mandarin: spell out arabic numerals as hanzi.'''
    return replace_numbers_to_characters_in_text(text)
def phoneme_cleaners(text):
'''Pipeline for phonemes mode, including number and abbreviation expansion.'''
text = expand_numbers(text)

File diff suppressed because one or more lines are too long

View File

@ -6,6 +6,8 @@ numba==0.48
librosa==0.7.2
phonemizer>=2.2.0
unidecode==0.4.20
pypinyin
jieba
tensorboardX
matplotlib
Pillow