Merge pull request #1 from mozilla/dev

Sync with Mozilla TTS dev branch
This commit is contained in:
Thorsten Müller 2020-12-19 22:10:46 +01:00 committed by GitHub
commit 8c81ca7bfd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 140 additions and 28 deletions

View File

@ -13,10 +13,9 @@ from TTS.speaker_encoder.dataset import MyDataset
from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss
from TTS.speaker_encoder.model import SpeakerEncoder from TTS.speaker_encoder.model import SpeakerEncoder
from TTS.speaker_encoder.utils.generic_utils import \ from TTS.speaker_encoder.utils.generic_utils import \
check_config_speaker_encoder check_config_speaker_encoder, save_best_model
from TTS.speaker_encoder.utils.visual import plot_embeddings from TTS.speaker_encoder.utils.visual import plot_embeddings
from TTS.tts.datasets.preprocess import load_meta_data from TTS.tts.datasets.preprocess import load_meta_data
from TTS.tts.utils.io import save_best_model
from TTS.utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import (count_parameters, from TTS.utils.generic_utils import (count_parameters,
create_experiment_folder, get_git_branch, create_experiment_folder, get_git_branch,

View File

@ -57,6 +57,7 @@ def setup_loader(ap, r, is_val=False, verbose=False):
use_phonemes=c.use_phonemes, use_phonemes=c.use_phonemes,
phoneme_language=c.phoneme_language, phoneme_language=c.phoneme_language,
enable_eos_bos=c.enable_eos_bos_chars, enable_eos_bos=c.enable_eos_bos_chars,
use_noise_augment=not is_val,
verbose=verbose, verbose=verbose,
speaker_mapping=speaker_mapping if c.use_speaker_embedding and c.use_external_speaker_embedding_file else None) speaker_mapping=speaker_mapping if c.use_speaker_embedding and c.use_external_speaker_embedding_file else None)
@ -279,7 +280,12 @@ def train(data_loader, model, criterion, optimizer, scheduler,
# Diagnostic visualizations # Diagnostic visualizations
# direct pass on model for spec predictions # direct pass on model for spec predictions
target_speaker = None if speaker_c is None else speaker_c[:1] target_speaker = None if speaker_c is None else speaker_c[:1]
spec_pred, *_ = model.inference(text_input[:1], text_lengths[:1], g=target_speaker)
if hasattr(model, 'module'):
spec_pred, *_ = model.module.inference(text_input[:1], text_lengths[:1], g=target_speaker)
else:
spec_pred, *_ = model.inference(text_input[:1], text_lengths[:1], g=target_speaker)
spec_pred = spec_pred.permute(0, 2, 1) spec_pred = spec_pred.permute(0, 2, 1)
gt_spec = mel_input.permute(0, 2, 1) gt_spec = mel_input.permute(0, 2, 1)
const_spec = spec_pred[0].data.cpu().numpy() const_spec = spec_pred[0].data.cpu().numpy()

View File

@ -30,6 +30,7 @@ class MyDataset(Dataset):
phoneme_language="en-us", phoneme_language="en-us",
enable_eos_bos=False, enable_eos_bos=False,
speaker_mapping=None, speaker_mapping=None,
use_noise_augment=False,
verbose=False): verbose=False):
""" """
Args: Args:
@ -48,6 +49,7 @@ class MyDataset(Dataset):
phoneme_language (str): one the languages from phoneme_language (str): one the languages from
https://github.com/bootphon/phonemizer#languages https://github.com/bootphon/phonemizer#languages
enable_eos_bos (bool): enable end of sentence and beginning of sentences characters. enable_eos_bos (bool): enable end of sentence and beginning of sentences characters.
use_noise_augment (bool): enable adding random noise to wav for augmentation.
verbose (bool): print diagnostic information. verbose (bool): print diagnostic information.
""" """
self.batch_group_size = batch_group_size self.batch_group_size = batch_group_size
@ -66,6 +68,7 @@ class MyDataset(Dataset):
self.phoneme_language = phoneme_language self.phoneme_language = phoneme_language
self.enable_eos_bos = enable_eos_bos self.enable_eos_bos = enable_eos_bos
self.speaker_mapping = speaker_mapping self.speaker_mapping = speaker_mapping
self.use_noise_augment = use_noise_augment
self.verbose = verbose self.verbose = verbose
self.input_seq_computed = False self.input_seq_computed = False
if use_phonemes and not os.path.isdir(phoneme_cache_path): if use_phonemes and not os.path.isdir(phoneme_cache_path):
@ -134,6 +137,10 @@ class MyDataset(Dataset):
wav = np.asarray(self.load_wav(wav_file), dtype=np.float32) wav = np.asarray(self.load_wav(wav_file), dtype=np.float32)
# apply noise for augmentation
if self.use_noise_augment:
wav = wav + (1.0 / 32768.0) * np.random.rand(*wav.shape)
if not self.input_seq_computed: if not self.input_seq_computed:
if self.use_phonemes: if self.use_phonemes:
text = self._load_or_generate_phoneme_sequence(wav_file, text, self.phoneme_cache_path, self.enable_eos_bos, self.cleaners, self.phoneme_language, self.tp, self.add_blank) text = self._load_or_generate_phoneme_sequence(wav_file, text, self.phoneme_cache_path, self.enable_eos_bos, self.cleaners, self.phoneme_language, self.tp, self.add_blank)

View File

@ -62,7 +62,11 @@ def run_model_torch(model, inputs, CONFIG, truncated, speaker_id=None, style_mel
inputs, speaker_ids=speaker_id, speaker_embeddings=speaker_embeddings) inputs, speaker_ids=speaker_id, speaker_embeddings=speaker_embeddings)
elif 'glow' in CONFIG.model.lower(): elif 'glow' in CONFIG.model.lower():
inputs_lengths = torch.tensor(inputs.shape[1:2]).to(inputs.device) # pylint: disable=not-callable inputs_lengths = torch.tensor(inputs.shape[1:2]).to(inputs.device) # pylint: disable=not-callable
postnet_output, _, _, _, alignments, _, _ = model.inference(inputs, inputs_lengths, g=speaker_id if speaker_id else speaker_embeddings) if hasattr(model, 'module'):
# distributed model
postnet_output, _, _, _, alignments, _, _ = model.module.inference(inputs, inputs_lengths, g=speaker_id if speaker_id is not None else speaker_embeddings)
else:
postnet_output, _, _, _, alignments, _, _ = model.inference(inputs, inputs_lengths, g=speaker_id if speaker_id is not None else speaker_embeddings)
postnet_output = postnet_output.permute(0, 2, 1) postnet_output = postnet_output.permute(0, 2, 1)
# these only belong to tacotron models. # these only belong to tacotron models.
decoder_output = None decoder_output = None

View File

@ -14,6 +14,7 @@ import re
from unidecode import unidecode from unidecode import unidecode
from .number_norm import normalize_numbers from .number_norm import normalize_numbers
from .abbreviations import abbreviations_en, abbreviations_fr from .abbreviations import abbreviations_en, abbreviations_fr
from .time import expand_time_english
# Regular expression matching whitespace: # Regular expression matching whitespace:
_whitespace_re = re.compile(r'\s+') _whitespace_re = re.compile(r'\s+')
@ -95,6 +96,7 @@ def english_cleaners(text):
'''Pipeline for English text, including number and abbreviation expansion.''' '''Pipeline for English text, including number and abbreviation expansion.'''
text = convert_to_ascii(text) text = convert_to_ascii(text)
text = lowercase(text) text = lowercase(text)
text = expand_time_english(text)
text = expand_numbers(text) text = expand_numbers(text)
text = expand_abbreviations(text) text = expand_abbreviations(text)
text = replace_symbols(text) text = replace_symbols(text)
@ -122,8 +124,8 @@ def portuguese_cleaners(text):
def phoneme_cleaners(text): def phoneme_cleaners(text):
'''Pipeline for phonemes mode, including number and abbreviation expansion.''' '''Pipeline for phonemes mode, including number and abbreviation expansion.'''
text = convert_to_ascii(text)
text = expand_numbers(text) text = expand_numbers(text)
text = convert_to_ascii(text)
text = expand_abbreviations(text) text = expand_abbreviations(text)
text = replace_symbols(text) text = replace_symbols(text)
text = remove_aux_symbols(text) text = remove_aux_symbols(text)

View File

@ -2,14 +2,14 @@
import inflect import inflect
import re import re
from typing import Dict
_inflect = inflect.engine() _inflect = inflect.engine()
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') _currency_re = re.compile(r'(£|\$|¥)([0-9\,\.]*[0-9]+)')
_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
_number_re = re.compile(r'[0-9]+') _number_re = re.compile(r'-?[0-9]+')
def _remove_commas(m): def _remove_commas(m):
@ -20,24 +20,54 @@ def _expand_decimal_point(m):
return m.group(1).replace('.', ' point ') return m.group(1).replace('.', ' point ')
def _expand_dollars(m): def __expand_currency(value: str, inflection: Dict[float, str]) -> str:
match = m.group(1) parts = value.replace(",", "").split('.')
parts = match.split('.')
if len(parts) > 2: if len(parts) > 2:
return match + ' dollars' # Unexpected format return f"{value} {inflection[2]}" # Unexpected format
dollars = int(parts[0]) if parts[0] else 0 text = []
cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 integer = int(parts[0]) if parts[0] else 0
if dollars and cents: if integer > 0:
dollar_unit = 'dollar' if dollars == 1 else 'dollars' integer_unit = inflection.get(integer, inflection[2])
cent_unit = 'cent' if cents == 1 else 'cents' text.append(f"{integer} {integer_unit}")
return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) fraction = int(parts[1]) if len(parts) > 1 and parts[1] else 0
if dollars: if fraction > 0:
dollar_unit = 'dollar' if dollars == 1 else 'dollars' fraction_unit = inflection.get(fraction/100, inflection[0.02])
return '%s %s' % (dollars, dollar_unit) text.append(f"{fraction} {fraction_unit}")
if cents: if len(text) == 0:
cent_unit = 'cent' if cents == 1 else 'cents' return f"zero {inflection[2]}"
return '%s %s' % (cents, cent_unit) return " ".join(text)
return 'zero dollars'
def _expand_currency(m: "re.Match") -> str:
currencies = {
"$": {
0.01: "cent",
0.02: "cents",
1: "dollar",
2: "dollars",
},
"": {
0.01: "cent",
0.02: "cents",
1: "euro",
2: "euros",
},
"£": {
0.01: "penny",
0.02: "pence",
1: "pound sterling",
2: "pounds sterling",
},
"¥": {
# TODO rin
0.02: "sen",
2: "yen",
}
}
unit = m.group(1)
currency = currencies[unit]
value = m.group(2)
return __expand_currency(value, currency)
def _expand_ordinal(m): def _expand_ordinal(m):
@ -62,8 +92,7 @@ def _expand_number(m):
def normalize_numbers(text): def normalize_numbers(text):
text = re.sub(_comma_number_re, _remove_commas, text) text = re.sub(_comma_number_re, _remove_commas, text)
text = re.sub(_pounds_re, r'\1 pounds', text) text = re.sub(_currency_re, _expand_currency, text)
text = re.sub(_dollars_re, _expand_dollars, text)
text = re.sub(_decimal_number_re, _expand_decimal_point, text) text = re.sub(_decimal_number_re, _expand_decimal_point, text)
text = re.sub(_ordinal_re, _expand_ordinal, text) text = re.sub(_ordinal_re, _expand_ordinal, text)
text = re.sub(_number_re, _expand_number, text) text = re.sub(_number_re, _expand_number, text)

View File

@ -0,0 +1,44 @@
import re
import inflect
# Shared inflect engine used to spell numbers out as English words.
_inflect = inflect.engine()
_time_re = re.compile(r"""\b
((0?[0-9])|(1[0-1])|(1[2-9])|(2[0-3])) # hours
:
([0-5][0-9]) # minutes
\s*(a\\.m\\.|am|pm|p\\.m\\.|a\\.m|p\\.m)? # am/pm
\b""",
re.IGNORECASE | re.X)
def _expand_num(n: int) -> str:
    """Spell out the integer *n* as English words via the shared engine."""
    words = _inflect.number_to_words(n)
    return words
def _expand_time_english(match: "re.Match") -> str:
    """Turn a ``_time_re`` match into spoken words, e.g. "9:01" -> "nine oh one a m".

    Group 1 is the hour, group 6 the minutes and group 7 the optional
    am/pm marker (see ``_time_re``).
    """
    hour = int(match.group(1))
    past_noon = hour >= 12
    time = []
    if hour > 12:
        # 24-hour input: fold into 12-hour form.
        hour -= 12
    elif hour == 0:
        # 00:xx is midnight, i.e. twelve *a.m.* Bug fix: the original set
        # past_noon = True here, so "00:30" with no marker was read "p m".
        hour = 12
        past_noon = False
    time.append(_expand_num(hour))
    minute = int(match.group(6))
    if minute > 0:
        if minute < 10:
            # "9:05" is read "nine oh five".
            time.append("oh")
        time.append(_expand_num(minute))
    am_pm = match.group(7)
    if am_pm is None:
        # No explicit marker: infer it from the (24-hour) hour value.
        time.append("p m" if past_noon else "a m")
    else:
        # "a.m." -> "a m": drop the dots and spell the letters separately.
        time.extend(list(am_pm.replace(".", "")))
    return " ".join(time)
def expand_time_english(text: str) -> str:
    """Replace every clock time in *text* (e.g. "9:01 am") with spoken words."""
    return _time_re.sub(_expand_time_english, text)

View File

@ -0,0 +1,21 @@
#!/usr/bin/env python3
from TTS.tts.utils.text.cleaners import english_cleaners, phoneme_cleaners
def test_time() -> None:
    """Clock times must be expanded to their spoken English form."""
    expected = {
        "It's 11:00": "it's eleven a m",
        "It's 9:01": "it's nine oh one a m",
        "It's 16:00": "it's four p m",
        "It's 00:00 am": "it's twelve a m",
    }
    for raw, spoken in expected.items():
        assert english_cleaners(raw) == spoken
def test_currency() -> None:
    """Currency amounts must be expanded with the correct unit words."""
    cases = [
        ("It's $10.50", "It's ten dollars fifty cents"),
        ("£1.1", "one pound sterling one penny"),
        ("¥1", "one yen"),
    ]
    for raw, spoken in cases:
        assert phoneme_cleaners(raw) == spoken
def test_expand_numbers() -> None:
    """Bare integers, including negatives, must be spelled out."""
    assert phoneme_cleaners("-1") == "minus one"
    assert phoneme_cleaners("1") == "one"

View File

@ -171,4 +171,4 @@ def test_text2phone():
gt = "ɹ|iː|s|ə|n|t| |ɹ|ɪ|s|ɜː|tʃ| |æ|t| |h|ɑːɹ|v|ɚ|d| |h|ɐ|z| |ʃ|oʊ|n| |m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| |f|ɔː|ɹ| |æ|z| |l|ɪ|ɾ|əl| |æ|z| |eɪ|t| |w|iː|k|s| |k|æ|n| |æ|k|tʃ|uː|əl|i| |ɪ|n|k|ɹ|iː|s|,| |ð|ə| |ɡ|ɹ|eɪ| |m|æ|ɾ|ɚ|ɹ| |ɪ|n|ð|ə| |p|ɑːɹ|t|s| |ʌ|v|ð|ə| |b|ɹ|eɪ|n| |ɹ|ɪ|s|p|ɑː|n|s|ə|b|əl| |f|ɔː|ɹ| |ɪ|m|oʊ|ʃ|ə|n|əl| |ɹ|ɛ|ɡ|j|uː|l|eɪ|ʃ|ə|n| |æ|n|d| |l|ɜː|n|ɪ|ŋ|!" gt = "ɹ|iː|s|ə|n|t| |ɹ|ɪ|s|ɜː|tʃ| |æ|t| |h|ɑːɹ|v|ɚ|d| |h|ɐ|z| |ʃ|oʊ|n| |m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| |f|ɔː|ɹ| |æ|z| |l|ɪ|ɾ|əl| |æ|z| |eɪ|t| |w|iː|k|s| |k|æ|n| |æ|k|tʃ|uː|əl|i| |ɪ|n|k|ɹ|iː|s|,| |ð|ə| |ɡ|ɹ|eɪ| |m|æ|ɾ|ɚ|ɹ| |ɪ|n|ð|ə| |p|ɑːɹ|t|s| |ʌ|v|ð|ə| |b|ɹ|eɪ|n| |ɹ|ɪ|s|p|ɑː|n|s|ə|b|əl| |f|ɔː|ɹ| |ɪ|m|oʊ|ʃ|ə|n|əl| |ɹ|ɛ|ɡ|j|uː|l|eɪ|ʃ|ə|n| |æ|n|d| |l|ɜː|n|ɪ|ŋ|!"
lang = "en-us" lang = "en-us"
ph = text2phone(text, lang) ph = text2phone(text, lang)
assert gt == ph, f"\n{phonemes} \n vs \n{gt}" assert gt == ph