mirror of https://github.com/coqui-ai/TTS.git
34 lines
997 B
Python
34 lines
997 B
Python
import os
|
|
|
|
import torch
|
|
from tokenizers import Tokenizer
|
|
|
|
from TTS.tts.utils.text.cleaners import english_cleaners
|
|
|
|
|
|
class VoiceBpeTokenizer:
    """Text <-> token-id converter built on a ``tokenizers.Tokenizer``.

    Before encoding, text is passed through ``english_cleaners`` and every
    space character is rewritten as the literal marker ``[SPACE]``. Decoding
    undoes that substitution and removes the ``[STOP]`` and ``[UNK]`` markers.
    """

    def __init__(self, vocab_file=None, vocab_str=None):
        """Build the underlying tokenizer from a file and/or a string.

        Args:
            vocab_file: Passed to ``Tokenizer.from_file`` when not ``None``.
            vocab_str: Passed to ``Tokenizer.from_str`` when not ``None``.
                If both arguments are supplied, the tokenizer created from
                ``vocab_str`` replaces the one created from ``vocab_file``.

        When neither argument is supplied, ``self.tokenizer`` stays ``None``.
        """
        self.tokenizer = None

        if vocab_file is not None:
            self.tokenizer = Tokenizer.from_file(vocab_file)

        # Checked second on purpose: a string vocabulary overrides a file one.
        if vocab_str is not None:
            self.tokenizer = Tokenizer.from_str(vocab_str)

    def preprocess_text(self, txt):
        """Return ``txt`` after running it through ``english_cleaners``."""
        return english_cleaners(txt)

    def encode(self, txt):
        """Convert raw text into the tokenizer's ids.

        Args:
            txt: Text to encode.

        Returns:
            The ``ids`` attribute of the encoding produced by
            ``self.tokenizer.encode``.
        """
        cleaned = self.preprocess_text(txt)
        # Spaces are represented by an explicit marker in the encoded input.
        marked = cleaned.replace(" ", "[SPACE]")
        encoding = self.tokenizer.encode(marked)
        return encoding.ids

    def decode(self, seq):
        """Convert token ids back into text.

        Args:
            seq: Token ids. A ``torch.Tensor`` is moved to the CPU and
                converted to a NumPy array first; any other value is handed
                to the tokenizer unchanged.

        Returns:
            The decoded string with all space characters stripped, each
            ``[SPACE]`` marker turned back into a space, and the ``[STOP]``
            and ``[UNK]`` markers removed.
        """
        ids = seq.cpu().numpy() if isinstance(seq, torch.Tensor) else seq

        raw = self.tokenizer.decode(ids, skip_special_tokens=False)
        # Drop every literal space in the decoder output (presumably the
        # separators placed between tokens -- confirm against the vocab);
        # real spaces are carried by the [SPACE] marker restored below.
        text = raw.replace(" ", "")

        # Order matters only in that [SPACE] is restored after the strip above.
        for marker, substitute in (("[SPACE]", " "), ("[STOP]", ""), ("[UNK]", "")):
            text = text.replace(marker, substitute)
        return text
|