diff --git a/.github/workflows/pypi-release.yml b/.github/workflows/pypi-release.yml new file mode 100644 index 00000000..d31e71cf --- /dev/null +++ b/.github/workflows/pypi-release.yml @@ -0,0 +1,38 @@ +name: Publish Python 🐍 distributions 📦 to PyPI +on: + release: + types: [published] +defaults: + run: + shell: + bash +jobs: + build-package: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v2 + - name: Verify tag matches version + run: | + set -ex + version=$(cat TTS/VERSION) + tag="${GITHUB_REF/refs\/tags\/}" + if [[ "v$version" != "$tag" ]]; then + exit 1 + fi + - uses: actions/setup-python@v2 + with: + python-version: 3.8 + - run: | + python -m pip install -U pip setuptools twine toml + python -c 'import toml; c = toml.load("pyproject.toml"); print("\n".join(c["build-system"]["requires"]))' | pip install -r /dev/stdin + - run: | + python setup.py sdist + - name: Setup PyPI config + run: | + cat << EOF > ~/.pypirc + [pypi] + username=__token__ + password=${{ secrets.PYPI_TOKEN }} + EOF + - run: | + twine upload --repository pypi dist/*.tar.gz diff --git a/.pylintrc b/.pylintrc index 0bc0be4b..34c121eb 100644 --- a/.pylintrc +++ b/.pylintrc @@ -158,7 +158,8 @@ disable=missing-docstring, deprecated-sys-function, exception-escape, comprehension-escape, - duplicate-code + duplicate-code, + not-callable # Enable the message, report, category or checker with the given id(s). You can # either give multiple identifier separated by comma (,) or put this option @@ -253,7 +254,7 @@ contextmanager-decorators=contextlib.contextmanager # List of members which are set dynamically and missed by pylint inference # system, and so shouldn't trigger E1101 when accessed. Python regular # expressions are accepted. -generated-members= +generated-members=numpy.*,torch.* # Tells whether missing members accessed in mixin class should be ignored. A # mixin class is detected if its name ends with "mixin" (case insensitive). 
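Note on the release workflow above: the "Verify tag matches version" step only checks that the pushed release tag equals "v" plus the contents of the new TTS/VERSION file. A minimal Python sketch of the same check, assuming the file layout introduced in this diff (the helper name is illustrative, not part of the change):

# Hedged sketch of the tag/version consistency check performed by the workflow;
# tag_matches_version is a hypothetical helper, not code from this diff.
import os

def tag_matches_version(tag, version_file=os.path.join("TTS", "VERSION")):
    with open(version_file) as f:
        version = f.read().strip()
    return tag == "v" + version

# For this release, tag_matches_version("v0.0.15") returns True.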
diff --git a/MANIFEST.in b/MANIFEST.in index 664295c7..861cb5a7 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,6 +1,7 @@ include README.md include LICENSE.txt include requirements.*.txt +include TTS/VERSION recursive-include TTS *.json recursive-include TTS *.html recursive-include TTS *.png diff --git a/TTS/.models.json b/TTS/.models.json index b926f120..310dc5f0 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -149,6 +149,18 @@ "needs_phonemizer": true } } + }, + "ja":{ + "kokoro":{ + "tacotron2-DDC":{ + "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.15/tts_models--jp--kokoro--tacotron2-DDC.zip", + "default_vocoder": "vocoder_models/universal/libri-tts/wavegrad", + "description": "Tacotron2 with Double Decoder Consistency trained with Kokoro Speech Dataset.", + "author": "@kaiidams", + "commit": "401fbd89", + "needs_phonemizer": false + } + } } }, "vocoder_models":{ diff --git a/TTS/VERSION b/TTS/VERSION new file mode 100644 index 00000000..ceddfb28 --- /dev/null +++ b/TTS/VERSION @@ -0,0 +1 @@ +0.0.15 diff --git a/TTS/__init__.py b/TTS/__init__.py index 8dee4bf8..da35faf8 100644 --- a/TTS/__init__.py +++ b/TTS/__init__.py @@ -1 +1,7 @@ -from ._version import __version__ +import os + + +with open(os.path.join(os.path.dirname(__file__), "VERSION")) as f: + version = f.read().strip() + +__version__ = version diff --git a/TTS/_version.py b/TTS/_version.py deleted file mode 100644 index 311f216e..00000000 --- a/TTS/_version.py +++ /dev/null @@ -1 +0,0 @@ -__version__ = "0.0.14" diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py index ab5754f7..00a20bdf 100644 --- a/TTS/bin/compute_embeddings.py +++ b/TTS/bin/compute_embeddings.py @@ -6,12 +6,12 @@ import numpy as np from tqdm import tqdm from TTS.config import load_config +from TTS.config import BaseDatasetConfig, load_config from TTS.speaker_encoder.utils.generic_utils import setup_model from TTS.tts.datasets.preprocess import load_meta_data from TTS.tts.utils.speakers import SpeakerManager from TTS.utils.audio import AudioProcessor - parser = argparse.ArgumentParser( description='Compute embedding vectors for each wav file in a dataset.' ) @@ -74,6 +74,7 @@ for idx, wav_file in enumerate(tqdm(wav_files)): if speaker_mapping: # save speaker_mapping if target dataset is defined if '.json' not in args.output_path and '.npy' not in args.output_path: + mapping_file_path = os.path.join(args.output_path, "speakers.json") mapping_npy_file_path = os.path.join(args.output_path, "speakers.npy") else: diff --git a/TTS/bin/distribute.py b/TTS/bin/distribute.py index 0bd27275..ea43f88b 100644 --- a/TTS/bin/distribute.py +++ b/TTS/bin/distribute.py @@ -51,7 +51,7 @@ def main(): my_env["PYTHON_EGG_CACHE"] = "/tmp/tmp{}".format(i) command[-1] = "--rank={}".format(i) stdout = None if i == 0 else open(os.devnull, "w") - p = subprocess.Popen(["python3"] + command, stdout=stdout, env=my_env) + p = subprocess.Popen(["python3"] + command, stdout=stdout, env=my_env) # pylint: disable=consider-using-with processes.append(p) print(command) diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index ace7464a..4eb79d76 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -299,4 +299,5 @@ if __name__ == "__main__": args = parser.parse_args() c = load_config(args.config_path) + c.audio['do_trim_silence'] = False # IMPORTANT!!!!!!!!!!!!!!! 
disable to align mel main(args) diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py index c9493535..48309dc9 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -10,10 +10,8 @@ import torch from torch.utils.data import DataLoader from TTS.speaker_encoder.dataset import SpeakerEncoderDataset - from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss from TTS.speaker_encoder.utils.generic_utils import save_best_model, setup_model - from TTS.speaker_encoder.utils.visual import plot_embeddings from TTS.tts.datasets.preprocess import load_meta_data from TTS.utils.arguments import init_training @@ -45,7 +43,7 @@ def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False storage_size=c.storage["storage_size"], sample_from_storage_p=c.storage["sample_from_storage_p"], verbose=verbose, - augmentation_config=c.audio_augmentation + augmentation_config=c.audio_augmentation, ) # sampler = DistributedSampler(dataset) if num_gpus > 1 else None @@ -170,19 +168,18 @@ def main(args): # pylint: disable=redefined-outer-name else: raise Exception("The %s not is a loss supported" % c.loss) - if args.restore_path: checkpoint = torch.load(args.restore_path) try: model.load_state_dict(checkpoint["model"]) - if 'criterion' in checkpoint: + if "criterion" in checkpoint: criterion.load_state_dict(checkpoint["criterion"]) except (KeyError, RuntimeError): print(" > Partial model initialization.") model_dict = model.state_dict() - model_dict = set_init_dict(model_dict, checkpoint['model'], c) + model_dict = set_init_dict(model_dict, checkpoint["model"], c) model.load_state_dict(model_dict) del model_dict for group in optimizer.param_groups: diff --git a/TTS/server/server.py b/TTS/server/server.py index 15a6b292..dc025b32 100644 --- a/TTS/server/server.py +++ b/TTS/server/server.py @@ -99,7 +99,9 @@ if args.vocoder_path is not None: vocoder_config_path = args.vocoder_config_path # load models -synthesizer = Synthesizer(model_path, config_path, speakers_file_path, vocoder_path, vocoder_config_path, args.use_cuda) +synthesizer = Synthesizer( + model_path, config_path, speakers_file_path, vocoder_path, vocoder_config_path, use_cuda=args.use_cuda +) use_multi_speaker = synthesizer.speaker_manager is not None # TODO: set this from SpeakerManager diff --git a/TTS/speaker_encoder/dataset.py b/TTS/speaker_encoder/dataset.py index cd95a4f5..6b2b0dd4 100644 --- a/TTS/speaker_encoder/dataset.py +++ b/TTS/speaker_encoder/dataset.py @@ -1,24 +1,25 @@ - import random import numpy as np import torch from torch.utils.data import Dataset + from TTS.speaker_encoder.utils.generic_utils import AugmentWAV, Storage + class SpeakerEncoderDataset(Dataset): def __init__( - self, - ap, - meta_data, - voice_len=1.6, - num_speakers_in_batch=64, - storage_size=1, - sample_from_storage_p=0.5, - num_utter_per_speaker=10, - skip_speakers=False, - verbose=False, - augmentation_config=None + self, + ap, + meta_data, + voice_len=1.6, + num_speakers_in_batch=64, + storage_size=1, + sample_from_storage_p=0.5, + num_utter_per_speaker=10, + skip_speakers=False, + verbose=False, + augmentation_config=None, ): """ Args: @@ -38,23 +39,25 @@ class SpeakerEncoderDataset(Dataset): self.verbose = verbose self.__parse_items() storage_max_size = storage_size * num_speakers_in_batch - self.storage = Storage(maxsize=storage_max_size, storage_batchs=storage_size, num_speakers_in_batch=num_speakers_in_batch) + self.storage = Storage( + maxsize=storage_max_size, 
storage_batchs=storage_size, num_speakers_in_batch=num_speakers_in_batch + ) self.sample_from_storage_p = float(sample_from_storage_p) speakers_aux = list(self.speakers) speakers_aux.sort() - self.speakerid_to_classid = {key : i for i, key in enumerate(speakers_aux)} + self.speakerid_to_classid = {key: i for i, key in enumerate(speakers_aux)} # Augmentation self.augmentator = None self.gaussian_augmentation_config = None if augmentation_config: - self.data_augmentation_p = augmentation_config['p'] - if self.data_augmentation_p and ('additive' in augmentation_config or 'rir' in augmentation_config): + self.data_augmentation_p = augmentation_config["p"] + if self.data_augmentation_p and ("additive" in augmentation_config or "rir" in augmentation_config): self.augmentator = AugmentWAV(ap, augmentation_config) - if 'gaussian' in augmentation_config.keys(): - self.gaussian_augmentation_config = augmentation_config['gaussian'] + if "gaussian" in augmentation_config.keys(): + self.gaussian_augmentation_config = augmentation_config["gaussian"] if self.verbose: print("\n > DataLoader initialization") @@ -231,9 +234,13 @@ class SpeakerEncoderDataset(Dataset): offset = random.randint(0, wav.shape[0] - self.seq_len) wav = wav[offset : offset + self.seq_len] # add random gaussian noise - if self.gaussian_augmentation_config and self.gaussian_augmentation_config['p']: - if random.random() < self.gaussian_augmentation_config['p']: - wav += np.random.normal(self.gaussian_augmentation_config['min_amplitude'], self.gaussian_augmentation_config['max_amplitude'], size=len(wav)) + if self.gaussian_augmentation_config and self.gaussian_augmentation_config["p"]: + if random.random() < self.gaussian_augmentation_config["p"]: + wav += np.random.normal( + self.gaussian_augmentation_config["min_amplitude"], + self.gaussian_augmentation_config["max_amplitude"], + size=len(wav), + ) mel = self.ap.melspectrogram(wav) feats_.append(torch.FloatTensor(mel)) diff --git a/TTS/speaker_encoder/losses.py b/TTS/speaker_encoder/losses.py index 9b573b6d..ac7e62bf 100644 --- a/TTS/speaker_encoder/losses.py +++ b/TTS/speaker_encoder/losses.py @@ -162,6 +162,7 @@ class AngleProtoLoss(nn.Module): L = self.criterion(cos_sim_matrix, label) return L + class SoftmaxLoss(nn.Module): """ Implementation of the Softmax loss as defined in https://arxiv.org/abs/2003.11982 @@ -169,13 +170,14 @@ class SoftmaxLoss(nn.Module): - embedding_dim (float): speaker embedding dim - n_speakers (float): number of speakers """ + def __init__(self, embedding_dim, n_speakers): super().__init__() self.criterion = torch.nn.CrossEntropyLoss() self.fc = nn.Linear(embedding_dim, n_speakers) - print('Initialised Softmax Loss') + print("Initialised Softmax Loss") def forward(self, x, label=None): # reshape for compatibility @@ -187,6 +189,7 @@ class SoftmaxLoss(nn.Module): return L + class SoftmaxAngleProtoLoss(nn.Module): """ Implementation of the Softmax AnglePrototypical loss as defined in https://arxiv.org/abs/2009.14153 @@ -196,13 +199,14 @@ class SoftmaxAngleProtoLoss(nn.Module): - init_w (float): defines the initial value of w - init_b (float): definies the initial value of b """ + def __init__(self, embedding_dim, n_speakers, init_w=10.0, init_b=-5.0): super().__init__() self.softmax = SoftmaxLoss(embedding_dim, n_speakers) self.angleproto = AngleProtoLoss(init_w, init_b) - print('Initialised SoftmaxAnglePrototypical Loss') + print("Initialised SoftmaxAnglePrototypical Loss") def forward(self, x, label=None): """ @@ -213,4 +217,4 @@ class 
SoftmaxAngleProtoLoss(nn.Module): Ls = self.softmax(x, label) - return Ls+Lp + return Ls + Lp diff --git a/TTS/speaker_encoder/models/resnet.py b/TTS/speaker_encoder/models/resnet.py index aa2171ed..ce86b01f 100644 --- a/TTS/speaker_encoder/models/resnet.py +++ b/TTS/speaker_encoder/models/resnet.py @@ -1,7 +1,8 @@ -import torch import numpy as np +import torch import torch.nn as nn + class SELayer(nn.Module): def __init__(self, channel, reduction=8): super(SELayer, self).__init__() @@ -10,7 +11,7 @@ class SELayer(nn.Module): nn.Linear(channel, channel // reduction), nn.ReLU(inplace=True), nn.Linear(channel // reduction, channel), - nn.Sigmoid() + nn.Sigmoid(), ) def forward(self, x): @@ -19,6 +20,7 @@ class SELayer(nn.Module): y = self.fc(y).view(b, c, 1, 1) return x * y + class SEBasicBlock(nn.Module): expansion = 1 @@ -51,12 +53,22 @@ class SEBasicBlock(nn.Module): out = self.relu(out) return out + class ResNetSpeakerEncoder(nn.Module): """Implementation of the model H/ASP without batch normalization in speaker embedding. This model was proposed in: https://arxiv.org/abs/2009.14153 Adapted from: https://github.com/clovaai/voxceleb_trainer """ + # pylint: disable=W0102 - def __init__(self, input_dim=64, proj_dim=512, layers=[3, 4, 6, 3], num_filters=[32, 64, 128, 256], encoder_type='ASP', log_input=False): + def __init__( + self, + input_dim=64, + proj_dim=512, + layers=[3, 4, 6, 3], + num_filters=[32, 64, 128, 256], + encoder_type="ASP", + log_input=False, + ): super(ResNetSpeakerEncoder, self).__init__() self.encoder_type = encoder_type @@ -74,7 +86,7 @@ class ResNetSpeakerEncoder(nn.Module): self.instancenorm = nn.InstanceNorm1d(input_dim) - outmap_size = int(self.input_dim/8) + outmap_size = int(self.input_dim / 8) self.attention = nn.Sequential( nn.Conv1d(num_filters[3] * outmap_size, 128, kernel_size=1), @@ -82,14 +94,14 @@ class ResNetSpeakerEncoder(nn.Module): nn.BatchNorm1d(128), nn.Conv1d(128, num_filters[3] * outmap_size, kernel_size=1), nn.Softmax(dim=2), - ) + ) if self.encoder_type == "SAP": out_dim = num_filters[3] * outmap_size elif self.encoder_type == "ASP": out_dim = num_filters[3] * outmap_size * 2 else: - raise ValueError('Undefined encoder') + raise ValueError("Undefined encoder") self.fc = nn.Linear(out_dim, proj_dim) @@ -98,7 +110,7 @@ class ResNetSpeakerEncoder(nn.Module): def _init_layers(self): for m in self.modules(): if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") elif isinstance(m, nn.BatchNorm2d): nn.init.constant_(m.weight, 1) nn.init.constant_(m.bias, 0) @@ -107,8 +119,7 @@ class ResNetSpeakerEncoder(nn.Module): downsample = None if stride != 1 or self.inplanes != planes * block.expansion: downsample = nn.Sequential( - nn.Conv2d(self.inplanes, planes * block.expansion, - kernel_size=1, stride=stride, bias=False), + nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False), nn.BatchNorm2d(planes * block.expansion), ) @@ -131,7 +142,7 @@ class ResNetSpeakerEncoder(nn.Module): with torch.no_grad(): with torch.cuda.amp.autocast(enabled=False): if self.log_input: - x = (x+1e-6).log() + x = (x + 1e-6).log() x = self.instancenorm(x).unsqueeze(1) x = self.conv1(x) @@ -151,7 +162,7 @@ class ResNetSpeakerEncoder(nn.Module): x = torch.sum(x * w, dim=2) elif self.encoder_type == "ASP": mu = torch.sum(x * w, dim=2) - sg = torch.sqrt((torch.sum((x**2) * w, dim=2) - mu ** 2).clamp(min=1e-5)) + sg = 
torch.sqrt((torch.sum((x ** 2) * w, dim=2) - mu ** 2).clamp(min=1e-5)) x = torch.cat((mu, sg), 1) x = x.view(x.size()[0], -1) @@ -172,12 +183,12 @@ class ResNetSpeakerEncoder(nn.Module): if max_len < num_frames: num_frames = max_len - offsets = np.linspace(0, max_len-num_frames, num=num_eval) + offsets = np.linspace(0, max_len - num_frames, num=num_eval) frames_batch = [] for offset in offsets: offset = int(offset) - end_offset = int(offset+num_frames) + end_offset = int(offset + num_frames) frames = x[:, offset:end_offset] frames_batch.append(frames) diff --git a/TTS/speaker_encoder/speaker_encoder_config.py b/TTS/speaker_encoder/speaker_encoder_config.py index 31149822..e830a0f5 100644 --- a/TTS/speaker_encoder/speaker_encoder_config.py +++ b/TTS/speaker_encoder/speaker_encoder_config.py @@ -25,10 +25,7 @@ class SpeakerEncoderConfig(BaseTrainingConfig): } ) - audio_augmentation : dict = field( - default_factory=lambda: { - } - ) + audio_augmentation: dict = field(default_factory=lambda: {}) storage: dict = field( default_factory=lambda: { diff --git a/TTS/speaker_encoder/utils/generic_utils.py b/TTS/speaker_encoder/utils/generic_utils.py index 3299f75a..fb61e48e 100644 --- a/TTS/speaker_encoder/utils/generic_utils.py +++ b/TTS/speaker_encoder/utils/generic_utils.py @@ -1,18 +1,18 @@ -import re +import datetime +import glob import os +import random +import re +from multiprocessing import Manager import numpy as np import torch -import glob -import random -import datetime - from scipy import signal -from multiprocessing import Manager from TTS.speaker_encoder.models.lstm import LSTMSpeakerEncoder from TTS.speaker_encoder.models.resnet import ResNetSpeakerEncoder + class Storage(object): def __init__(self, maxsize, storage_batchs, num_speakers_in_batch, num_threads=8): # use multiprocessing for threading safe @@ -53,19 +53,19 @@ class Storage(object): return self.storage[random.randint(0, storage_size)] def get_random_sample_fast(self): - '''Call this method only when storage is full''' + """Call this method only when storage is full""" return self.storage[random.randint(0, self.safe_storage_size)] -class AugmentWAV(object): +class AugmentWAV(object): def __init__(self, ap, augmentation_config): self.ap = ap self.use_additive_noise = False - if 'additive' in augmentation_config.keys(): - self.additive_noise_config = augmentation_config['additive'] - additive_path = self.additive_noise_config['sounds_path'] + if "additive" in augmentation_config.keys(): + self.additive_noise_config = augmentation_config["additive"] + additive_path = self.additive_noise_config["sounds_path"] if additive_path: self.use_additive_noise = True # get noise types @@ -74,12 +74,12 @@ class AugmentWAV(object): if isinstance(self.additive_noise_config[key], dict): self.additive_noise_types.append(key) - additive_files = glob.glob(os.path.join(additive_path, '**/*.wav'), recursive=True) + additive_files = glob.glob(os.path.join(additive_path, "**/*.wav"), recursive=True) self.noise_list = {} for wav_file in additive_files: - noise_dir = wav_file.replace(additive_path, '').split(os.sep)[0] + noise_dir = wav_file.replace(additive_path, "").split(os.sep)[0] # ignore not listed directories if noise_dir not in self.additive_noise_types: continue @@ -87,14 +87,16 @@ class AugmentWAV(object): self.noise_list[noise_dir] = [] self.noise_list[noise_dir].append(wav_file) - print(f" | > Using Additive Noise Augmentation: with {len(additive_files)} audios instances from {self.additive_noise_types}") + print( + f" | > Using Additive 
Noise Augmentation: with {len(additive_files)} audios instances from {self.additive_noise_types}" + ) self.use_rir = False - if 'rir' in augmentation_config.keys(): - self.rir_config = augmentation_config['rir'] - if self.rir_config['rir_path']: - self.rir_files = glob.glob(os.path.join(self.rir_config['rir_path'], '**/*.wav'), recursive=True) + if "rir" in augmentation_config.keys(): + self.rir_config = augmentation_config["rir"] + if self.rir_config["rir_path"]: + self.rir_files = glob.glob(os.path.join(self.rir_config["rir_path"], "**/*.wav"), recursive=True) self.use_rir = True print(f" | > Using RIR Noise Augmentation: with {len(self.rir_files)} audios instances") @@ -111,9 +113,15 @@ class AugmentWAV(object): def additive_noise(self, noise_type, audio): - clean_db = 10 * np.log10(np.mean(audio**2) + 1e-4) + clean_db = 10 * np.log10(np.mean(audio ** 2) + 1e-4) - noise_list = random.sample(self.noise_list[noise_type], random.randint(self.additive_noise_config[noise_type]['min_num_noises'], self.additive_noise_config[noise_type]['max_num_noises'])) + noise_list = random.sample( + self.noise_list[noise_type], + random.randint( + self.additive_noise_config[noise_type]["min_num_noises"], + self.additive_noise_config[noise_type]["max_num_noises"], + ), + ) audio_len = audio.shape[0] noises_wav = None @@ -123,7 +131,10 @@ class AugmentWAV(object): if noiseaudio.shape[0] < audio_len: continue - noise_snr = random.uniform(self.additive_noise_config[noise_type]['min_snr_in_db'], self.additive_noise_config[noise_type]['max_num_noises']) + noise_snr = random.uniform( + self.additive_noise_config[noise_type]["min_snr_in_db"], + self.additive_noise_config[noise_type]["max_num_noises"], + ) noise_db = 10 * np.log10(np.mean(noiseaudio ** 2) + 1e-4) noise_wav = np.sqrt(10 ** ((clean_db - noise_db - noise_snr) / 10)) * noiseaudio @@ -144,7 +155,7 @@ class AugmentWAV(object): rir_file = random.choice(self.rir_files) rir = self.ap.load_wav(rir_file, sr=self.ap.sample_rate) rir = rir / np.sqrt(np.sum(rir ** 2)) - return signal.convolve(audio, rir, mode=self.rir_config['conv_mode'])[:audio_len] + return signal.convolve(audio, rir, mode=self.rir_config["conv_mode"])[:audio_len] def apply_one(self, audio): noise_type = random.choice(self.global_noise_list) @@ -153,17 +164,25 @@ class AugmentWAV(object): return self.additive_noise(noise_type, audio) + def to_camel(text): text = text.capitalize() return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text) + def setup_model(c): - if c.model_params['model_name'].lower() == 'lstm': - model = LSTMSpeakerEncoder(c.model_params["input_dim"], c.model_params["proj_dim"], c.model_params["lstm_dim"], c.model_params["num_lstm_layers"]) - elif c.model_params['model_name'].lower() == 'resnet': + if c.model_params["model_name"].lower() == "lstm": + model = LSTMSpeakerEncoder( + c.model_params["input_dim"], + c.model_params["proj_dim"], + c.model_params["lstm_dim"], + c.model_params["num_lstm_layers"], + ) + elif c.model_params["model_name"].lower() == "resnet": model = ResNetSpeakerEncoder(input_dim=c.model_params["input_dim"], proj_dim=c.model_params["proj_dim"]) return model + def save_checkpoint(model, optimizer, criterion, model_loss, out_path, current_step, epoch): checkpoint_path = "checkpoint_{}.pth.tar".format(current_step) checkpoint_path = os.path.join(out_path, checkpoint_path) diff --git a/TTS/tts/datasets/preprocess.py b/TTS/tts/datasets/preprocess.py index 23d3f3c1..cff7907e 100644 --- a/TTS/tts/datasets/preprocess.py +++ 
b/TTS/tts/datasets/preprocess.py @@ -441,3 +441,17 @@ def baker(root_path: str, meta_file: str) -> List[List[str]]: wav_path = os.path.join(root_path, "clips_22", wav_name) items.append([text, wav_path, speaker_name]) return items + + +def kokoro(root_path, meta_file): + """Japanese single-speaker dataset from https://github.com/kaiidams/Kokoro-Speech-Dataset""" + txt_file = os.path.join(root_path, meta_file) + items = [] + speaker_name = "kokoro" + with open(txt_file, "r") as ttf: + for line in ttf: + cols = line.split("|") + wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav") + text = cols[2].replace(" ", "") + items.append([text, wav_file, speaker_name]) + return items diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index fded8f87..525eb8b3 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -255,6 +255,7 @@ class Tacotron2(TacotronAbstract): if self.num_speakers > 1: if not self.embeddings_per_sample: speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None] + speaker_embeddings = torch.unsqueeze(speaker_embeddings, 0).transpose(1, 2) encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) decoder_outputs, alignments, stop_tokens = self.decoder.inference(encoder_outputs) @@ -277,6 +278,7 @@ class Tacotron2(TacotronAbstract): if self.num_speakers > 1: if not self.embeddings_per_sample: speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None] + speaker_embeddings = torch.unsqueeze(speaker_embeddings, 0).transpose(1, 2) encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) mel_outputs, alignments, stop_tokens = self.decoder.inference_truncated(encoder_outputs) diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py index 9367e6e2..f9f44167 100644 --- a/TTS/tts/utils/text/__init__.py +++ b/TTS/tts/utils/text/__init__.py @@ -6,6 +6,7 @@ from packaging import version from TTS.tts.utils.text import cleaners from TTS.tts.utils.text.chinese_mandarin.phonemizer import chinese_text_to_phonemes +from TTS.tts.utils.text.japanese.phonemizer import japanese_text_to_phonemes from TTS.tts.utils.text.symbols import _bos, _eos, _punctuations, make_symbols, phonemes, symbols # pylint: disable=unnecessary-comprehension @@ -39,6 +40,11 @@ def text2phone(text, language): if language == "zh-CN": ph = chinese_text_to_phonemes(text) return ph + + if language == "ja-jp": + ph = japanese_text_to_phonemes(text) + return ph + raise ValueError(f" [!] Language {language} is not supported for phonemization.") diff --git a/TTS/tts/utils/text/cleaners.py b/TTS/tts/utils/text/cleaners.py index 2eddcdb8..3d2caa97 100644 --- a/TTS/tts/utils/text/cleaners.py +++ b/TTS/tts/utils/text/cleaners.py @@ -1,18 +1,6 @@ -""" -Cleaners are transformations that run over the input text at both training and eval time. - -Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" -hyperparameter. Some cleaners are English-specific. You'll typically want to use: - 1. "english_cleaners" for English text - 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using - the Unidecode library (https://pypi.python.org/pypi/Unidecode) - 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update - the symbols in symbols.py to match your data). 
-""" - import re -from unidecode import unidecode +from anyascii import anyascii from TTS.tts.utils.text.chinese_mandarin.numbers import replace_numbers_to_characters_in_text @@ -47,7 +35,7 @@ def collapse_whitespace(text): def convert_to_ascii(text): - return unidecode(text) + return anyascii(text) def remove_aux_symbols(text): diff --git a/TTS/tts/utils/text/japanese/__init__.py b/TTS/tts/utils/text/japanese/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/TTS/tts/utils/text/japanese/phonemizer.py b/TTS/tts/utils/text/japanese/phonemizer.py new file mode 100644 index 00000000..a4629a30 --- /dev/null +++ b/TTS/tts/utils/text/japanese/phonemizer.py @@ -0,0 +1,380 @@ +# Convert Japanese text to phonemes which is +# compatible with Julius https://github.com/julius-speech/segmentation-kit + +import re + +import MeCab + +_CONVRULES = [ + # Conversion of 2 letters + "アァ/ a a", + "イィ/ i i", + "イェ/ i e", + "イャ/ y a", + "ウゥ/ u:", + "エェ/ e e", + "オォ/ o:", + "カァ/ k a:", + "キィ/ k i:", + "クゥ/ k u:", + "クャ/ ky a", + "クュ/ ky u", + "クョ/ ky o", + "ケェ/ k e:", + "コォ/ k o:", + "ガァ/ g a:", + "ギィ/ g i:", + "グゥ/ g u:", + "グャ/ gy a", + "グュ/ gy u", + "グョ/ gy o", + "ゲェ/ g e:", + "ゴォ/ g o:", + "サァ/ s a:", + "シィ/ sh i:", + "スゥ/ s u:", + "スャ/ sh a", + "スュ/ sh u", + "スョ/ sh o", + "セェ/ s e:", + "ソォ/ s o:", + "ザァ/ z a:", + "ジィ/ j i:", + "ズゥ/ z u:", + "ズャ/ zy a", + "ズュ/ zy u", + "ズョ/ zy o", + "ゼェ/ z e:", + "ゾォ/ z o:", + "タァ/ t a:", + "チィ/ ch i:", + "ツァ/ ts a", + "ツィ/ ts i", + "ツゥ/ ts u:", + "ツャ/ ch a", + "ツュ/ ch u", + "ツョ/ ch o", + "ツェ/ ts e", + "ツォ/ ts o", + "テェ/ t e:", + "トォ/ t o:", + "ダァ/ d a:", + "ヂィ/ j i:", + "ヅゥ/ d u:", + "ヅャ/ zy a", + "ヅュ/ zy u", + "ヅョ/ zy o", + "デェ/ d e:", + "ドォ/ d o:", + "ナァ/ n a:", + "ニィ/ n i:", + "ヌゥ/ n u:", + "ヌャ/ ny a", + "ヌュ/ ny u", + "ヌョ/ ny o", + "ネェ/ n e:", + "ノォ/ n o:", + "ハァ/ h a:", + "ヒィ/ h i:", + "フゥ/ f u:", + "フャ/ hy a", + "フュ/ hy u", + "フョ/ hy o", + "ヘェ/ h e:", + "ホォ/ h o:", + "バァ/ b a:", + "ビィ/ b i:", + "ブゥ/ b u:", + "フャ/ hy a", + "ブュ/ by u", + "フョ/ hy o", + "ベェ/ b e:", + "ボォ/ b o:", + "パァ/ p a:", + "ピィ/ p i:", + "プゥ/ p u:", + "プャ/ py a", + "プュ/ py u", + "プョ/ py o", + "ペェ/ p e:", + "ポォ/ p o:", + "マァ/ m a:", + "ミィ/ m i:", + "ムゥ/ m u:", + "ムャ/ my a", + "ムュ/ my u", + "ムョ/ my o", + "メェ/ m e:", + "モォ/ m o:", + "ヤァ/ y a:", + "ユゥ/ y u:", + "ユャ/ y a:", + "ユュ/ y u:", + "ユョ/ y o:", + "ヨォ/ y o:", + "ラァ/ r a:", + "リィ/ r i:", + "ルゥ/ r u:", + "ルャ/ ry a", + "ルュ/ ry u", + "ルョ/ ry o", + "レェ/ r e:", + "ロォ/ r o:", + "ワァ/ w a:", + "ヲォ/ o:", + "ディ/ d i", + "デェ/ d e:", + "デャ/ dy a", + "デュ/ dy u", + "デョ/ dy o", + "ティ/ t i", + "テェ/ t e:", + "テャ/ ty a", + "テュ/ ty u", + "テョ/ ty o", + "スィ/ s i", + "ズァ/ z u a", + "ズィ/ z i", + "ズゥ/ z u", + "ズャ/ zy a", + "ズュ/ zy u", + "ズョ/ zy o", + "ズェ/ z e", + "ズォ/ z o", + "キャ/ ky a", + "キュ/ ky u", + "キョ/ ky o", + "シャ/ sh a", + "シュ/ sh u", + "シェ/ sh e", + "ショ/ sh o", + "チャ/ ch a", + "チュ/ ch u", + "チェ/ ch e", + "チョ/ ch o", + "トゥ/ t u", + "トャ/ ty a", + "トュ/ ty u", + "トョ/ ty o", + "ドァ/ d o a", + "ドゥ/ d u", + "ドャ/ dy a", + "ドュ/ dy u", + "ドョ/ dy o", + "ドォ/ d o:", + "ニャ/ ny a", + "ニュ/ ny u", + "ニョ/ ny o", + "ヒャ/ hy a", + "ヒュ/ hy u", + "ヒョ/ hy o", + "ミャ/ my a", + "ミュ/ my u", + "ミョ/ my o", + "リャ/ ry a", + "リュ/ ry u", + "リョ/ ry o", + "ギャ/ gy a", + "ギュ/ gy u", + "ギョ/ gy o", + "ヂェ/ j e", + "ヂャ/ j a", + "ヂュ/ j u", + "ヂョ/ j o", + "ジェ/ j e", + "ジャ/ j a", + "ジュ/ j u", + "ジョ/ j o", + "ビャ/ by a", + "ビュ/ by u", + "ビョ/ by o", + "ピャ/ py a", + "ピュ/ py u", + "ピョ/ py o", + "ウァ/ u a", + "ウィ/ w i", + "ウェ/ w e", + "ウォ/ w o", + "ファ/ f a", + "フィ/ f i", + "フゥ/ f u", + "フャ/ hy a", + 
"フュ/ hy u", + "フョ/ hy o", + "フェ/ f e", + "フォ/ f o", + "ヴァ/ b a", + "ヴィ/ b i", + "ヴェ/ b e", + "ヴォ/ b o", + "ヴュ/ by u", + # Conversion of 1 letter + "ア/ a", + "イ/ i", + "ウ/ u", + "エ/ e", + "オ/ o", + "カ/ k a", + "キ/ k i", + "ク/ k u", + "ケ/ k e", + "コ/ k o", + "サ/ s a", + "シ/ sh i", + "ス/ s u", + "セ/ s e", + "ソ/ s o", + "タ/ t a", + "チ/ ch i", + "ツ/ ts u", + "テ/ t e", + "ト/ t o", + "ナ/ n a", + "ニ/ n i", + "ヌ/ n u", + "ネ/ n e", + "ノ/ n o", + "ハ/ h a", + "ヒ/ h i", + "フ/ f u", + "ヘ/ h e", + "ホ/ h o", + "マ/ m a", + "ミ/ m i", + "ム/ m u", + "メ/ m e", + "モ/ m o", + "ラ/ r a", + "リ/ r i", + "ル/ r u", + "レ/ r e", + "ロ/ r o", + "ガ/ g a", + "ギ/ g i", + "グ/ g u", + "ゲ/ g e", + "ゴ/ g o", + "ザ/ z a", + "ジ/ j i", + "ズ/ z u", + "ゼ/ z e", + "ゾ/ z o", + "ダ/ d a", + "ヂ/ j i", + "ヅ/ z u", + "デ/ d e", + "ド/ d o", + "バ/ b a", + "ビ/ b i", + "ブ/ b u", + "ベ/ b e", + "ボ/ b o", + "パ/ p a", + "ピ/ p i", + "プ/ p u", + "ペ/ p e", + "ポ/ p o", + "ヤ/ y a", + "ユ/ y u", + "ヨ/ y o", + "ワ/ w a", + "ヰ/ i", + "ヱ/ e", + "ヲ/ o", + "ン/ N", + "ッ/ q", + "ヴ/ b u", + "ー/:", + # Try converting broken text + "ァ/ a", + "ィ/ i", + "ゥ/ u", + "ェ/ e", + "ォ/ o", + "ヮ/ w a", + "ォ/ o", + # Symbols + "、/ ,", + "。/ .", + "!/ !", + "?/ ?", + "・/ ,", +] + +_COLON_RX = re.compile(":+") +_REJECT_RX = re.compile("[^ a-zA-Z:,.?]") + + +def _makerulemap(): + l = [tuple(x.split("/")) for x in _CONVRULES] + return tuple({k: v for k, v in l if len(k) == i} for i in (1, 2)) + + +_RULEMAP1, _RULEMAP2 = _makerulemap() + + +def kata2phoneme(text: str) -> str: + """Convert katakana text to phonemes.""" + text = text.strip() + res = "" + while text: + if len(text) >= 2: + x = _RULEMAP2.get(text[:2]) + if x is not None: + text = text[2:] + res += x + continue + x = _RULEMAP1.get(text[0]) + if x is not None: + text = text[1:] + res += x + continue + res += " " + text[0] + text = text[1:] + res = _COLON_RX.sub(":", res) + return res[1:] + + +_KATAKANA = "".join(chr(ch) for ch in range(ord("ァ"), ord("ン") + 1)) +_HIRAGANA = "".join(chr(ch) for ch in range(ord("ぁ"), ord("ん") + 1)) +_HIRA2KATATRANS = str.maketrans(_HIRAGANA, _KATAKANA) + + +def hira2kata(text: str) -> str: + text = text.translate(_HIRA2KATATRANS) + return text.replace("う゛", "ヴ") + + +_SYMBOL_TOKENS = set(list("・、。?!")) +_NO_YOMI_TOKENS = set(list("「」『』―()[][] …")) +_TAGGER = MeCab.Tagger() + + +def text2kata(text: str) -> str: + parsed = _TAGGER.parse(text) + res = [] + for line in parsed.split("\n"): + if line == "EOS": + break + parts = line.split("\t") + + word, yomi = parts[0], parts[1] + if yomi: + res.append(yomi) + else: + if word in _SYMBOL_TOKENS: + res.append(word) + elif word in ("っ", "ッ"): + res.append("ッ") + elif word in _NO_YOMI_TOKENS: + pass + else: + res.append(word) + return hira2kata("".join(res)) + + +def japanese_text_to_phonemes(text: str) -> str: + """Convert Japanese text to phonemes.""" + res = text2kata(text) + res = kata2phoneme(res) + return res.replace(" ", "") diff --git a/TTS/utils/arguments.py b/TTS/utils/arguments.py index 1b5a424b..5e6acd1d 100644 --- a/TTS/utils/arguments.py +++ b/TTS/utils/arguments.py @@ -152,6 +152,7 @@ def process_args(args): experiment_path = create_experiment_folder(config.output_path, config.run_name, args.debug) audio_path = os.path.join(experiment_path, "test_audios") # setup rank 0 process in distributed training + tb_logger = None if args.rank == 0: os.makedirs(audio_path, exist_ok=True) new_fields = {} diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index 2e3caa81..cf7df7de 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -149,8 
+149,8 @@ class ModelManager(object): def _download_zip_file(file_url, output): """Download the github releases""" r = requests.get(file_url) - z = zipfile.ZipFile(io.BytesIO(r.content)) - z.extractall(output) + with zipfile.ZipFile(io.BytesIO(r.content)) as z: + z.extractall(output) for file_path in z.namelist()[1:]: src_path = os.path.join(output, file_path) dst_path = os.path.join(output, os.path.basename(file_path)) diff --git a/recipes/kokoro/tacotron2-DDC/run.sh b/recipes/kokoro/tacotron2-DDC/run.sh new file mode 100644 index 00000000..86fda642 --- /dev/null +++ b/recipes/kokoro/tacotron2-DDC/run.sh @@ -0,0 +1,23 @@ +#!/bin/bash +# take the scripts's parent's directory to prefix all the output paths. +RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +CORPUS=kokoro-speech-v1_1-small +echo $RUN_DIR +if [ \! -d $RUN_DIR/$CORPUS ] ; then + echo "$RUN_DIR/$CORPUS doesn't exist." + echo "Follow the instruction of https://github.com/kaiidams/Kokoro-Speech-Dataset to make the corpus." + exit 1 +fi +# create train-val splits +shuf $RUN_DIR/$CORPUS/metadata.csv > $RUN_DIR/$CORPUS/metadata_shuf.csv +head -n 8000 $RUN_DIR/$CORPUS/metadata_shuf.csv > $RUN_DIR/$CORPUS/metadata_train.csv +tail -n 812 $RUN_DIR/$CORPUS/metadata_shuf.csv > $RUN_DIR/$CORPUS/metadata_val.csv +# compute dataset mean and variance for normalization +python TTS/bin/compute_statistics.py $RUN_DIR/tacotron2-DDC.json $RUN_DIR/scale_stats.npy --data_path $RUN_DIR/$CORPUS/wavs/ +# training .... +# change the GPU id if needed +CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tacotron.py --config_path $RUN_DIR/tacotron2-DDC.json \ + --coqpit.output_path $RUN_DIR \ + --coqpit.datasets.0.path $RUN_DIR/$CORPUS \ + --coqpit.audio.stats_path $RUN_DIR/scale_stats.npy \ + --coqpit.phoneme_cache_path $RUN_DIR/phoneme_cache \ \ No newline at end of file diff --git a/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json b/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json new file mode 100644 index 00000000..b3630055 --- /dev/null +++ b/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json @@ -0,0 +1,125 @@ +{ + "datasets": [ + { + "name": "kokoro", + "path": "DEFINE THIS", + "meta_file_train": "metadata.csv", + "meta_file_val": null + } + ], + "audio": { + "fft_size": 1024, + "win_length": 1024, + "hop_length": 256, + "frame_length_ms": null, + "frame_shift_ms": null, + "sample_rate": 22050, + "preemphasis": 0.0, + "ref_level_db": 20, + "do_trim_silence": true, + "trim_db": 60, + "power": 1.5, + "griffin_lim_iters": 60, + "num_mels": 80, + "mel_fmin": 50.0, + "mel_fmax": 7600.0, + "spec_gain": 1, + "signal_norm": true, + "min_level_db": -100, + "symmetric_norm": true, + "max_norm": 4.0, + "clip_norm": true, + "stats_path": "scale_stats.npy" + }, + "gst":{ + "gst_style_input": null, + + + + "gst_embedding_dim": 512, + "gst_num_heads": 4, + "gst_style_tokens": 10, + "gst_use_speaker_embedding": false + }, + "model": "Tacotron2", + "run_name": "kokoro-ddc", + "run_description": "tacotron2 with DDC and differential spectral loss.", + "batch_size": 32, + "eval_batch_size": 16, + "mixed_precision": true, + "distributed": { + "backend": "nccl", + "url": "tcp:\/\/localhost:54321" + }, + "reinit_layers": [], + "loss_masking": true, + "decoder_loss_alpha": 0.5, + "postnet_loss_alpha": 0.25, + "postnet_diff_spec_alpha": 0.25, + "decoder_diff_spec_alpha": 0.25, + "decoder_ssim_alpha": 0.5, + "postnet_ssim_alpha": 0.25, + "ga_alpha": 5.0, + "stopnet_pos_weight": 15.0, + "run_eval": true, + "test_delay_epochs": 10, + "test_sentences_file": null, + 
"noam_schedule": false, + "grad_clip": 1.0, + "epochs": 1000, + "lr": 0.0001, + "wd": 0.000001, + "warmup_steps": 4000, + "seq_len_norm": false, + "memory_size": -1, + "prenet_type": "original", + "prenet_dropout": true, + "attention_type": "original", + "windowing": false, + "use_forward_attn": false, + "forward_attn_mask": false, + "transition_agent": false, + "location_attn": true, + "bidirectional_decoder": false, + "double_decoder_consistency": true, + "ddc_r": 7, + "attention_heads": 4, + "attention_norm": "sigmoid", + "r": 7, + "gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], + "stopnet": true, + "separate_stopnet": true, + "print_step": 25, + "tb_plot_step": 100, + "print_eval": false, + "save_step": 10000, + "checkpoint": true, + "keep_all_best": false, + "keep_after": 10000, + "tb_model_param_stats": false, + "text_cleaner": "basic_cleaners", + "enable_eos_bos_chars": false, + "num_loader_workers": 4, + "num_val_loader_workers": 4, + "batch_group_size": 4, + "min_seq_len": 6, + "max_seq_len": 153, + "compute_input_seq_cache": false, + "use_noise_augment": true, + "output_path": "DEFINE THIS", + "phoneme_cache_path": "DEFINE THIS", + "use_phonemes": true, + "phoneme_language": "ja-jp", + "characters": { + "pad": "_", + "eos": "~", + "bos": "^", + "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ", + "punctuations": "!'(),-.:;? ", + "phonemes": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" + }, + "use_speaker_embedding": false, + "use_gst": false, + "use_external_speaker_embedding_file": false, + "external_speaker_embedding_file": "../../speakers-vctk-en.json" +} \ No newline at end of file diff --git a/requirements.dev.txt b/requirements.dev.txt index 144a0ed6..afb5ebe6 100644 --- a/requirements.dev.txt +++ b/requirements.dev.txt @@ -2,4 +2,4 @@ black coverage isort nose -pylint==2.7.4 +pylint==2.8.3 diff --git a/requirements.txt b/requirements.txt index c6ce7672..fde48978 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,5 +17,8 @@ torch>=1.7 tqdm numba==0.52 umap-learn==0.4.6 -unidecode==0.4.20 +anyascii coqpit +# japanese g2p deps +mecab-python3==1.0.3 +unidic-lite==1.0.8 diff --git a/setup.py b/setup.py index a68b09e0..7cfb6519 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,6 @@ import os import subprocess import sys from distutils.version import LooseVersion -from TTS._version import __version__ import numpy import setuptools.command.build_py @@ -12,82 +11,85 @@ import setuptools.command.develop from Cython.Build import cythonize from setuptools import Extension, find_packages, setup + if LooseVersion(sys.version) < LooseVersion("3.6") or LooseVersion(sys.version) > LooseVersion("3.9"): - raise RuntimeError( - "TTS requires python >= 3.6 and <3.9 " - "but your Python version is {}".format(sys.version) - ) + raise RuntimeError("TTS requires python >= 3.6 and <3.9 " "but your Python version is {}".format(sys.version)) -version = __version__ cwd = os.path.dirname(os.path.abspath(__file__)) +cwd = os.path.dirname(os.path.abspath(__file__)) +with open(os.path.join(cwd, "TTS", "VERSION")) as fin: + version = fin.read().strip() + + class build_py(setuptools.command.build_py.build_py): # pylint: disable=too-many-ancestors def run(self): - self.create_version_file() setuptools.command.build_py.build_py.run(self) - @staticmethod - def create_version_file(): - print('-- Building version ' + version) - version_path = os.path.join(cwd, 'version.py') - with open(version_path, 'w') as f: - 
f.write("__version__ = '{}'\n".format(version)) class develop(setuptools.command.develop.develop): def run(self): - build_py.create_version_file() setuptools.command.develop.develop.run(self) # The documentation for this feature is in server/README.md -package_data = ['TTS/server/templates/*'] +package_data = ["TTS/server/templates/*"] def pip_install(package_name): - subprocess.call([sys.executable, '-m', 'pip', 'install', package_name]) + subprocess.call([sys.executable, "-m", "pip", "install", package_name]) -requirements = open(os.path.join(cwd, 'requirements.txt'), 'r').readlines() -with open(os.path.join(cwd, 'requirements.notebooks.txt'), 'r') as f: +requirements = open(os.path.join(cwd, "requirements.txt"), "r").readlines() +with open(os.path.join(cwd, "requirements.notebooks.txt"), "r") as f: requirements_notebooks = f.readlines() -with open(os.path.join(cwd, 'requirements.dev.txt'), 'r') as f: +with open(os.path.join(cwd, "requirements.dev.txt"), "r") as f: requirements_dev = f.readlines() -with open(os.path.join(cwd, 'requirements.tf.txt'), 'r') as f: +with open(os.path.join(cwd, "requirements.tf.txt"), "r") as f: requirements_tf = f.readlines() requirements_all = requirements_dev + requirements_notebooks + requirements_tf -with open('README.md', "r", encoding="utf-8") as readme_file: +with open("README.md", "r", encoding="utf-8") as readme_file: README = readme_file.read() -exts = [Extension(name='TTS.tts.layers.glow_tts.monotonic_align.core', - sources=["TTS/tts/layers/glow_tts/monotonic_align/core.pyx"])] +exts = [ + Extension( + name="TTS.tts.layers.glow_tts.monotonic_align.core", + sources=["TTS/tts/layers/glow_tts/monotonic_align/core.pyx"], + ) +] setup( - name='TTS', + name="TTS", version=version, - url='https://github.com/coqui-ai/TTS', - author='Eren Gölge', - author_email='egolge@coqui.ai', - description='Deep learning for Text to Speech by Coqui.', + url="https://github.com/coqui-ai/TTS", + author="Eren Gölge", + author_email="egolge@coqui.ai", + description="Deep learning for Text to Speech by Coqui.", long_description=README, long_description_content_type="text/markdown", - license='MPL-2.0', + license="MPL-2.0", # cython include_dirs=numpy.get_include(), ext_modules=cythonize(exts, language_level=3), # ext_modules=find_cython_extensions(), # package include_package_data=True, - packages=find_packages(include=['TTS*']), + packages=find_packages(include=["TTS*"]), + package_data={ + "TTS": [ + "VERSION", + ] + }, project_urls={ - 'Documentation': 'https://github.com/coqui-ai/TTS/wiki', - 'Tracker': 'https://github.com/coqui-ai/TTS/issues', - 'Repository': 'https://github.com/coqui-ai/TTS', - 'Discussions': 'https://github.com/coqui-ai/TTS/discussions', + "Documentation": "https://github.com/coqui-ai/TTS/wiki", + "Tracker": "https://github.com/coqui-ai/TTS/issues", + "Repository": "https://github.com/coqui-ai/TTS", + "Discussions": "https://github.com/coqui-ai/TTS/discussions", }, cmdclass={ - 'build_py': build_py, - 'develop': develop, + "build_py": build_py, + "develop": develop, # 'build_ext': build_ext }, install_requires=requirements, @@ -97,30 +99,25 @@ setup( "notebooks": requirements_notebooks, "tf": requirements_tf, }, - python_requires='>=3.6.0, <3.9', - entry_points={ - 'console_scripts': [ - 'tts=TTS.bin.synthesize:main', - 'tts-server = TTS.server.server:main' - ] - }, + python_requires=">=3.6.0, <3.9", + entry_points={"console_scripts": ["tts=TTS.bin.synthesize:main", "tts-server = TTS.server.server:main"]}, classifiers=[ "Programming Language :: 
Python", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", - 'Development Status :: 3 - Alpha', + "Development Status :: 3 - Alpha", "Intended Audience :: Science/Research", "Intended Audience :: Developers", "Operating System :: POSIX :: Linux", - 'License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)', + "License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)", "Topic :: Software Development", "Topic :: Software Development :: Libraries :: Python Modules", "Topic :: Multimedia :: Sound/Audio :: Speech", "Topic :: Multimedia :: Sound/Audio", "Topic :: Multimedia", - "Topic :: Scientific/Engineering :: Artificial Intelligence" + "Topic :: Scientific/Engineering :: Artificial Intelligence", ], - zip_safe=False + zip_safe=False, ) diff --git a/tests/test_preprocessors.py b/tests/data_tests/test_dataset_formatters.py similarity index 100% rename from tests/test_preprocessors.py rename to tests/data_tests/test_dataset_formatters.py diff --git a/tests/test_loader.py b/tests/data_tests/test_loader.py similarity index 100% rename from tests/test_loader.py rename to tests/data_tests/test_loader.py diff --git a/tests/test_synthesize.py b/tests/inference_tests/test_synthesize.py similarity index 100% rename from tests/test_synthesize.py rename to tests/inference_tests/test_synthesize.py diff --git a/tests/test_synthesizer.py b/tests/inference_tests/test_synthesizer.py similarity index 100% rename from tests/test_synthesizer.py rename to tests/inference_tests/test_synthesizer.py diff --git a/tests/test_audio.py b/tests/test_audio_processor.py similarity index 100% rename from tests/test_audio.py rename to tests/test_audio_processor.py diff --git a/tests/test_speaker_encoder.py b/tests/test_speaker_encoder.py index cecbd493..3c897aa9 100644 --- a/tests/test_speaker_encoder.py +++ b/tests/test_speaker_encoder.py @@ -6,6 +6,7 @@ from tests import get_tests_input_path from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss from TTS.speaker_encoder.models.lstm import LSTMSpeakerEncoder from TTS.speaker_encoder.models.resnet import ResNetSpeakerEncoder + file_path = get_tests_input_path() @@ -39,6 +40,7 @@ class LSTMSpeakerEncoderTests(unittest.TestCase): assert output.shape[1] == 256 assert len(output.shape) == 2 + class ResNetSpeakerEncoderTests(unittest.TestCase): # pylint: disable=R0201 def test_in_out(self): @@ -65,6 +67,7 @@ class ResNetSpeakerEncoderTests(unittest.TestCase): assert output.shape[1] == 256 assert len(output.shape) == 2 + class GE2ELossTests(unittest.TestCase): # pylint: disable=R0201 def test_in_out(self): @@ -92,6 +95,7 @@ class GE2ELossTests(unittest.TestCase): output = loss.forward(dummy_input) assert output.item() < 0.005 + class AngleProtoLossTests(unittest.TestCase): # pylint: disable=R0201 def test_in_out(self): @@ -121,6 +125,7 @@ class AngleProtoLossTests(unittest.TestCase): output = loss.forward(dummy_input) assert output.item() < 0.005 + class SoftmaxAngleProtoLossTests(unittest.TestCase): # pylint: disable=R0201 def test_in_out(self): diff --git a/tests/test_speaker_encoder_train.py b/tests/test_speaker_encoder_train.py index e168a785..21b12074 100644 --- a/tests/test_speaker_encoder_train.py +++ b/tests/test_speaker_encoder_train.py @@ -46,7 +46,7 @@ run_cli(command_train) shutil.rmtree(continue_path) # test resnet speaker encoder -config.model_params['model_name'] = "resnet" +config.model_params["model_name"] = 
"resnet" config.save_json(config_path) # train the model for one epoch diff --git a/tests/text_tests/test_japanese_phonemizer.py b/tests/text_tests/test_japanese_phonemizer.py new file mode 100644 index 00000000..b3b1ece3 --- /dev/null +++ b/tests/text_tests/test_japanese_phonemizer.py @@ -0,0 +1,24 @@ +import unittest + +from TTS.tts.utils.text.japanese.phonemizer import japanese_text_to_phonemes + +_TEST_CASES = """ +どちらに行きますか?/dochiraniikimasuka? +今日は温泉に、行きます。/kyo:waoNseNni,ikimasu. +「A」から「Z」までです。/AkaraZmadedesu. +そうですね!/so:desune! +クジラは哺乳類です。/kujirawahonyu:ruidesu. +ヴィディオを見ます。/bidioomimasu. +ky o: w a o N s e N n i , i k i m a s u ./kyo:waoNseNni,ikimasu. +""" + + +class TestText(unittest.TestCase): + def test_japanese_text_to_phonemes(self): + for line in _TEST_CASES.strip().split("\n"): + text, phone = line.split("/") + self.assertEqual(japanese_text_to_phonemes(text), phone) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_symbols.py b/tests/text_tests/test_symbols.py similarity index 100% rename from tests/test_symbols.py rename to tests/text_tests/test_symbols.py diff --git a/tests/test_text_cleaners.py b/tests/text_tests/test_text_cleaners.py similarity index 100% rename from tests/test_text_cleaners.py rename to tests/text_tests/test_text_cleaners.py diff --git a/tests/test_feed_forward_layers.py b/tests/tts_tests/test_feed_forward_layers.py similarity index 100% rename from tests/test_feed_forward_layers.py rename to tests/tts_tests/test_feed_forward_layers.py diff --git a/tests/test_glow_tts.py b/tests/tts_tests/test_glow_tts.py similarity index 100% rename from tests/test_glow_tts.py rename to tests/tts_tests/test_glow_tts.py diff --git a/tests/tts_tests/test_glow_tts_train.py b/tests/tts_tests/test_glow_tts_train.py index 00c7e852..2e675d13 100644 --- a/tests/tts_tests/test_glow_tts_train.py +++ b/tests/tts_tests/test_glow_tts_train.py @@ -17,7 +17,7 @@ config = GlowTTSConfig( text_cleaner="english_cleaners", use_phonemes=True, phoneme_language="zh-CN", - phoneme_cache_path='tests/data/ljspeech/phoneme_cache/', + phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", run_eval=True, test_delay_epochs=-1, epochs=1, diff --git a/tests/test_speedy_speech_layers.py b/tests/tts_tests/test_speedy_speech_layers.py similarity index 100% rename from tests/test_speedy_speech_layers.py rename to tests/tts_tests/test_speedy_speech_layers.py diff --git a/tests/tts_tests/test_speedy_speech_train.py b/tests/tts_tests/test_speedy_speech_train.py index cc2845c2..3f508117 100644 --- a/tests/tts_tests/test_speedy_speech_train.py +++ b/tests/tts_tests/test_speedy_speech_train.py @@ -17,7 +17,7 @@ config = SpeedySpeechConfig( text_cleaner="english_cleaners", use_phonemes=True, phoneme_language="zh-CN", - phoneme_cache_path='tests/data/ljspeech/phoneme_cache/', + phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", run_eval=True, test_delay_epochs=-1, epochs=1, diff --git a/tests/test_tacotron2_model.py b/tests/tts_tests/test_tacotron2_model.py similarity index 100% rename from tests/test_tacotron2_model.py rename to tests/tts_tests/test_tacotron2_model.py diff --git a/tests/test_tacotron2_tf_model.py b/tests/tts_tests/test_tacotron2_tf_model.py similarity index 100% rename from tests/test_tacotron2_tf_model.py rename to tests/tts_tests/test_tacotron2_tf_model.py diff --git a/tests/test_layers.py b/tests/tts_tests/test_tacotron_layers.py similarity index 100% rename from tests/test_layers.py rename to tests/tts_tests/test_tacotron_layers.py diff --git 
a/tests/test_tacotron_model.py b/tests/tts_tests/test_tacotron_model.py similarity index 100% rename from tests/test_tacotron_model.py rename to tests/tts_tests/test_tacotron_model.py diff --git a/tests/vocoder_tests/test_fullband_melgan_train.py b/tests/vocoder_tests/test_fullband_melgan_train.py index d9bc51ac..2b286b91 100644 --- a/tests/vocoder_tests/test_fullband_melgan_train.py +++ b/tests/vocoder_tests/test_fullband_melgan_train.py @@ -20,6 +20,7 @@ config = FullbandMelganConfig( eval_split_size=1, print_step=1, print_eval=True, + discriminator_model_params={"base_channels": 16, "max_channels": 256, "downsample_factors": [4, 4, 4]}, data_path="tests/data/ljspeech", output_path=output_path, ) diff --git a/tests/vocoder_tests/test_melgan_train.py b/tests/vocoder_tests/test_melgan_train.py index de48ca24..3ff65b5a 100644 --- a/tests/vocoder_tests/test_melgan_train.py +++ b/tests/vocoder_tests/test_melgan_train.py @@ -19,6 +19,7 @@ config = MelganConfig( seq_len=2048, eval_split_size=1, print_step=1, + discriminator_model_params={"base_channels": 16, "max_channels": 256, "downsample_factors": [4, 4, 4]}, print_eval=True, data_path="tests/data/ljspeech", output_path=output_path,
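Taken together, the changes above wire Japanese support end to end: TTS/tts/utils/text/japanese/phonemizer.py converts text to Julius-compatible phonemes via MeCab, text2phone() dispatches to it for the "ja-jp" language code, and the new kokoro() formatter plus the recipe and config consume the Kokoro Speech Dataset. A minimal usage sketch, assuming mecab-python3 and unidic-lite from requirements.txt are installed; the expected output string is copied from the new test cases:

# Exercise the new Japanese phonemizer the same way
# tests/text_tests/test_japanese_phonemizer.py does.
from TTS.tts.utils.text import text2phone
from TTS.tts.utils.text.japanese.phonemizer import japanese_text_to_phonemes

text = "どちらに行きますか?"
print(japanese_text_to_phonemes(text))  # expected: dochiraniikimasuka?
print(text2phone(text, "ja-jp"))        # same result via the generic dispatcher

On the dataset side, the kokoro() formatter added to TTS/tts/datasets/preprocess.py reads metadata.csv lines of the form "<wav_id>|...|<transcript>", resolves audio as wavs/<wav_id>.wav under the dataset root, and strips spaces from the third pipe-separated column before using it as the training text.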