fix Lint checks

Edresson 2021-06-18 14:33:50 -03:00
commit 28bec238ca
51 changed files with 814 additions and 147 deletions

.github/workflows/pypi-release.yml vendored Normal file

@ -0,0 +1,38 @@
name: Publish Python 🐍 distributions 📦 to PyPI
on:
release:
types: [published]
defaults:
run:
shell:
bash
jobs:
build-package:
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v2
- name: Verify tag matches version
run: |
set -ex
version=$(cat TTS/VERSION)
tag="${GITHUB_REF/refs\/tags\/}"
if [[ "v$version" != "$tag" ]]; then
exit 1
fi
- uses: actions/setup-python@v2
with:
python-version: 3.8
- run: |
python -m pip install -U pip setuptools twine toml
python -c 'import toml; c = toml.load("pyproject.toml"); print("\n".join(c["build-system"]["requires"]))' | pip install -r /dev/stdin
- run: |
python setup.py sdist
- name: Setup PyPI config
run: |
cat << EOF > ~/.pypirc
[pypi]
username=__token__
password=${{ secrets.PYPI_TOKEN }}
EOF
- run: |
twine upload --repository pypi dist/*.tar.gz


@ -158,7 +158,8 @@ disable=missing-docstring,
deprecated-sys-function, deprecated-sys-function,
exception-escape, exception-escape,
comprehension-escape, comprehension-escape,
duplicate-code duplicate-code,
not-callable
# Enable the message, report, category or checker with the given id(s). You can # Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifier separated by comma (,) or put this option # either give multiple identifier separated by comma (,) or put this option
@ -253,7 +254,7 @@ contextmanager-decorators=contextlib.contextmanager
# List of members which are set dynamically and missed by pylint inference # List of members which are set dynamically and missed by pylint inference
# system, and so shouldn't trigger E1101 when accessed. Python regular # system, and so shouldn't trigger E1101 when accessed. Python regular
# expressions are accepted. # expressions are accepted.
generated-members= generated-members=numpy.*,torch.*
# Tells whether missing members accessed in mixin class should be ignored. A # Tells whether missing members accessed in mixin class should be ignored. A
# mixin class is detected if its name ends with "mixin" (case insensitive). # mixin class is detected if its name ends with "mixin" (case insensitive).
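Both .pylintrc additions target well-known false positives when linting PyTorch code: not-callable is raised when pylint cannot tell that an nn.Module instance is callable, and E1101 (no-member) fires on attributes that numpy/torch generate in C extensions, which generated-members=numpy.*,torch.* now whitelists. A minimal illustration of code that pylint 2.8 may flag despite being valid at runtime (a sketch for context, not part of the commit):

import torch
from torch import nn

layer = nn.Linear(4, 2)
out = layer(torch.ones(1, 4))  # pylint may flag this call as not-callable
dtype = torch.float32          # and this attribute access as no-member (E1101)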


@ -1,6 +1,7 @@
include README.md include README.md
include LICENSE.txt include LICENSE.txt
include requirements.*.txt include requirements.*.txt
include TTS/VERSION
recursive-include TTS *.json recursive-include TTS *.json
recursive-include TTS *.html recursive-include TTS *.html
recursive-include TTS *.png recursive-include TTS *.png


@ -149,6 +149,18 @@
"needs_phonemizer": true "needs_phonemizer": true
} }
} }
},
"ja":{
"kokoro":{
"tacotron2-DDC":{
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.15/tts_models--jp--kokoro--tacotron2-DDC.zip",
"default_vocoder": "vocoder_models/universal/libri-tts/wavegrad",
"description": "Tacotron2 with Double Decoder Consistency trained with Kokoro Speech Dataset.",
"author": "@kaiidams",
"commit": "401fbd89",
"needs_phonemizer": false
}
}
} }
}, },
"vocoder_models":{ "vocoder_models":{

TTS/VERSION Normal file

@ -0,0 +1 @@
0.0.15


@ -1 +1,7 @@
from ._version import __version__ import os
with open(os.path.join(os.path.dirname(__file__), "VERSION")) as f:
version = f.read().strip()
__version__ = version
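With this change TTS/VERSION becomes the single source of truth for the package version: the package reads it at import time and setup.py (further down in this commit) reads the same file at build time. A quick sanity check, assuming the 0.0.15 value added above:

import TTS

# __version__ is now populated from the TTS/VERSION file rather than TTS/_version.py
print(TTS.__version__)  # -> "0.0.15"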


@ -1 +0,0 @@
__version__ = "0.0.14"


@ -6,12 +6,12 @@ import numpy as np
from tqdm import tqdm from tqdm import tqdm
from TTS.config import load_config from TTS.config import load_config
from TTS.config import BaseDatasetConfig, load_config
from TTS.speaker_encoder.utils.generic_utils import setup_model from TTS.speaker_encoder.utils.generic_utils import setup_model
from TTS.tts.datasets.preprocess import load_meta_data from TTS.tts.datasets.preprocess import load_meta_data
from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.speakers import SpeakerManager
from TTS.utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description='Compute embedding vectors for each wav file in a dataset.' description='Compute embedding vectors for each wav file in a dataset.'
) )
@ -74,6 +74,7 @@ for idx, wav_file in enumerate(tqdm(wav_files)):
if speaker_mapping: if speaker_mapping:
# save speaker_mapping if target dataset is defined # save speaker_mapping if target dataset is defined
if '.json' not in args.output_path and '.npy' not in args.output_path: if '.json' not in args.output_path and '.npy' not in args.output_path:
mapping_file_path = os.path.join(args.output_path, "speakers.json") mapping_file_path = os.path.join(args.output_path, "speakers.json")
mapping_npy_file_path = os.path.join(args.output_path, "speakers.npy") mapping_npy_file_path = os.path.join(args.output_path, "speakers.npy")
else: else:


@ -51,7 +51,7 @@ def main():
my_env["PYTHON_EGG_CACHE"] = "/tmp/tmp{}".format(i) my_env["PYTHON_EGG_CACHE"] = "/tmp/tmp{}".format(i)
command[-1] = "--rank={}".format(i) command[-1] = "--rank={}".format(i)
stdout = None if i == 0 else open(os.devnull, "w") stdout = None if i == 0 else open(os.devnull, "w")
p = subprocess.Popen(["python3"] + command, stdout=stdout, env=my_env) p = subprocess.Popen(["python3"] + command, stdout=stdout, env=my_env) # pylint: disable=consider-using-with
processes.append(p) processes.append(p)
print(command) print(command)


@ -299,4 +299,5 @@ if __name__ == "__main__":
args = parser.parse_args() args = parser.parse_args()
c = load_config(args.config_path) c = load_config(args.config_path)
c.audio['do_trim_silence'] = False # IMPORTANT!!!!!!!!!!!!!!! disable to align mel
main(args) main(args)


@ -10,10 +10,8 @@ import torch
from torch.utils.data import DataLoader from torch.utils.data import DataLoader
from TTS.speaker_encoder.dataset import SpeakerEncoderDataset from TTS.speaker_encoder.dataset import SpeakerEncoderDataset
from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss
from TTS.speaker_encoder.utils.generic_utils import save_best_model, setup_model from TTS.speaker_encoder.utils.generic_utils import save_best_model, setup_model
from TTS.speaker_encoder.utils.visual import plot_embeddings from TTS.speaker_encoder.utils.visual import plot_embeddings
from TTS.tts.datasets.preprocess import load_meta_data from TTS.tts.datasets.preprocess import load_meta_data
from TTS.utils.arguments import init_training from TTS.utils.arguments import init_training
@ -45,7 +43,7 @@ def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False
storage_size=c.storage["storage_size"], storage_size=c.storage["storage_size"],
sample_from_storage_p=c.storage["sample_from_storage_p"], sample_from_storage_p=c.storage["sample_from_storage_p"],
verbose=verbose, verbose=verbose,
augmentation_config=c.audio_augmentation augmentation_config=c.audio_augmentation,
) )
# sampler = DistributedSampler(dataset) if num_gpus > 1 else None # sampler = DistributedSampler(dataset) if num_gpus > 1 else None
@ -170,19 +168,18 @@ def main(args): # pylint: disable=redefined-outer-name
else: else:
raise Exception("The %s not is a loss supported" % c.loss) raise Exception("The %s not is a loss supported" % c.loss)
if args.restore_path: if args.restore_path:
checkpoint = torch.load(args.restore_path) checkpoint = torch.load(args.restore_path)
try: try:
model.load_state_dict(checkpoint["model"]) model.load_state_dict(checkpoint["model"])
if 'criterion' in checkpoint: if "criterion" in checkpoint:
criterion.load_state_dict(checkpoint["criterion"]) criterion.load_state_dict(checkpoint["criterion"])
except (KeyError, RuntimeError): except (KeyError, RuntimeError):
print(" > Partial model initialization.") print(" > Partial model initialization.")
model_dict = model.state_dict() model_dict = model.state_dict()
model_dict = set_init_dict(model_dict, checkpoint['model'], c) model_dict = set_init_dict(model_dict, checkpoint["model"], c)
model.load_state_dict(model_dict) model.load_state_dict(model_dict)
del model_dict del model_dict
for group in optimizer.param_groups: for group in optimizer.param_groups:


@ -99,7 +99,9 @@ if args.vocoder_path is not None:
vocoder_config_path = args.vocoder_config_path vocoder_config_path = args.vocoder_config_path
# load models # load models
synthesizer = Synthesizer(model_path, config_path, speakers_file_path, vocoder_path, vocoder_config_path, args.use_cuda) synthesizer = Synthesizer(
model_path, config_path, speakers_file_path, vocoder_path, vocoder_config_path, use_cuda=args.use_cuda
)
use_multi_speaker = synthesizer.speaker_manager is not None use_multi_speaker = synthesizer.speaker_manager is not None
# TODO: set this from SpeakerManager # TODO: set this from SpeakerManager


@ -1,11 +1,12 @@
import random import random
import numpy as np import numpy as np
import torch import torch
from torch.utils.data import Dataset from torch.utils.data import Dataset
from TTS.speaker_encoder.utils.generic_utils import AugmentWAV, Storage from TTS.speaker_encoder.utils.generic_utils import AugmentWAV, Storage
class SpeakerEncoderDataset(Dataset): class SpeakerEncoderDataset(Dataset):
def __init__( def __init__(
self, self,
@ -18,7 +19,7 @@ class SpeakerEncoderDataset(Dataset):
num_utter_per_speaker=10, num_utter_per_speaker=10,
skip_speakers=False, skip_speakers=False,
verbose=False, verbose=False,
augmentation_config=None augmentation_config=None,
): ):
""" """
Args: Args:
@ -38,7 +39,9 @@ class SpeakerEncoderDataset(Dataset):
self.verbose = verbose self.verbose = verbose
self.__parse_items() self.__parse_items()
storage_max_size = storage_size * num_speakers_in_batch storage_max_size = storage_size * num_speakers_in_batch
self.storage = Storage(maxsize=storage_max_size, storage_batchs=storage_size, num_speakers_in_batch=num_speakers_in_batch) self.storage = Storage(
maxsize=storage_max_size, storage_batchs=storage_size, num_speakers_in_batch=num_speakers_in_batch
)
self.sample_from_storage_p = float(sample_from_storage_p) self.sample_from_storage_p = float(sample_from_storage_p)
speakers_aux = list(self.speakers) speakers_aux = list(self.speakers)
@ -49,12 +52,12 @@ class SpeakerEncoderDataset(Dataset):
self.augmentator = None self.augmentator = None
self.gaussian_augmentation_config = None self.gaussian_augmentation_config = None
if augmentation_config: if augmentation_config:
self.data_augmentation_p = augmentation_config['p'] self.data_augmentation_p = augmentation_config["p"]
if self.data_augmentation_p and ('additive' in augmentation_config or 'rir' in augmentation_config): if self.data_augmentation_p and ("additive" in augmentation_config or "rir" in augmentation_config):
self.augmentator = AugmentWAV(ap, augmentation_config) self.augmentator = AugmentWAV(ap, augmentation_config)
if 'gaussian' in augmentation_config.keys(): if "gaussian" in augmentation_config.keys():
self.gaussian_augmentation_config = augmentation_config['gaussian'] self.gaussian_augmentation_config = augmentation_config["gaussian"]
if self.verbose: if self.verbose:
print("\n > DataLoader initialization") print("\n > DataLoader initialization")
@ -231,9 +234,13 @@ class SpeakerEncoderDataset(Dataset):
offset = random.randint(0, wav.shape[0] - self.seq_len) offset = random.randint(0, wav.shape[0] - self.seq_len)
wav = wav[offset : offset + self.seq_len] wav = wav[offset : offset + self.seq_len]
# add random gaussian noise # add random gaussian noise
if self.gaussian_augmentation_config and self.gaussian_augmentation_config['p']: if self.gaussian_augmentation_config and self.gaussian_augmentation_config["p"]:
if random.random() < self.gaussian_augmentation_config['p']: if random.random() < self.gaussian_augmentation_config["p"]:
wav += np.random.normal(self.gaussian_augmentation_config['min_amplitude'], self.gaussian_augmentation_config['max_amplitude'], size=len(wav)) wav += np.random.normal(
self.gaussian_augmentation_config["min_amplitude"],
self.gaussian_augmentation_config["max_amplitude"],
size=len(wav),
)
mel = self.ap.melspectrogram(wav) mel = self.ap.melspectrogram(wav)
feats_.append(torch.FloatTensor(mel)) feats_.append(torch.FloatTensor(mel))


@ -162,6 +162,7 @@ class AngleProtoLoss(nn.Module):
L = self.criterion(cos_sim_matrix, label) L = self.criterion(cos_sim_matrix, label)
return L return L
class SoftmaxLoss(nn.Module): class SoftmaxLoss(nn.Module):
""" """
Implementation of the Softmax loss as defined in https://arxiv.org/abs/2003.11982 Implementation of the Softmax loss as defined in https://arxiv.org/abs/2003.11982
@ -169,13 +170,14 @@ class SoftmaxLoss(nn.Module):
- embedding_dim (float): speaker embedding dim - embedding_dim (float): speaker embedding dim
- n_speakers (float): number of speakers - n_speakers (float): number of speakers
""" """
def __init__(self, embedding_dim, n_speakers): def __init__(self, embedding_dim, n_speakers):
super().__init__() super().__init__()
self.criterion = torch.nn.CrossEntropyLoss() self.criterion = torch.nn.CrossEntropyLoss()
self.fc = nn.Linear(embedding_dim, n_speakers) self.fc = nn.Linear(embedding_dim, n_speakers)
print('Initialised Softmax Loss') print("Initialised Softmax Loss")
def forward(self, x, label=None): def forward(self, x, label=None):
# reshape for compatibility # reshape for compatibility
@ -187,6 +189,7 @@ class SoftmaxLoss(nn.Module):
return L return L
class SoftmaxAngleProtoLoss(nn.Module): class SoftmaxAngleProtoLoss(nn.Module):
""" """
Implementation of the Softmax AnglePrototypical loss as defined in https://arxiv.org/abs/2009.14153 Implementation of the Softmax AnglePrototypical loss as defined in https://arxiv.org/abs/2009.14153
@ -196,13 +199,14 @@ class SoftmaxAngleProtoLoss(nn.Module):
- init_w (float): defines the initial value of w - init_w (float): defines the initial value of w
- init_b (float): definies the initial value of b - init_b (float): definies the initial value of b
""" """
def __init__(self, embedding_dim, n_speakers, init_w=10.0, init_b=-5.0): def __init__(self, embedding_dim, n_speakers, init_w=10.0, init_b=-5.0):
super().__init__() super().__init__()
self.softmax = SoftmaxLoss(embedding_dim, n_speakers) self.softmax = SoftmaxLoss(embedding_dim, n_speakers)
self.angleproto = AngleProtoLoss(init_w, init_b) self.angleproto = AngleProtoLoss(init_w, init_b)
print('Initialised SoftmaxAnglePrototypical Loss') print("Initialised SoftmaxAnglePrototypical Loss")
def forward(self, x, label=None): def forward(self, x, label=None):
""" """


@ -1,7 +1,8 @@
import torch
import numpy as np import numpy as np
import torch
import torch.nn as nn import torch.nn as nn
class SELayer(nn.Module): class SELayer(nn.Module):
def __init__(self, channel, reduction=8): def __init__(self, channel, reduction=8):
super(SELayer, self).__init__() super(SELayer, self).__init__()
@ -10,7 +11,7 @@ class SELayer(nn.Module):
nn.Linear(channel, channel // reduction), nn.Linear(channel, channel // reduction),
nn.ReLU(inplace=True), nn.ReLU(inplace=True),
nn.Linear(channel // reduction, channel), nn.Linear(channel // reduction, channel),
nn.Sigmoid() nn.Sigmoid(),
) )
def forward(self, x): def forward(self, x):
@ -19,6 +20,7 @@ class SELayer(nn.Module):
y = self.fc(y).view(b, c, 1, 1) y = self.fc(y).view(b, c, 1, 1)
return x * y return x * y
class SEBasicBlock(nn.Module): class SEBasicBlock(nn.Module):
expansion = 1 expansion = 1
@ -51,12 +53,22 @@ class SEBasicBlock(nn.Module):
out = self.relu(out) out = self.relu(out)
return out return out
class ResNetSpeakerEncoder(nn.Module): class ResNetSpeakerEncoder(nn.Module):
"""Implementation of the model H/ASP without batch normalization in speaker embedding. This model was proposed in: https://arxiv.org/abs/2009.14153 """Implementation of the model H/ASP without batch normalization in speaker embedding. This model was proposed in: https://arxiv.org/abs/2009.14153
Adapted from: https://github.com/clovaai/voxceleb_trainer Adapted from: https://github.com/clovaai/voxceleb_trainer
""" """
# pylint: disable=W0102 # pylint: disable=W0102
def __init__(self, input_dim=64, proj_dim=512, layers=[3, 4, 6, 3], num_filters=[32, 64, 128, 256], encoder_type='ASP', log_input=False): def __init__(
self,
input_dim=64,
proj_dim=512,
layers=[3, 4, 6, 3],
num_filters=[32, 64, 128, 256],
encoder_type="ASP",
log_input=False,
):
super(ResNetSpeakerEncoder, self).__init__() super(ResNetSpeakerEncoder, self).__init__()
self.encoder_type = encoder_type self.encoder_type = encoder_type
@ -89,7 +101,7 @@ class ResNetSpeakerEncoder(nn.Module):
elif self.encoder_type == "ASP": elif self.encoder_type == "ASP":
out_dim = num_filters[3] * outmap_size * 2 out_dim = num_filters[3] * outmap_size * 2
else: else:
raise ValueError('Undefined encoder') raise ValueError("Undefined encoder")
self.fc = nn.Linear(out_dim, proj_dim) self.fc = nn.Linear(out_dim, proj_dim)
@ -98,7 +110,7 @@ class ResNetSpeakerEncoder(nn.Module):
def _init_layers(self): def _init_layers(self):
for m in self.modules(): for m in self.modules():
if isinstance(m, nn.Conv2d): if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
elif isinstance(m, nn.BatchNorm2d): elif isinstance(m, nn.BatchNorm2d):
nn.init.constant_(m.weight, 1) nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0) nn.init.constant_(m.bias, 0)
@ -107,8 +119,7 @@ class ResNetSpeakerEncoder(nn.Module):
downsample = None downsample = None
if stride != 1 or self.inplanes != planes * block.expansion: if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential( downsample = nn.Sequential(
nn.Conv2d(self.inplanes, planes * block.expansion, nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False),
kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(planes * block.expansion), nn.BatchNorm2d(planes * block.expansion),
) )


@ -25,10 +25,7 @@ class SpeakerEncoderConfig(BaseTrainingConfig):
} }
) )
audio_augmentation : dict = field( audio_augmentation: dict = field(default_factory=lambda: {})
default_factory=lambda: {
}
)
storage: dict = field( storage: dict = field(
default_factory=lambda: { default_factory=lambda: {


@ -1,18 +1,18 @@
import re import datetime
import glob
import os import os
import random
import re
from multiprocessing import Manager
import numpy as np import numpy as np
import torch import torch
import glob
import random
import datetime
from scipy import signal from scipy import signal
from multiprocessing import Manager
from TTS.speaker_encoder.models.lstm import LSTMSpeakerEncoder from TTS.speaker_encoder.models.lstm import LSTMSpeakerEncoder
from TTS.speaker_encoder.models.resnet import ResNetSpeakerEncoder from TTS.speaker_encoder.models.resnet import ResNetSpeakerEncoder
class Storage(object): class Storage(object):
def __init__(self, maxsize, storage_batchs, num_speakers_in_batch, num_threads=8): def __init__(self, maxsize, storage_batchs, num_speakers_in_batch, num_threads=8):
# use multiprocessing for threading safe # use multiprocessing for threading safe
@ -53,19 +53,19 @@ class Storage(object):
return self.storage[random.randint(0, storage_size)] return self.storage[random.randint(0, storage_size)]
def get_random_sample_fast(self): def get_random_sample_fast(self):
'''Call this method only when storage is full''' """Call this method only when storage is full"""
return self.storage[random.randint(0, self.safe_storage_size)] return self.storage[random.randint(0, self.safe_storage_size)]
class AugmentWAV(object):
class AugmentWAV(object):
def __init__(self, ap, augmentation_config): def __init__(self, ap, augmentation_config):
self.ap = ap self.ap = ap
self.use_additive_noise = False self.use_additive_noise = False
if 'additive' in augmentation_config.keys(): if "additive" in augmentation_config.keys():
self.additive_noise_config = augmentation_config['additive'] self.additive_noise_config = augmentation_config["additive"]
additive_path = self.additive_noise_config['sounds_path'] additive_path = self.additive_noise_config["sounds_path"]
if additive_path: if additive_path:
self.use_additive_noise = True self.use_additive_noise = True
# get noise types # get noise types
@ -74,12 +74,12 @@ class AugmentWAV(object):
if isinstance(self.additive_noise_config[key], dict): if isinstance(self.additive_noise_config[key], dict):
self.additive_noise_types.append(key) self.additive_noise_types.append(key)
additive_files = glob.glob(os.path.join(additive_path, '**/*.wav'), recursive=True) additive_files = glob.glob(os.path.join(additive_path, "**/*.wav"), recursive=True)
self.noise_list = {} self.noise_list = {}
for wav_file in additive_files: for wav_file in additive_files:
noise_dir = wav_file.replace(additive_path, '').split(os.sep)[0] noise_dir = wav_file.replace(additive_path, "").split(os.sep)[0]
# ignore not listed directories # ignore not listed directories
if noise_dir not in self.additive_noise_types: if noise_dir not in self.additive_noise_types:
continue continue
@ -87,14 +87,16 @@ class AugmentWAV(object):
self.noise_list[noise_dir] = [] self.noise_list[noise_dir] = []
self.noise_list[noise_dir].append(wav_file) self.noise_list[noise_dir].append(wav_file)
print(f" | > Using Additive Noise Augmentation: with {len(additive_files)} audios instances from {self.additive_noise_types}") print(
f" | > Using Additive Noise Augmentation: with {len(additive_files)} audios instances from {self.additive_noise_types}"
)
self.use_rir = False self.use_rir = False
if 'rir' in augmentation_config.keys(): if "rir" in augmentation_config.keys():
self.rir_config = augmentation_config['rir'] self.rir_config = augmentation_config["rir"]
if self.rir_config['rir_path']: if self.rir_config["rir_path"]:
self.rir_files = glob.glob(os.path.join(self.rir_config['rir_path'], '**/*.wav'), recursive=True) self.rir_files = glob.glob(os.path.join(self.rir_config["rir_path"], "**/*.wav"), recursive=True)
self.use_rir = True self.use_rir = True
print(f" | > Using RIR Noise Augmentation: with {len(self.rir_files)} audios instances") print(f" | > Using RIR Noise Augmentation: with {len(self.rir_files)} audios instances")
@ -113,7 +115,13 @@ class AugmentWAV(object):
clean_db = 10 * np.log10(np.mean(audio ** 2) + 1e-4) clean_db = 10 * np.log10(np.mean(audio ** 2) + 1e-4)
noise_list = random.sample(self.noise_list[noise_type], random.randint(self.additive_noise_config[noise_type]['min_num_noises'], self.additive_noise_config[noise_type]['max_num_noises'])) noise_list = random.sample(
self.noise_list[noise_type],
random.randint(
self.additive_noise_config[noise_type]["min_num_noises"],
self.additive_noise_config[noise_type]["max_num_noises"],
),
)
audio_len = audio.shape[0] audio_len = audio.shape[0]
noises_wav = None noises_wav = None
@ -123,7 +131,10 @@ class AugmentWAV(object):
if noiseaudio.shape[0] < audio_len: if noiseaudio.shape[0] < audio_len:
continue continue
noise_snr = random.uniform(self.additive_noise_config[noise_type]['min_snr_in_db'], self.additive_noise_config[noise_type]['max_num_noises']) noise_snr = random.uniform(
self.additive_noise_config[noise_type]["min_snr_in_db"],
self.additive_noise_config[noise_type]["max_num_noises"],
)
noise_db = 10 * np.log10(np.mean(noiseaudio ** 2) + 1e-4) noise_db = 10 * np.log10(np.mean(noiseaudio ** 2) + 1e-4)
noise_wav = np.sqrt(10 ** ((clean_db - noise_db - noise_snr) / 10)) * noiseaudio noise_wav = np.sqrt(10 ** ((clean_db - noise_db - noise_snr) / 10)) * noiseaudio
@ -144,7 +155,7 @@ class AugmentWAV(object):
rir_file = random.choice(self.rir_files) rir_file = random.choice(self.rir_files)
rir = self.ap.load_wav(rir_file, sr=self.ap.sample_rate) rir = self.ap.load_wav(rir_file, sr=self.ap.sample_rate)
rir = rir / np.sqrt(np.sum(rir ** 2)) rir = rir / np.sqrt(np.sum(rir ** 2))
return signal.convolve(audio, rir, mode=self.rir_config['conv_mode'])[:audio_len] return signal.convolve(audio, rir, mode=self.rir_config["conv_mode"])[:audio_len]
def apply_one(self, audio): def apply_one(self, audio):
noise_type = random.choice(self.global_noise_list) noise_type = random.choice(self.global_noise_list)
@ -153,17 +164,25 @@ class AugmentWAV(object):
return self.additive_noise(noise_type, audio) return self.additive_noise(noise_type, audio)
def to_camel(text): def to_camel(text):
text = text.capitalize() text = text.capitalize()
return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text) return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text)
def setup_model(c): def setup_model(c):
if c.model_params['model_name'].lower() == 'lstm': if c.model_params["model_name"].lower() == "lstm":
model = LSTMSpeakerEncoder(c.model_params["input_dim"], c.model_params["proj_dim"], c.model_params["lstm_dim"], c.model_params["num_lstm_layers"]) model = LSTMSpeakerEncoder(
elif c.model_params['model_name'].lower() == 'resnet': c.model_params["input_dim"],
c.model_params["proj_dim"],
c.model_params["lstm_dim"],
c.model_params["num_lstm_layers"],
)
elif c.model_params["model_name"].lower() == "resnet":
model = ResNetSpeakerEncoder(input_dim=c.model_params["input_dim"], proj_dim=c.model_params["proj_dim"]) model = ResNetSpeakerEncoder(input_dim=c.model_params["input_dim"], proj_dim=c.model_params["proj_dim"])
return model return model
def save_checkpoint(model, optimizer, criterion, model_loss, out_path, current_step, epoch): def save_checkpoint(model, optimizer, criterion, model_loss, out_path, current_step, epoch):
checkpoint_path = "checkpoint_{}.pth.tar".format(current_step) checkpoint_path = "checkpoint_{}.pth.tar".format(current_step)
checkpoint_path = os.path.join(out_path, checkpoint_path) checkpoint_path = os.path.join(out_path, checkpoint_path)


@ -441,3 +441,17 @@ def baker(root_path: str, meta_file: str) -> List[List[str]]:
wav_path = os.path.join(root_path, "clips_22", wav_name) wav_path = os.path.join(root_path, "clips_22", wav_name)
items.append([text, wav_path, speaker_name]) items.append([text, wav_path, speaker_name])
return items return items
def kokoro(root_path, meta_file):
"""Japanese single-speaker dataset from https://github.com/kaiidams/Kokoro-Speech-Dataset"""
txt_file = os.path.join(root_path, meta_file)
items = []
speaker_name = "kokoro"
with open(txt_file, "r") as ttf:
for line in ttf:
cols = line.split("|")
wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav")
text = cols[2].replace(" ", "")
items.append([text, wav_file, speaker_name])
return items
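For illustration, this is how the new kokoro formatter turns one metadata row into a dataset item. The sample row below is hypothetical, assuming the LJSpeech-style pipe-separated layout the code indexes into (wav id, raw text, space-segmented text):

# hypothetical metadata.csv row: <wav id>|<raw text>|<segmented text>
line = "meian_00001|吾輩は猫である|吾輩 は 猫 で ある"
cols = line.split("|")
wav_file = "wavs/" + cols[0] + ".wav"  # -> "wavs/meian_00001.wav"
text = cols[2].replace(" ", "")        # segmentation spaces removed -> "吾輩は猫である"
item = [text, wav_file, "kokoro"]      # [text, wav_path, speaker_name]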


@ -255,6 +255,7 @@ class Tacotron2(TacotronAbstract):
if self.num_speakers > 1: if self.num_speakers > 1:
if not self.embeddings_per_sample: if not self.embeddings_per_sample:
speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None] speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None]
speaker_embeddings = torch.unsqueeze(speaker_embeddings, 0).transpose(1, 2)
encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings)
decoder_outputs, alignments, stop_tokens = self.decoder.inference(encoder_outputs) decoder_outputs, alignments, stop_tokens = self.decoder.inference(encoder_outputs)
@ -277,6 +278,7 @@ class Tacotron2(TacotronAbstract):
if self.num_speakers > 1: if self.num_speakers > 1:
if not self.embeddings_per_sample: if not self.embeddings_per_sample:
speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None] speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None]
speaker_embeddings = torch.unsqueeze(speaker_embeddings, 0).transpose(1, 2)
encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings)
mel_outputs, alignments, stop_tokens = self.decoder.inference_truncated(encoder_outputs) mel_outputs, alignments, stop_tokens = self.decoder.inference_truncated(encoder_outputs)


@ -6,6 +6,7 @@ from packaging import version
from TTS.tts.utils.text import cleaners from TTS.tts.utils.text import cleaners
from TTS.tts.utils.text.chinese_mandarin.phonemizer import chinese_text_to_phonemes from TTS.tts.utils.text.chinese_mandarin.phonemizer import chinese_text_to_phonemes
from TTS.tts.utils.text.japanese.phonemizer import japanese_text_to_phonemes
from TTS.tts.utils.text.symbols import _bos, _eos, _punctuations, make_symbols, phonemes, symbols from TTS.tts.utils.text.symbols import _bos, _eos, _punctuations, make_symbols, phonemes, symbols
# pylint: disable=unnecessary-comprehension # pylint: disable=unnecessary-comprehension
@ -39,6 +40,11 @@ def text2phone(text, language):
if language == "zh-CN": if language == "zh-CN":
ph = chinese_text_to_phonemes(text) ph = chinese_text_to_phonemes(text)
return ph return ph
if language == "ja-jp":
ph = japanese_text_to_phonemes(text)
return ph
raise ValueError(f" [!] Language {language} is not supported for phonemization.") raise ValueError(f" [!] Language {language} is not supported for phonemization.")


@ -1,18 +1,6 @@
"""
Cleaners are transformations that run over the input text at both training and eval time.
Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
hyperparameter. Some cleaners are English-specific. You'll typically want to use:
1. "english_cleaners" for English text
2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
the Unidecode library (https://pypi.python.org/pypi/Unidecode)
3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
the symbols in symbols.py to match your data).
"""
import re import re
from unidecode import unidecode from anyascii import anyascii
from TTS.tts.utils.text.chinese_mandarin.numbers import replace_numbers_to_characters_in_text from TTS.tts.utils.text.chinese_mandarin.numbers import replace_numbers_to_characters_in_text
@ -47,7 +35,7 @@ def collapse_whitespace(text):
def convert_to_ascii(text): def convert_to_ascii(text):
return unidecode(text) return anyascii(text)
def remove_aux_symbols(text): def remove_aux_symbols(text):
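The behaviour of convert_to_ascii is meant to stay the same; only the transliteration backend moves from Unidecode to anyascii (swapped in requirements.txt further down). A small sketch of the replacement call, with an illustrative output:

from anyascii import anyascii

# fills the role unidecode.unidecode played before: transliterate text to plain ASCII
print(anyascii("Gölge"))  # -> "Golge" (illustrative)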


@ -0,0 +1,380 @@
# Convert Japanese text to phonemes which is
# compatible with Julius https://github.com/julius-speech/segmentation-kit
import re
import MeCab
_CONVRULES = [
# Conversion of 2 letters
"アァ/ a a",
"イィ/ i i",
"イェ/ i e",
"イャ/ y a",
"ウゥ/ u:",
"エェ/ e e",
"オォ/ o:",
"カァ/ k a:",
"キィ/ k i:",
"クゥ/ k u:",
"クャ/ ky a",
"クュ/ ky u",
"クョ/ ky o",
"ケェ/ k e:",
"コォ/ k o:",
"ガァ/ g a:",
"ギィ/ g i:",
"グゥ/ g u:",
"グャ/ gy a",
"グュ/ gy u",
"グョ/ gy o",
"ゲェ/ g e:",
"ゴォ/ g o:",
"サァ/ s a:",
"シィ/ sh i:",
"スゥ/ s u:",
"スャ/ sh a",
"スュ/ sh u",
"スョ/ sh o",
"セェ/ s e:",
"ソォ/ s o:",
"ザァ/ z a:",
"ジィ/ j i:",
"ズゥ/ z u:",
"ズャ/ zy a",
"ズュ/ zy u",
"ズョ/ zy o",
"ゼェ/ z e:",
"ゾォ/ z o:",
"タァ/ t a:",
"チィ/ ch i:",
"ツァ/ ts a",
"ツィ/ ts i",
"ツゥ/ ts u:",
"ツャ/ ch a",
"ツュ/ ch u",
"ツョ/ ch o",
"ツェ/ ts e",
"ツォ/ ts o",
"テェ/ t e:",
"トォ/ t o:",
"ダァ/ d a:",
"ヂィ/ j i:",
"ヅゥ/ d u:",
"ヅャ/ zy a",
"ヅュ/ zy u",
"ヅョ/ zy o",
"デェ/ d e:",
"ドォ/ d o:",
"ナァ/ n a:",
"ニィ/ n i:",
"ヌゥ/ n u:",
"ヌャ/ ny a",
"ヌュ/ ny u",
"ヌョ/ ny o",
"ネェ/ n e:",
"ノォ/ n o:",
"ハァ/ h a:",
"ヒィ/ h i:",
"フゥ/ f u:",
"フャ/ hy a",
"フュ/ hy u",
"フョ/ hy o",
"ヘェ/ h e:",
"ホォ/ h o:",
"バァ/ b a:",
"ビィ/ b i:",
"ブゥ/ b u:",
"フャ/ hy a",
"ブュ/ by u",
"フョ/ hy o",
"ベェ/ b e:",
"ボォ/ b o:",
"パァ/ p a:",
"ピィ/ p i:",
"プゥ/ p u:",
"プャ/ py a",
"プュ/ py u",
"プョ/ py o",
"ペェ/ p e:",
"ポォ/ p o:",
"マァ/ m a:",
"ミィ/ m i:",
"ムゥ/ m u:",
"ムャ/ my a",
"ムュ/ my u",
"ムョ/ my o",
"メェ/ m e:",
"モォ/ m o:",
"ヤァ/ y a:",
"ユゥ/ y u:",
"ユャ/ y a:",
"ユュ/ y u:",
"ユョ/ y o:",
"ヨォ/ y o:",
"ラァ/ r a:",
"リィ/ r i:",
"ルゥ/ r u:",
"ルャ/ ry a",
"ルュ/ ry u",
"ルョ/ ry o",
"レェ/ r e:",
"ロォ/ r o:",
"ワァ/ w a:",
"ヲォ/ o:",
"ディ/ d i",
"デェ/ d e:",
"デャ/ dy a",
"デュ/ dy u",
"デョ/ dy o",
"ティ/ t i",
"テェ/ t e:",
"テャ/ ty a",
"テュ/ ty u",
"テョ/ ty o",
"スィ/ s i",
"ズァ/ z u a",
"ズィ/ z i",
"ズゥ/ z u",
"ズャ/ zy a",
"ズュ/ zy u",
"ズョ/ zy o",
"ズェ/ z e",
"ズォ/ z o",
"キャ/ ky a",
"キュ/ ky u",
"キョ/ ky o",
"シャ/ sh a",
"シュ/ sh u",
"シェ/ sh e",
"ショ/ sh o",
"チャ/ ch a",
"チュ/ ch u",
"チェ/ ch e",
"チョ/ ch o",
"トゥ/ t u",
"トャ/ ty a",
"トュ/ ty u",
"トョ/ ty o",
"ドァ/ d o a",
"ドゥ/ d u",
"ドャ/ dy a",
"ドュ/ dy u",
"ドョ/ dy o",
"ドォ/ d o:",
"ニャ/ ny a",
"ニュ/ ny u",
"ニョ/ ny o",
"ヒャ/ hy a",
"ヒュ/ hy u",
"ヒョ/ hy o",
"ミャ/ my a",
"ミュ/ my u",
"ミョ/ my o",
"リャ/ ry a",
"リュ/ ry u",
"リョ/ ry o",
"ギャ/ gy a",
"ギュ/ gy u",
"ギョ/ gy o",
"ヂェ/ j e",
"ヂャ/ j a",
"ヂュ/ j u",
"ヂョ/ j o",
"ジェ/ j e",
"ジャ/ j a",
"ジュ/ j u",
"ジョ/ j o",
"ビャ/ by a",
"ビュ/ by u",
"ビョ/ by o",
"ピャ/ py a",
"ピュ/ py u",
"ピョ/ py o",
"ウァ/ u a",
"ウィ/ w i",
"ウェ/ w e",
"ウォ/ w o",
"ファ/ f a",
"フィ/ f i",
"フゥ/ f u",
"フャ/ hy a",
"フュ/ hy u",
"フョ/ hy o",
"フェ/ f e",
"フォ/ f o",
"ヴァ/ b a",
"ヴィ/ b i",
"ヴェ/ b e",
"ヴォ/ b o",
"ヴュ/ by u",
# Conversion of 1 letter
"ア/ a",
"イ/ i",
"ウ/ u",
"エ/ e",
"オ/ o",
"カ/ k a",
"キ/ k i",
"ク/ k u",
"ケ/ k e",
"コ/ k o",
"サ/ s a",
"シ/ sh i",
"ス/ s u",
"セ/ s e",
"ソ/ s o",
"タ/ t a",
"チ/ ch i",
"ツ/ ts u",
"テ/ t e",
"ト/ t o",
"ナ/ n a",
"ニ/ n i",
"ヌ/ n u",
"ネ/ n e",
"/ n o",
"ハ/ h a",
"ヒ/ h i",
"フ/ f u",
"ヘ/ h e",
"ホ/ h o",
"マ/ m a",
"ミ/ m i",
"ム/ m u",
"メ/ m e",
"モ/ m o",
"ラ/ r a",
"リ/ r i",
"ル/ r u",
"レ/ r e",
"ロ/ r o",
"ガ/ g a",
"ギ/ g i",
"グ/ g u",
"ゲ/ g e",
"ゴ/ g o",
"ザ/ z a",
"ジ/ j i",
"ズ/ z u",
"ゼ/ z e",
"ゾ/ z o",
"ダ/ d a",
"ヂ/ j i",
"ヅ/ z u",
"デ/ d e",
"ド/ d o",
"バ/ b a",
"ビ/ b i",
"ブ/ b u",
"ベ/ b e",
"ボ/ b o",
"パ/ p a",
"ピ/ p i",
"プ/ p u",
"ペ/ p e",
"ポ/ p o",
"ヤ/ y a",
"ユ/ y u",
"ヨ/ y o",
"ワ/ w a",
"ヰ/ i",
"ヱ/ e",
"ヲ/ o",
"ン/ N",
"ッ/ q",
"ヴ/ b u",
"ー/:",
# Try converting broken text
"ァ/ a",
"ィ/ i",
"ゥ/ u",
"ェ/ e",
"ォ/ o",
"ヮ/ w a",
"ォ/ o",
# Symbols
"、/ ,",
"。/ .",
"/ !",
"/ ?",
"・/ ,",
]
_COLON_RX = re.compile(":+")
_REJECT_RX = re.compile("[^ a-zA-Z:,.?]")
def _makerulemap():
l = [tuple(x.split("/")) for x in _CONVRULES]
return tuple({k: v for k, v in l if len(k) == i} for i in (1, 2))
_RULEMAP1, _RULEMAP2 = _makerulemap()
def kata2phoneme(text: str) -> str:
"""Convert katakana text to phonemes."""
text = text.strip()
res = ""
while text:
if len(text) >= 2:
x = _RULEMAP2.get(text[:2])
if x is not None:
text = text[2:]
res += x
continue
x = _RULEMAP1.get(text[0])
if x is not None:
text = text[1:]
res += x
continue
res += " " + text[0]
text = text[1:]
res = _COLON_RX.sub(":", res)
return res[1:]
_KATAKANA = "".join(chr(ch) for ch in range(ord(""), ord("") + 1))
_HIRAGANA = "".join(chr(ch) for ch in range(ord(""), ord("") + 1))
_HIRA2KATATRANS = str.maketrans(_HIRAGANA, _KATAKANA)
def hira2kata(text: str) -> str:
text = text.translate(_HIRA2KATATRANS)
return text.replace("う゛", "ヴ")
_SYMBOL_TOKENS = set(list("・、。？！"))
_NO_YOMI_TOKENS = set(list("「」『』―()[][] …"))
_TAGGER = MeCab.Tagger()
def text2kata(text: str) -> str:
parsed = _TAGGER.parse(text)
res = []
for line in parsed.split("\n"):
if line == "EOS":
break
parts = line.split("\t")
word, yomi = parts[0], parts[1]
if yomi:
res.append(yomi)
else:
if word in _SYMBOL_TOKENS:
res.append(word)
elif word in ("", ""):
res.append("")
elif word in _NO_YOMI_TOKENS:
pass
else:
res.append(word)
return hira2kata("".join(res))
def japanese_text_to_phonemes(text: str) -> str:
"""Convert Japanese text to phonemes."""
res = text2kata(text)
res = kata2phoneme(res)
return res.replace(" ", "")
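A minimal usage sketch for the new phonemizer, reusing one of the test sentences added later in this commit; it assumes MeCab is available via the mecab-python3 and unidic-lite packages added to requirements.txt:

from TTS.tts.utils.text.japanese.phonemizer import japanese_text_to_phonemes

print(japanese_text_to_phonemes("今日は温泉に、行きます。"))
# expected, per the new test case: "kyo:waoNseNni,ikimasu."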


@ -152,6 +152,7 @@ def process_args(args):
experiment_path = create_experiment_folder(config.output_path, config.run_name, args.debug) experiment_path = create_experiment_folder(config.output_path, config.run_name, args.debug)
audio_path = os.path.join(experiment_path, "test_audios") audio_path = os.path.join(experiment_path, "test_audios")
# setup rank 0 process in distributed training # setup rank 0 process in distributed training
tb_logger = None
if args.rank == 0: if args.rank == 0:
os.makedirs(audio_path, exist_ok=True) os.makedirs(audio_path, exist_ok=True)
new_fields = {} new_fields = {}


@ -149,7 +149,7 @@ class ModelManager(object):
def _download_zip_file(file_url, output): def _download_zip_file(file_url, output):
"""Download the github releases""" """Download the github releases"""
r = requests.get(file_url) r = requests.get(file_url)
z = zipfile.ZipFile(io.BytesIO(r.content)) with zipfile.ZipFile(io.BytesIO(r.content)) as z:
z.extractall(output) z.extractall(output)
for file_path in z.namelist()[1:]: for file_path in z.namelist()[1:]:
src_path = os.path.join(output, file_path) src_path = os.path.join(output, file_path)


@ -0,0 +1,23 @@
#!/bin/bash
# take the scripts's parent's directory to prefix all the output paths.
RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
CORPUS=kokoro-speech-v1_1-small
echo $RUN_DIR
if [ \! -d $RUN_DIR/$CORPUS ] ; then
echo "$RUN_DIR/$CORPUS doesn't exist."
echo "Follow the instruction of https://github.com/kaiidams/Kokoro-Speech-Dataset to make the corpus."
exit 1
fi
# create train-val splits
shuf $RUN_DIR/$CORPUS/metadata.csv > $RUN_DIR/$CORPUS/metadata_shuf.csv
head -n 8000 $RUN_DIR/$CORPUS/metadata_shuf.csv > $RUN_DIR/$CORPUS/metadata_train.csv
tail -n 812 $RUN_DIR/$CORPUS/metadata_shuf.csv > $RUN_DIR/$CORPUS/metadata_val.csv
# compute dataset mean and variance for normalization
python TTS/bin/compute_statistics.py $RUN_DIR/tacotron2-DDC.json $RUN_DIR/scale_stats.npy --data_path $RUN_DIR/$CORPUS/wavs/
# training ....
# change the GPU id if needed
CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tacotron.py --config_path $RUN_DIR/tacotron2-DDC.json \
--coqpit.output_path $RUN_DIR \
--coqpit.datasets.0.path $RUN_DIR/$CORPUS \
--coqpit.audio.stats_path $RUN_DIR/scale_stats.npy \
--coqpit.phoneme_cache_path $RUN_DIR/phoneme_cache \


@ -0,0 +1,125 @@
{
"datasets": [
{
"name": "kokoro",
"path": "DEFINE THIS",
"meta_file_train": "metadata.csv",
"meta_file_val": null
}
],
"audio": {
"fft_size": 1024,
"win_length": 1024,
"hop_length": 256,
"frame_length_ms": null,
"frame_shift_ms": null,
"sample_rate": 22050,
"preemphasis": 0.0,
"ref_level_db": 20,
"do_trim_silence": true,
"trim_db": 60,
"power": 1.5,
"griffin_lim_iters": 60,
"num_mels": 80,
"mel_fmin": 50.0,
"mel_fmax": 7600.0,
"spec_gain": 1,
"signal_norm": true,
"min_level_db": -100,
"symmetric_norm": true,
"max_norm": 4.0,
"clip_norm": true,
"stats_path": "scale_stats.npy"
},
"gst":{
"gst_style_input": null,
"gst_embedding_dim": 512,
"gst_num_heads": 4,
"gst_style_tokens": 10,
"gst_use_speaker_embedding": false
},
"model": "Tacotron2",
"run_name": "kokoro-ddc",
"run_description": "tacotron2 with DDC and differential spectral loss.",
"batch_size": 32,
"eval_batch_size": 16,
"mixed_precision": true,
"distributed": {
"backend": "nccl",
"url": "tcp:\/\/localhost:54321"
},
"reinit_layers": [],
"loss_masking": true,
"decoder_loss_alpha": 0.5,
"postnet_loss_alpha": 0.25,
"postnet_diff_spec_alpha": 0.25,
"decoder_diff_spec_alpha": 0.25,
"decoder_ssim_alpha": 0.5,
"postnet_ssim_alpha": 0.25,
"ga_alpha": 5.0,
"stopnet_pos_weight": 15.0,
"run_eval": true,
"test_delay_epochs": 10,
"test_sentences_file": null,
"noam_schedule": false,
"grad_clip": 1.0,
"epochs": 1000,
"lr": 0.0001,
"wd": 0.000001,
"warmup_steps": 4000,
"seq_len_norm": false,
"memory_size": -1,
"prenet_type": "original",
"prenet_dropout": true,
"attention_type": "original",
"windowing": false,
"use_forward_attn": false,
"forward_attn_mask": false,
"transition_agent": false,
"location_attn": true,
"bidirectional_decoder": false,
"double_decoder_consistency": true,
"ddc_r": 7,
"attention_heads": 4,
"attention_norm": "sigmoid",
"r": 7,
"gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]],
"stopnet": true,
"separate_stopnet": true,
"print_step": 25,
"tb_plot_step": 100,
"print_eval": false,
"save_step": 10000,
"checkpoint": true,
"keep_all_best": false,
"keep_after": 10000,
"tb_model_param_stats": false,
"text_cleaner": "basic_cleaners",
"enable_eos_bos_chars": false,
"num_loader_workers": 4,
"num_val_loader_workers": 4,
"batch_group_size": 4,
"min_seq_len": 6,
"max_seq_len": 153,
"compute_input_seq_cache": false,
"use_noise_augment": true,
"output_path": "DEFINE THIS",
"phoneme_cache_path": "DEFINE THIS",
"use_phonemes": true,
"phoneme_language": "ja-jp",
"characters": {
"pad": "_",
"eos": "~",
"bos": "^",
"characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
"punctuations": "!'(),-.:;? ",
"phonemes": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
},
"use_speaker_embedding": false,
"use_gst": false,
"use_external_speaker_embedding_file": false,
"external_speaker_embedding_file": "../../speakers-vctk-en.json"
}


@ -2,4 +2,4 @@ black
coverage coverage
isort isort
nose nose
pylint==2.7.4 pylint==2.8.3


@ -17,5 +17,8 @@ torch>=1.7
tqdm tqdm
numba==0.52 numba==0.52
umap-learn==0.4.6 umap-learn==0.4.6
unidecode==0.4.20 anyascii
coqpit coqpit
# japanese g2p deps
mecab-python3==1.0.3
unidic-lite==1.0.8


@ -4,7 +4,6 @@ import os
import subprocess import subprocess
import sys import sys
from distutils.version import LooseVersion from distutils.version import LooseVersion
from TTS._version import __version__
import numpy import numpy
import setuptools.command.build_py import setuptools.command.build_py
@ -12,82 +11,85 @@ import setuptools.command.develop
from Cython.Build import cythonize from Cython.Build import cythonize
from setuptools import Extension, find_packages, setup from setuptools import Extension, find_packages, setup
if LooseVersion(sys.version) < LooseVersion("3.6") or LooseVersion(sys.version) > LooseVersion("3.9"): if LooseVersion(sys.version) < LooseVersion("3.6") or LooseVersion(sys.version) > LooseVersion("3.9"):
raise RuntimeError( raise RuntimeError("TTS requires python >= 3.6 and <3.9 " "but your Python version is {}".format(sys.version))
"TTS requires python >= 3.6 and <3.9 "
"but your Python version is {}".format(sys.version)
)
version = __version__
cwd = os.path.dirname(os.path.abspath(__file__)) cwd = os.path.dirname(os.path.abspath(__file__))
cwd = os.path.dirname(os.path.abspath(__file__))
with open(os.path.join(cwd, "TTS", "VERSION")) as fin:
version = fin.read().strip()
class build_py(setuptools.command.build_py.build_py): # pylint: disable=too-many-ancestors class build_py(setuptools.command.build_py.build_py): # pylint: disable=too-many-ancestors
def run(self): def run(self):
self.create_version_file()
setuptools.command.build_py.build_py.run(self) setuptools.command.build_py.build_py.run(self)
@staticmethod
def create_version_file():
print('-- Building version ' + version)
version_path = os.path.join(cwd, 'version.py')
with open(version_path, 'w') as f:
f.write("__version__ = '{}'\n".format(version))
class develop(setuptools.command.develop.develop): class develop(setuptools.command.develop.develop):
def run(self): def run(self):
build_py.create_version_file()
setuptools.command.develop.develop.run(self) setuptools.command.develop.develop.run(self)
# The documentation for this feature is in server/README.md # The documentation for this feature is in server/README.md
package_data = ['TTS/server/templates/*'] package_data = ["TTS/server/templates/*"]
def pip_install(package_name): def pip_install(package_name):
subprocess.call([sys.executable, '-m', 'pip', 'install', package_name]) subprocess.call([sys.executable, "-m", "pip", "install", package_name])
requirements = open(os.path.join(cwd, 'requirements.txt'), 'r').readlines() requirements = open(os.path.join(cwd, "requirements.txt"), "r").readlines()
with open(os.path.join(cwd, 'requirements.notebooks.txt'), 'r') as f: with open(os.path.join(cwd, "requirements.notebooks.txt"), "r") as f:
requirements_notebooks = f.readlines() requirements_notebooks = f.readlines()
with open(os.path.join(cwd, 'requirements.dev.txt'), 'r') as f: with open(os.path.join(cwd, "requirements.dev.txt"), "r") as f:
requirements_dev = f.readlines() requirements_dev = f.readlines()
with open(os.path.join(cwd, 'requirements.tf.txt'), 'r') as f: with open(os.path.join(cwd, "requirements.tf.txt"), "r") as f:
requirements_tf = f.readlines() requirements_tf = f.readlines()
requirements_all = requirements_dev + requirements_notebooks + requirements_tf requirements_all = requirements_dev + requirements_notebooks + requirements_tf
with open('README.md', "r", encoding="utf-8") as readme_file: with open("README.md", "r", encoding="utf-8") as readme_file:
README = readme_file.read() README = readme_file.read()
exts = [Extension(name='TTS.tts.layers.glow_tts.monotonic_align.core', exts = [
sources=["TTS/tts/layers/glow_tts/monotonic_align/core.pyx"])] Extension(
name="TTS.tts.layers.glow_tts.monotonic_align.core",
sources=["TTS/tts/layers/glow_tts/monotonic_align/core.pyx"],
)
]
setup( setup(
name='TTS', name="TTS",
version=version, version=version,
url='https://github.com/coqui-ai/TTS', url="https://github.com/coqui-ai/TTS",
author='Eren Gölge', author="Eren Gölge",
author_email='egolge@coqui.ai', author_email="egolge@coqui.ai",
description='Deep learning for Text to Speech by Coqui.', description="Deep learning for Text to Speech by Coqui.",
long_description=README, long_description=README,
long_description_content_type="text/markdown", long_description_content_type="text/markdown",
license='MPL-2.0', license="MPL-2.0",
# cython # cython
include_dirs=numpy.get_include(), include_dirs=numpy.get_include(),
ext_modules=cythonize(exts, language_level=3), ext_modules=cythonize(exts, language_level=3),
# ext_modules=find_cython_extensions(), # ext_modules=find_cython_extensions(),
# package # package
include_package_data=True, include_package_data=True,
packages=find_packages(include=['TTS*']), packages=find_packages(include=["TTS*"]),
package_data={
"TTS": [
"VERSION",
]
},
project_urls={ project_urls={
'Documentation': 'https://github.com/coqui-ai/TTS/wiki', "Documentation": "https://github.com/coqui-ai/TTS/wiki",
'Tracker': 'https://github.com/coqui-ai/TTS/issues', "Tracker": "https://github.com/coqui-ai/TTS/issues",
'Repository': 'https://github.com/coqui-ai/TTS', "Repository": "https://github.com/coqui-ai/TTS",
'Discussions': 'https://github.com/coqui-ai/TTS/discussions', "Discussions": "https://github.com/coqui-ai/TTS/discussions",
}, },
cmdclass={ cmdclass={
'build_py': build_py, "build_py": build_py,
'develop': develop, "develop": develop,
# 'build_ext': build_ext # 'build_ext': build_ext
}, },
install_requires=requirements, install_requires=requirements,
@ -97,30 +99,25 @@ setup(
"notebooks": requirements_notebooks, "notebooks": requirements_notebooks,
"tf": requirements_tf, "tf": requirements_tf,
}, },
python_requires='>=3.6.0, <3.9', python_requires=">=3.6.0, <3.9",
entry_points={ entry_points={"console_scripts": ["tts=TTS.bin.synthesize:main", "tts-server = TTS.server.server:main"]},
'console_scripts': [
'tts=TTS.bin.synthesize:main',
'tts-server = TTS.server.server:main'
]
},
classifiers=[ classifiers=[
"Programming Language :: Python", "Programming Language :: Python",
"Programming Language :: Python :: 3", "Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.8",
'Development Status :: 3 - Alpha', "Development Status :: 3 - Alpha",
"Intended Audience :: Science/Research", "Intended Audience :: Science/Research",
"Intended Audience :: Developers", "Intended Audience :: Developers",
"Operating System :: POSIX :: Linux", "Operating System :: POSIX :: Linux",
'License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)', "License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)",
"Topic :: Software Development", "Topic :: Software Development",
"Topic :: Software Development :: Libraries :: Python Modules", "Topic :: Software Development :: Libraries :: Python Modules",
"Topic :: Multimedia :: Sound/Audio :: Speech", "Topic :: Multimedia :: Sound/Audio :: Speech",
"Topic :: Multimedia :: Sound/Audio", "Topic :: Multimedia :: Sound/Audio",
"Topic :: Multimedia", "Topic :: Multimedia",
"Topic :: Scientific/Engineering :: Artificial Intelligence" "Topic :: Scientific/Engineering :: Artificial Intelligence",
], ],
zip_safe=False zip_safe=False,
) )


@ -6,6 +6,7 @@ from tests import get_tests_input_path
from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss
from TTS.speaker_encoder.models.lstm import LSTMSpeakerEncoder from TTS.speaker_encoder.models.lstm import LSTMSpeakerEncoder
from TTS.speaker_encoder.models.resnet import ResNetSpeakerEncoder from TTS.speaker_encoder.models.resnet import ResNetSpeakerEncoder
file_path = get_tests_input_path() file_path = get_tests_input_path()
@ -39,6 +40,7 @@ class LSTMSpeakerEncoderTests(unittest.TestCase):
assert output.shape[1] == 256 assert output.shape[1] == 256
assert len(output.shape) == 2 assert len(output.shape) == 2
class ResNetSpeakerEncoderTests(unittest.TestCase): class ResNetSpeakerEncoderTests(unittest.TestCase):
# pylint: disable=R0201 # pylint: disable=R0201
def test_in_out(self): def test_in_out(self):
@ -65,6 +67,7 @@ class ResNetSpeakerEncoderTests(unittest.TestCase):
assert output.shape[1] == 256 assert output.shape[1] == 256
assert len(output.shape) == 2 assert len(output.shape) == 2
class GE2ELossTests(unittest.TestCase): class GE2ELossTests(unittest.TestCase):
# pylint: disable=R0201 # pylint: disable=R0201
def test_in_out(self): def test_in_out(self):
@ -92,6 +95,7 @@ class GE2ELossTests(unittest.TestCase):
output = loss.forward(dummy_input) output = loss.forward(dummy_input)
assert output.item() < 0.005 assert output.item() < 0.005
class AngleProtoLossTests(unittest.TestCase): class AngleProtoLossTests(unittest.TestCase):
# pylint: disable=R0201 # pylint: disable=R0201
def test_in_out(self): def test_in_out(self):
@ -121,6 +125,7 @@ class AngleProtoLossTests(unittest.TestCase):
output = loss.forward(dummy_input) output = loss.forward(dummy_input)
assert output.item() < 0.005 assert output.item() < 0.005
class SoftmaxAngleProtoLossTests(unittest.TestCase): class SoftmaxAngleProtoLossTests(unittest.TestCase):
# pylint: disable=R0201 # pylint: disable=R0201
def test_in_out(self): def test_in_out(self):


@ -46,7 +46,7 @@ run_cli(command_train)
shutil.rmtree(continue_path) shutil.rmtree(continue_path)
# test resnet speaker encoder # test resnet speaker encoder
config.model_params['model_name'] = "resnet" config.model_params["model_name"] = "resnet"
config.save_json(config_path) config.save_json(config_path)
# train the model for one epoch # train the model for one epoch


@ -0,0 +1,24 @@
import unittest
from TTS.tts.utils.text.japanese.phonemizer import japanese_text_to_phonemes
_TEST_CASES = """
どちらに行きますか？/dochiraniikimasuka?
今日は温泉に、行きます。/kyo:waoNseNni,ikimasu.
AからZまでです。/AkaraZmadedesu.
そうですね！/so:desune!
クジラは哺乳類です。/kujirawahonyu:ruidesu.
ヴィディオを見ます。/bidioomimasu.
ky o: w a o N s e N n i , i k i m a s u ./kyo:waoNseNni,ikimasu.
"""
class TestText(unittest.TestCase):
def test_japanese_text_to_phonemes(self):
for line in _TEST_CASES.strip().split("\n"):
text, phone = line.split("/")
self.assertEqual(japanese_text_to_phonemes(text), phone)
if __name__ == "__main__":
unittest.main()


@ -17,7 +17,7 @@ config = GlowTTSConfig(
text_cleaner="english_cleaners", text_cleaner="english_cleaners",
use_phonemes=True, use_phonemes=True,
phoneme_language="zh-CN", phoneme_language="zh-CN",
phoneme_cache_path='tests/data/ljspeech/phoneme_cache/', phoneme_cache_path="tests/data/ljspeech/phoneme_cache/",
run_eval=True, run_eval=True,
test_delay_epochs=-1, test_delay_epochs=-1,
epochs=1, epochs=1,


@ -17,7 +17,7 @@ config = SpeedySpeechConfig(
text_cleaner="english_cleaners", text_cleaner="english_cleaners",
use_phonemes=True, use_phonemes=True,
phoneme_language="zh-CN", phoneme_language="zh-CN",
phoneme_cache_path='tests/data/ljspeech/phoneme_cache/', phoneme_cache_path="tests/data/ljspeech/phoneme_cache/",
run_eval=True, run_eval=True,
test_delay_epochs=-1, test_delay_epochs=-1,
epochs=1, epochs=1,


@ -20,6 +20,7 @@ config = FullbandMelganConfig(
eval_split_size=1, eval_split_size=1,
print_step=1, print_step=1,
print_eval=True, print_eval=True,
discriminator_model_params={"base_channels": 16, "max_channels": 256, "downsample_factors": [4, 4, 4]},
data_path="tests/data/ljspeech", data_path="tests/data/ljspeech",
output_path=output_path, output_path=output_path,
) )


@ -19,6 +19,7 @@ config = MelganConfig(
seq_len=2048, seq_len=2048,
eval_split_size=1, eval_split_size=1,
print_step=1, print_step=1,
discriminator_model_params={"base_channels": 16, "max_channels": 256, "downsample_factors": [4, 4, 4]},
print_eval=True, print_eval=True,
data_path="tests/data/ljspeech", data_path="tests/data/ljspeech",
output_path=output_path, output_path=output_path,