mirror of https://github.com/coqui-ai/TTS.git
fix Lint checks
commit 28bec238ca
@@ -0,0 +1,38 @@
+name: Publish Python 🐍 distributions 📦 to PyPI
+on:
+  release:
+    types: [published]
+defaults:
+  run:
+    shell:
+      bash
+jobs:
+  build-package:
+    runs-on: ubuntu-20.04
+    steps:
+      - uses: actions/checkout@v2
+      - name: Verify tag matches version
+        run: |
+          set -ex
+          version=$(cat TTS/VERSION)
+          tag="${GITHUB_REF/refs\/tags\/}"
+          if [[ "v$version" != "$tag" ]]; then
+            exit 1
+          fi
+      - uses: actions/setup-python@v2
+        with:
+          python-version: 3.8
+      - run: |
+          python -m pip install -U pip setuptools twine toml
+          python -c 'import toml; c = toml.load("pyproject.toml"); print("\n".join(c["build-system"]["requires"]))' | pip install -r /dev/stdin
+      - run: |
+          python setup.py sdist
+      - name: Setup PyPI config
+        run: |
+          cat << EOF > ~/.pypirc
+          [pypi]
+          username=__token__
+          password=${{ secrets.PYPI_TOKEN }}
+          EOF
+      - run: |
+          twine upload --repository pypi dist/*.tar.gz

@@ -158,7 +158,8 @@ disable=missing-docstring,
         deprecated-sys-function,
         exception-escape,
         comprehension-escape,
-        duplicate-code
+        duplicate-code,
+        not-callable

 # Enable the message, report, category or checker with the given id(s). You can
 # either give multiple identifier separated by comma (,) or put this option

@@ -253,7 +254,7 @@ contextmanager-decorators=contextlib.contextmanager
 # List of members which are set dynamically and missed by pylint inference
 # system, and so shouldn't trigger E1101 when accessed. Python regular
 # expressions are accepted.
-generated-members=
+generated-members=numpy.*,torch.*

 # Tells whether missing members accessed in mixin class should be ignored. A
 # mixin class is detected if its name ends with "mixin" (case insensitive).

@@ -1,6 +1,7 @@
 include README.md
 include LICENSE.txt
 include requirements.*.txt
+include TTS/VERSION
 recursive-include TTS *.json
 recursive-include TTS *.html
 recursive-include TTS *.png

@@ -149,6 +149,18 @@
                 "needs_phonemizer": true
             }
         }
+        },
+        "ja":{
+            "kokoro":{
+                "tacotron2-DDC":{
+                    "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.15/tts_models--jp--kokoro--tacotron2-DDC.zip",
+                    "default_vocoder": "vocoder_models/universal/libri-tts/wavegrad",
+                    "description": "Tacotron2 with Double Decoder Consistency trained with Kokoro Speech Dataset.",
+                    "author": "@kaiidams",
+                    "commit": "401fbd89",
+                    "needs_phonemizer": false
+                }
+            }
         }
     },
     "vocoder_models":{

@@ -0,0 +1 @@
+0.0.15

@@ -1 +1,7 @@
-from ._version import __version__
+import os
+
+
+with open(os.path.join(os.path.dirname(__file__), "VERSION")) as f:
+    version = f.read().strip()
+
+__version__ = version

@@ -1 +0,0 @@
-__version__ = "0.0.14"

@@ -6,12 +6,12 @@ import numpy as np
 from tqdm import tqdm

-from TTS.config import load_config
+from TTS.config import BaseDatasetConfig, load_config
 from TTS.speaker_encoder.utils.generic_utils import setup_model
 from TTS.tts.datasets.preprocess import load_meta_data
 from TTS.tts.utils.speakers import SpeakerManager
 from TTS.utils.audio import AudioProcessor


 parser = argparse.ArgumentParser(
     description='Compute embedding vectors for each wav file in a dataset.'
 )

@@ -74,6 +74,7 @@ for idx, wav_file in enumerate(tqdm(wav_files)):
 if speaker_mapping:
     # save speaker_mapping if target dataset is defined
     if '.json' not in args.output_path and '.npy' not in args.output_path:
+
         mapping_file_path = os.path.join(args.output_path, "speakers.json")
         mapping_npy_file_path = os.path.join(args.output_path, "speakers.npy")
     else:

@@ -51,7 +51,7 @@ def main():
         my_env["PYTHON_EGG_CACHE"] = "/tmp/tmp{}".format(i)
         command[-1] = "--rank={}".format(i)
         stdout = None if i == 0 else open(os.devnull, "w")
-        p = subprocess.Popen(["python3"] + command, stdout=stdout, env=my_env)
+        p = subprocess.Popen(["python3"] + command, stdout=stdout, env=my_env)  # pylint: disable=consider-using-with
         processes.append(p)
         print(command)

@@ -299,4 +299,5 @@ if __name__ == "__main__":
     args = parser.parse_args()

     c = load_config(args.config_path)
+    c.audio['do_trim_silence'] = False  # IMPORTANT!!!!!!!!!!!!!!! disable to align mel
     main(args)

@@ -10,10 +10,8 @@ import torch
 from torch.utils.data import DataLoader

 from TTS.speaker_encoder.dataset import SpeakerEncoderDataset
-
 from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss
 from TTS.speaker_encoder.utils.generic_utils import save_best_model, setup_model
-
 from TTS.speaker_encoder.utils.visual import plot_embeddings
 from TTS.tts.datasets.preprocess import load_meta_data
 from TTS.utils.arguments import init_training

@@ -45,7 +43,7 @@ def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False
         storage_size=c.storage["storage_size"],
         sample_from_storage_p=c.storage["sample_from_storage_p"],
         verbose=verbose,
-        augmentation_config=c.audio_augmentation
+        augmentation_config=c.audio_augmentation,
     )

     # sampler = DistributedSampler(dataset) if num_gpus > 1 else None

@@ -170,19 +168,18 @@ def main(args):  # pylint: disable=redefined-outer-name
     else:
         raise Exception("The %s not is a loss supported" % c.loss)

-
     if args.restore_path:
         checkpoint = torch.load(args.restore_path)
         try:
             model.load_state_dict(checkpoint["model"])

-            if 'criterion' in checkpoint:
+            if "criterion" in checkpoint:
                 criterion.load_state_dict(checkpoint["criterion"])

         except (KeyError, RuntimeError):
             print(" > Partial model initialization.")
             model_dict = model.state_dict()
-            model_dict = set_init_dict(model_dict, checkpoint['model'], c)
+            model_dict = set_init_dict(model_dict, checkpoint["model"], c)
             model.load_state_dict(model_dict)
             del model_dict
             for group in optimizer.param_groups:

@@ -99,7 +99,9 @@ if args.vocoder_path is not None:
     vocoder_config_path = args.vocoder_config_path

 # load models
-synthesizer = Synthesizer(model_path, config_path, speakers_file_path, vocoder_path, vocoder_config_path, args.use_cuda)
+synthesizer = Synthesizer(
+    model_path, config_path, speakers_file_path, vocoder_path, vocoder_config_path, use_cuda=args.use_cuda
+)

 use_multi_speaker = synthesizer.speaker_manager is not None
 # TODO: set this from SpeakerManager

@@ -1,11 +1,12 @@
 import random
+
 import numpy as np
 import torch
 from torch.utils.data import Dataset

 from TTS.speaker_encoder.utils.generic_utils import AugmentWAV, Storage


 class SpeakerEncoderDataset(Dataset):
     def __init__(
         self,

@@ -18,7 +19,7 @@ class SpeakerEncoderDataset(Dataset):
         num_utter_per_speaker=10,
         skip_speakers=False,
         verbose=False,
-        augmentation_config=None
+        augmentation_config=None,
     ):
         """
         Args:

@@ -38,7 +39,9 @@ class SpeakerEncoderDataset(Dataset):
         self.verbose = verbose
         self.__parse_items()
         storage_max_size = storage_size * num_speakers_in_batch
-        self.storage = Storage(maxsize=storage_max_size, storage_batchs=storage_size, num_speakers_in_batch=num_speakers_in_batch)
+        self.storage = Storage(
+            maxsize=storage_max_size, storage_batchs=storage_size, num_speakers_in_batch=num_speakers_in_batch
+        )
         self.sample_from_storage_p = float(sample_from_storage_p)

         speakers_aux = list(self.speakers)

@@ -49,12 +52,12 @@ class SpeakerEncoderDataset(Dataset):
         self.augmentator = None
         self.gaussian_augmentation_config = None
         if augmentation_config:
-            self.data_augmentation_p = augmentation_config['p']
-            if self.data_augmentation_p and ('additive' in augmentation_config or 'rir' in augmentation_config):
+            self.data_augmentation_p = augmentation_config["p"]
+            if self.data_augmentation_p and ("additive" in augmentation_config or "rir" in augmentation_config):
                 self.augmentator = AugmentWAV(ap, augmentation_config)

-            if 'gaussian' in augmentation_config.keys():
-                self.gaussian_augmentation_config = augmentation_config['gaussian']
+            if "gaussian" in augmentation_config.keys():
+                self.gaussian_augmentation_config = augmentation_config["gaussian"]

         if self.verbose:
             print("\n > DataLoader initialization")

@@ -231,9 +234,13 @@ class SpeakerEncoderDataset(Dataset):
                 offset = random.randint(0, wav.shape[0] - self.seq_len)
                 wav = wav[offset : offset + self.seq_len]
                 # add random gaussian noise
-                if self.gaussian_augmentation_config and self.gaussian_augmentation_config['p']:
-                    if random.random() < self.gaussian_augmentation_config['p']:
-                        wav += np.random.normal(self.gaussian_augmentation_config['min_amplitude'], self.gaussian_augmentation_config['max_amplitude'], size=len(wav))
+                if self.gaussian_augmentation_config and self.gaussian_augmentation_config["p"]:
+                    if random.random() < self.gaussian_augmentation_config["p"]:
+                        wav += np.random.normal(
+                            self.gaussian_augmentation_config["min_amplitude"],
+                            self.gaussian_augmentation_config["max_amplitude"],
+                            size=len(wav),
+                        )
                 mel = self.ap.melspectrogram(wav)
                 feats_.append(torch.FloatTensor(mel))

@@ -162,6 +162,7 @@ class AngleProtoLoss(nn.Module):
         L = self.criterion(cos_sim_matrix, label)
         return L

+
 class SoftmaxLoss(nn.Module):
     """
     Implementation of the Softmax loss as defined in https://arxiv.org/abs/2003.11982

@@ -169,13 +170,14 @@ class SoftmaxLoss(nn.Module):
         - embedding_dim (float): speaker embedding dim
         - n_speakers (float): number of speakers
     """
+
     def __init__(self, embedding_dim, n_speakers):
         super().__init__()

         self.criterion = torch.nn.CrossEntropyLoss()
         self.fc = nn.Linear(embedding_dim, n_speakers)

-        print('Initialised Softmax Loss')
+        print("Initialised Softmax Loss")

     def forward(self, x, label=None):
         # reshape for compatibility

@@ -187,6 +189,7 @@ class SoftmaxLoss(nn.Module):

         return L

+
 class SoftmaxAngleProtoLoss(nn.Module):
     """
     Implementation of the Softmax AnglePrototypical loss as defined in https://arxiv.org/abs/2009.14153

@@ -196,13 +199,14 @@ class SoftmaxAngleProtoLoss(nn.Module):
         - init_w (float): defines the initial value of w
         - init_b (float): definies the initial value of b
     """
+
     def __init__(self, embedding_dim, n_speakers, init_w=10.0, init_b=-5.0):
         super().__init__()

         self.softmax = SoftmaxLoss(embedding_dim, n_speakers)
         self.angleproto = AngleProtoLoss(init_w, init_b)

-        print('Initialised SoftmaxAnglePrototypical Loss')
+        print("Initialised SoftmaxAnglePrototypical Loss")

     def forward(self, x, label=None):
         """

@@ -1,7 +1,8 @@
-import torch
 import numpy as np
+import torch
 import torch.nn as nn


 class SELayer(nn.Module):
     def __init__(self, channel, reduction=8):
         super(SELayer, self).__init__()

@@ -10,7 +11,7 @@ class SELayer(nn.Module):
             nn.Linear(channel, channel // reduction),
             nn.ReLU(inplace=True),
             nn.Linear(channel // reduction, channel),
-            nn.Sigmoid()
+            nn.Sigmoid(),
         )

     def forward(self, x):

@@ -19,6 +20,7 @@ class SELayer(nn.Module):
         y = self.fc(y).view(b, c, 1, 1)
         return x * y

+
 class SEBasicBlock(nn.Module):
     expansion = 1

@@ -51,12 +53,22 @@ class SEBasicBlock(nn.Module):
         out = self.relu(out)
         return out

+
 class ResNetSpeakerEncoder(nn.Module):
     """Implementation of the model H/ASP without batch normalization in speaker embedding. This model was proposed in: https://arxiv.org/abs/2009.14153
     Adapted from: https://github.com/clovaai/voxceleb_trainer
     """

     # pylint: disable=W0102
-    def __init__(self, input_dim=64, proj_dim=512, layers=[3, 4, 6, 3], num_filters=[32, 64, 128, 256], encoder_type='ASP', log_input=False):
+    def __init__(
+        self,
+        input_dim=64,
+        proj_dim=512,
+        layers=[3, 4, 6, 3],
+        num_filters=[32, 64, 128, 256],
+        encoder_type="ASP",
+        log_input=False,
+    ):
         super(ResNetSpeakerEncoder, self).__init__()

         self.encoder_type = encoder_type

@@ -89,7 +101,7 @@ class ResNetSpeakerEncoder(nn.Module):
         elif self.encoder_type == "ASP":
             out_dim = num_filters[3] * outmap_size * 2
         else:
-            raise ValueError('Undefined encoder')
+            raise ValueError("Undefined encoder")

         self.fc = nn.Linear(out_dim, proj_dim)

@@ -98,7 +110,7 @@ class ResNetSpeakerEncoder(nn.Module):
     def _init_layers(self):
         for m in self.modules():
             if isinstance(m, nn.Conv2d):
-                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
+                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
             elif isinstance(m, nn.BatchNorm2d):
                 nn.init.constant_(m.weight, 1)
                 nn.init.constant_(m.bias, 0)

@@ -107,8 +119,7 @@ class ResNetSpeakerEncoder(nn.Module):
         downsample = None
         if stride != 1 or self.inplanes != planes * block.expansion:
             downsample = nn.Sequential(
-                nn.Conv2d(self.inplanes, planes * block.expansion,
-                          kernel_size=1, stride=stride, bias=False),
+                nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False),
                 nn.BatchNorm2d(planes * block.expansion),
             )

@@ -25,10 +25,7 @@ class SpeakerEncoderConfig(BaseTrainingConfig):
         }
     )

-    audio_augmentation : dict = field(
-        default_factory=lambda: {
-        }
-    )
+    audio_augmentation: dict = field(default_factory=lambda: {})

     storage: dict = field(
         default_factory=lambda: {

@@ -1,18 +1,18 @@
-import re
+import datetime
+import glob
 import os
+import random
+import re
+from multiprocessing import Manager

 import numpy as np
 import torch
-import glob
-import random
-import datetime

 from scipy import signal
-from multiprocessing import Manager

 from TTS.speaker_encoder.models.lstm import LSTMSpeakerEncoder
 from TTS.speaker_encoder.models.resnet import ResNetSpeakerEncoder


 class Storage(object):
     def __init__(self, maxsize, storage_batchs, num_speakers_in_batch, num_threads=8):
         # use multiprocessing for threading safe

@@ -53,19 +53,19 @@ class Storage(object):
         return self.storage[random.randint(0, storage_size)]

     def get_random_sample_fast(self):
-        '''Call this method only when storage is full'''
+        """Call this method only when storage is full"""
         return self.storage[random.randint(0, self.safe_storage_size)]

-class AugmentWAV(object):
-
+
+class AugmentWAV(object):
     def __init__(self, ap, augmentation_config):

         self.ap = ap
         self.use_additive_noise = False

-        if 'additive' in augmentation_config.keys():
-            self.additive_noise_config = augmentation_config['additive']
-            additive_path = self.additive_noise_config['sounds_path']
+        if "additive" in augmentation_config.keys():
+            self.additive_noise_config = augmentation_config["additive"]
+            additive_path = self.additive_noise_config["sounds_path"]
             if additive_path:
                 self.use_additive_noise = True
                 # get noise types

@@ -74,12 +74,12 @@ class AugmentWAV(object):
             if isinstance(self.additive_noise_config[key], dict):
                 self.additive_noise_types.append(key)

-        additive_files = glob.glob(os.path.join(additive_path, '**/*.wav'), recursive=True)
+        additive_files = glob.glob(os.path.join(additive_path, "**/*.wav"), recursive=True)

         self.noise_list = {}

         for wav_file in additive_files:
-            noise_dir = wav_file.replace(additive_path, '').split(os.sep)[0]
+            noise_dir = wav_file.replace(additive_path, "").split(os.sep)[0]
             # ignore not listed directories
             if noise_dir not in self.additive_noise_types:
                 continue

@@ -87,14 +87,16 @@ class AugmentWAV(object):
                 self.noise_list[noise_dir] = []
             self.noise_list[noise_dir].append(wav_file)

-        print(f" | > Using Additive Noise Augmentation: with {len(additive_files)} audios instances from {self.additive_noise_types}")
+        print(
+            f" | > Using Additive Noise Augmentation: with {len(additive_files)} audios instances from {self.additive_noise_types}"
+        )

         self.use_rir = False

-        if 'rir' in augmentation_config.keys():
-            self.rir_config = augmentation_config['rir']
-            if self.rir_config['rir_path']:
-                self.rir_files = glob.glob(os.path.join(self.rir_config['rir_path'], '**/*.wav'), recursive=True)
+        if "rir" in augmentation_config.keys():
+            self.rir_config = augmentation_config["rir"]
+            if self.rir_config["rir_path"]:
+                self.rir_files = glob.glob(os.path.join(self.rir_config["rir_path"], "**/*.wav"), recursive=True)
                 self.use_rir = True

                 print(f" | > Using RIR Noise Augmentation: with {len(self.rir_files)} audios instances")

@@ -113,7 +115,13 @@ class AugmentWAV(object):

         clean_db = 10 * np.log10(np.mean(audio ** 2) + 1e-4)

-        noise_list = random.sample(self.noise_list[noise_type], random.randint(self.additive_noise_config[noise_type]['min_num_noises'], self.additive_noise_config[noise_type]['max_num_noises']))
+        noise_list = random.sample(
+            self.noise_list[noise_type],
+            random.randint(
+                self.additive_noise_config[noise_type]["min_num_noises"],
+                self.additive_noise_config[noise_type]["max_num_noises"],
+            ),
+        )

         audio_len = audio.shape[0]
         noises_wav = None

@@ -123,7 +131,10 @@ class AugmentWAV(object):
             if noiseaudio.shape[0] < audio_len:
                 continue

-            noise_snr = random.uniform(self.additive_noise_config[noise_type]['min_snr_in_db'], self.additive_noise_config[noise_type]['max_num_noises'])
+            noise_snr = random.uniform(
+                self.additive_noise_config[noise_type]["min_snr_in_db"],
+                self.additive_noise_config[noise_type]["max_num_noises"],
+            )
             noise_db = 10 * np.log10(np.mean(noiseaudio ** 2) + 1e-4)
             noise_wav = np.sqrt(10 ** ((clean_db - noise_db - noise_snr) / 10)) * noiseaudio

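Aside: the `noise_wav` line in the hunk above is standard SNR-based noise scaling; for a target SNR s in dB, the noise is multiplied by sqrt(10 ** ((clean_db - noise_db - s) / 10)). A self-contained sketch of that arithmetic (hypothetical helper name, not part of the patch):

    import numpy as np

    def scale_noise_for_snr(clean, noise, snr_db):
        # signal powers in dB; the 1e-4 term guards against log10(0), as in the patch
        clean_db = 10 * np.log10(np.mean(clean ** 2) + 1e-4)
        noise_db = 10 * np.log10(np.mean(noise ** 2) + 1e-4)
        # amplitude gain that leaves the scaled noise snr_db below the clean signal
        gain = np.sqrt(10 ** ((clean_db - noise_db - snr_db) / 10))
        return gain * noise
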
@@ -144,7 +155,7 @@ class AugmentWAV(object):
         rir_file = random.choice(self.rir_files)
         rir = self.ap.load_wav(rir_file, sr=self.ap.sample_rate)
         rir = rir / np.sqrt(np.sum(rir ** 2))
-        return signal.convolve(audio, rir, mode=self.rir_config['conv_mode'])[:audio_len]
+        return signal.convolve(audio, rir, mode=self.rir_config["conv_mode"])[:audio_len]

     def apply_one(self, audio):
         noise_type = random.choice(self.global_noise_list)

@@ -153,17 +164,25 @@ class AugmentWAV(object):

         return self.additive_noise(noise_type, audio)


 def to_camel(text):
     text = text.capitalize()
     return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text)


 def setup_model(c):
-    if c.model_params['model_name'].lower() == 'lstm':
-        model = LSTMSpeakerEncoder(c.model_params["input_dim"], c.model_params["proj_dim"], c.model_params["lstm_dim"], c.model_params["num_lstm_layers"])
-    elif c.model_params['model_name'].lower() == 'resnet':
+    if c.model_params["model_name"].lower() == "lstm":
+        model = LSTMSpeakerEncoder(
+            c.model_params["input_dim"],
+            c.model_params["proj_dim"],
+            c.model_params["lstm_dim"],
+            c.model_params["num_lstm_layers"],
+        )
+    elif c.model_params["model_name"].lower() == "resnet":
         model = ResNetSpeakerEncoder(input_dim=c.model_params["input_dim"], proj_dim=c.model_params["proj_dim"])
     return model


 def save_checkpoint(model, optimizer, criterion, model_loss, out_path, current_step, epoch):
     checkpoint_path = "checkpoint_{}.pth.tar".format(current_step)
     checkpoint_path = os.path.join(out_path, checkpoint_path)

@@ -441,3 +441,17 @@ def baker(root_path: str, meta_file: str) -> List[List[str]]:
         wav_path = os.path.join(root_path, "clips_22", wav_name)
         items.append([text, wav_path, speaker_name])
     return items
+
+
+def kokoro(root_path, meta_file):
+    """Japanese single-speaker dataset from https://github.com/kaiidams/Kokoro-Speech-Dataset"""
+    txt_file = os.path.join(root_path, meta_file)
+    items = []
+    speaker_name = "kokoro"
+    with open(txt_file, "r") as ttf:
+        for line in ttf:
+            cols = line.split("|")
+            wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav")
+            text = cols[2].replace(" ", "")
+            items.append([text, wav_file, speaker_name])
+    return items

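Aside: a minimal sketch of how the new kokoro formatter would be called (the dataset path here is hypothetical; the layout follows the Kokoro-Speech-Dataset instructions linked in the docstring):

    from TTS.tts.datasets.preprocess import kokoro

    # each item is [text, wav_path, speaker_name], matching the other formatters
    items = kokoro("/data/kokoro-speech-v1_1-small", "metadata.csv")
    print(items[0])
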
@@ -255,6 +255,7 @@ class Tacotron2(TacotronAbstract):
         if self.num_speakers > 1:
             if not self.embeddings_per_sample:
                 speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None]
+                speaker_embeddings = torch.unsqueeze(speaker_embeddings, 0).transpose(1, 2)
             encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings)

         decoder_outputs, alignments, stop_tokens = self.decoder.inference(encoder_outputs)

@@ -277,6 +278,7 @@ class Tacotron2(TacotronAbstract):
         if self.num_speakers > 1:
             if not self.embeddings_per_sample:
                 speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None]
+                speaker_embeddings = torch.unsqueeze(speaker_embeddings, 0).transpose(1, 2)
             encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings)

         mel_outputs, alignments, stop_tokens = self.decoder.inference_truncated(encoder_outputs)

@@ -6,6 +6,7 @@ from packaging import version

 from TTS.tts.utils.text import cleaners
 from TTS.tts.utils.text.chinese_mandarin.phonemizer import chinese_text_to_phonemes
+from TTS.tts.utils.text.japanese.phonemizer import japanese_text_to_phonemes
 from TTS.tts.utils.text.symbols import _bos, _eos, _punctuations, make_symbols, phonemes, symbols

 # pylint: disable=unnecessary-comprehension

@@ -39,6 +40,11 @@ def text2phone(text, language):
     if language == "zh-CN":
         ph = chinese_text_to_phonemes(text)
         return ph
+
+    if language == "ja-jp":
+        ph = japanese_text_to_phonemes(text)
+        return ph

     raise ValueError(f" [!] Language {language} is not supported for phonemization.")

@@ -1,18 +1,6 @@
-"""
-Cleaners are transformations that run over the input text at both training and eval time.
-
-Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
-hyperparameter. Some cleaners are English-specific. You'll typically want to use:
-  1. "english_cleaners" for English text
-  2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
-     the Unidecode library (https://pypi.python.org/pypi/Unidecode)
-  3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
-     the symbols in symbols.py to match your data).
-"""
-
 import re

-from unidecode import unidecode
+from anyascii import anyascii

 from TTS.tts.utils.text.chinese_mandarin.numbers import replace_numbers_to_characters_in_text

@@ -47,7 +35,7 @@ def collapse_whitespace(text):


 def convert_to_ascii(text):
-    return unidecode(text)
+    return anyascii(text)


 def remove_aux_symbols(text):

@@ -0,0 +1,380 @@
+# Convert Japanese text to phonemes which is
+# compatible with Julius https://github.com/julius-speech/segmentation-kit
+
+import re
+
+import MeCab
+
+_CONVRULES = [
+    # Conversion of 2 letters
+    "アァ/ a a",
+    "イィ/ i i",
+    "イェ/ i e",
+    "イャ/ y a",
+    "ウゥ/ u:",
+    "エェ/ e e",
+    "オォ/ o:",
+    "カァ/ k a:",
+    "キィ/ k i:",
+    "クゥ/ k u:",
+    "クャ/ ky a",
+    "クュ/ ky u",
+    "クョ/ ky o",
+    "ケェ/ k e:",
+    "コォ/ k o:",
+    "ガァ/ g a:",
+    "ギィ/ g i:",
+    "グゥ/ g u:",
+    "グャ/ gy a",
+    "グュ/ gy u",
+    "グョ/ gy o",
+    "ゲェ/ g e:",
+    "ゴォ/ g o:",
+    "サァ/ s a:",
+    "シィ/ sh i:",
+    "スゥ/ s u:",
+    "スャ/ sh a",
+    "スュ/ sh u",
+    "スョ/ sh o",
+    "セェ/ s e:",
+    "ソォ/ s o:",
+    "ザァ/ z a:",
+    "ジィ/ j i:",
+    "ズゥ/ z u:",
+    "ズャ/ zy a",
+    "ズュ/ zy u",
+    "ズョ/ zy o",
+    "ゼェ/ z e:",
+    "ゾォ/ z o:",
+    "タァ/ t a:",
+    "チィ/ ch i:",
+    "ツァ/ ts a",
+    "ツィ/ ts i",
+    "ツゥ/ ts u:",
+    "ツャ/ ch a",
+    "ツュ/ ch u",
+    "ツョ/ ch o",
+    "ツェ/ ts e",
+    "ツォ/ ts o",
+    "テェ/ t e:",
+    "トォ/ t o:",
+    "ダァ/ d a:",
+    "ヂィ/ j i:",
+    "ヅゥ/ d u:",
+    "ヅャ/ zy a",
+    "ヅュ/ zy u",
+    "ヅョ/ zy o",
+    "デェ/ d e:",
+    "ドォ/ d o:",
+    "ナァ/ n a:",
+    "ニィ/ n i:",
+    "ヌゥ/ n u:",
+    "ヌャ/ ny a",
+    "ヌュ/ ny u",
+    "ヌョ/ ny o",
+    "ネェ/ n e:",
+    "ノォ/ n o:",
+    "ハァ/ h a:",
+    "ヒィ/ h i:",
+    "フゥ/ f u:",
+    "フャ/ hy a",
+    "フュ/ hy u",
+    "フョ/ hy o",
+    "ヘェ/ h e:",
+    "ホォ/ h o:",
+    "バァ/ b a:",
+    "ビィ/ b i:",
+    "ブゥ/ b u:",
+    "フャ/ hy a",
+    "ブュ/ by u",
+    "フョ/ hy o",
+    "ベェ/ b e:",
+    "ボォ/ b o:",
+    "パァ/ p a:",
+    "ピィ/ p i:",
+    "プゥ/ p u:",
+    "プャ/ py a",
+    "プュ/ py u",
+    "プョ/ py o",
+    "ペェ/ p e:",
+    "ポォ/ p o:",
+    "マァ/ m a:",
+    "ミィ/ m i:",
+    "ムゥ/ m u:",
+    "ムャ/ my a",
+    "ムュ/ my u",
+    "ムョ/ my o",
+    "メェ/ m e:",
+    "モォ/ m o:",
+    "ヤァ/ y a:",
+    "ユゥ/ y u:",
+    "ユャ/ y a:",
+    "ユュ/ y u:",
+    "ユョ/ y o:",
+    "ヨォ/ y o:",
+    "ラァ/ r a:",
+    "リィ/ r i:",
+    "ルゥ/ r u:",
+    "ルャ/ ry a",
+    "ルュ/ ry u",
+    "ルョ/ ry o",
+    "レェ/ r e:",
+    "ロォ/ r o:",
+    "ワァ/ w a:",
+    "ヲォ/ o:",
+    "ディ/ d i",
+    "デェ/ d e:",
+    "デャ/ dy a",
+    "デュ/ dy u",
+    "デョ/ dy o",
+    "ティ/ t i",
+    "テェ/ t e:",
+    "テャ/ ty a",
+    "テュ/ ty u",
+    "テョ/ ty o",
+    "スィ/ s i",
+    "ズァ/ z u a",
+    "ズィ/ z i",
+    "ズゥ/ z u",
+    "ズャ/ zy a",
+    "ズュ/ zy u",
+    "ズョ/ zy o",
+    "ズェ/ z e",
+    "ズォ/ z o",
+    "キャ/ ky a",
+    "キュ/ ky u",
+    "キョ/ ky o",
+    "シャ/ sh a",
+    "シュ/ sh u",
+    "シェ/ sh e",
+    "ショ/ sh o",
+    "チャ/ ch a",
+    "チュ/ ch u",
+    "チェ/ ch e",
+    "チョ/ ch o",
+    "トゥ/ t u",
+    "トャ/ ty a",
+    "トュ/ ty u",
+    "トョ/ ty o",
+    "ドァ/ d o a",
+    "ドゥ/ d u",
+    "ドャ/ dy a",
+    "ドュ/ dy u",
+    "ドョ/ dy o",
+    "ドォ/ d o:",
+    "ニャ/ ny a",
+    "ニュ/ ny u",
+    "ニョ/ ny o",
+    "ヒャ/ hy a",
+    "ヒュ/ hy u",
+    "ヒョ/ hy o",
+    "ミャ/ my a",
+    "ミュ/ my u",
+    "ミョ/ my o",
+    "リャ/ ry a",
+    "リュ/ ry u",
+    "リョ/ ry o",
+    "ギャ/ gy a",
+    "ギュ/ gy u",
+    "ギョ/ gy o",
+    "ヂェ/ j e",
+    "ヂャ/ j a",
+    "ヂュ/ j u",
+    "ヂョ/ j o",
+    "ジェ/ j e",
+    "ジャ/ j a",
+    "ジュ/ j u",
+    "ジョ/ j o",
+    "ビャ/ by a",
+    "ビュ/ by u",
+    "ビョ/ by o",
+    "ピャ/ py a",
+    "ピュ/ py u",
+    "ピョ/ py o",
+    "ウァ/ u a",
+    "ウィ/ w i",
+    "ウェ/ w e",
+    "ウォ/ w o",
+    "ファ/ f a",
+    "フィ/ f i",
+    "フゥ/ f u",
+    "フャ/ hy a",
+    "フュ/ hy u",
+    "フョ/ hy o",
+    "フェ/ f e",
+    "フォ/ f o",
+    "ヴァ/ b a",
+    "ヴィ/ b i",
+    "ヴェ/ b e",
+    "ヴォ/ b o",
+    "ヴュ/ by u",
+    # Conversion of 1 letter
+    "ア/ a",
+    "イ/ i",
+    "ウ/ u",
+    "エ/ e",
+    "オ/ o",
+    "カ/ k a",
+    "キ/ k i",
+    "ク/ k u",
+    "ケ/ k e",
+    "コ/ k o",
+    "サ/ s a",
+    "シ/ sh i",
+    "ス/ s u",
+    "セ/ s e",
+    "ソ/ s o",
+    "タ/ t a",
+    "チ/ ch i",
+    "ツ/ ts u",
+    "テ/ t e",
+    "ト/ t o",
+    "ナ/ n a",
+    "ニ/ n i",
+    "ヌ/ n u",
+    "ネ/ n e",
+    "ノ/ n o",
+    "ハ/ h a",
+    "ヒ/ h i",
+    "フ/ f u",
+    "ヘ/ h e",
+    "ホ/ h o",
+    "マ/ m a",
+    "ミ/ m i",
+    "ム/ m u",
+    "メ/ m e",
+    "モ/ m o",
+    "ラ/ r a",
+    "リ/ r i",
+    "ル/ r u",
+    "レ/ r e",
+    "ロ/ r o",
+    "ガ/ g a",
+    "ギ/ g i",
+    "グ/ g u",
+    "ゲ/ g e",
+    "ゴ/ g o",
+    "ザ/ z a",
+    "ジ/ j i",
+    "ズ/ z u",
+    "ゼ/ z e",
+    "ゾ/ z o",
+    "ダ/ d a",
+    "ヂ/ j i",
+    "ヅ/ z u",
+    "デ/ d e",
+    "ド/ d o",
+    "バ/ b a",
+    "ビ/ b i",
+    "ブ/ b u",
+    "ベ/ b e",
+    "ボ/ b o",
+    "パ/ p a",
+    "ピ/ p i",
+    "プ/ p u",
+    "ペ/ p e",
+    "ポ/ p o",
+    "ヤ/ y a",
+    "ユ/ y u",
+    "ヨ/ y o",
+    "ワ/ w a",
+    "ヰ/ i",
+    "ヱ/ e",
+    "ヲ/ o",
+    "ン/ N",
+    "ッ/ q",
+    "ヴ/ b u",
+    "ー/:",
+    # Try converting broken text
+    "ァ/ a",
+    "ィ/ i",
+    "ゥ/ u",
+    "ェ/ e",
+    "ォ/ o",
+    "ヮ/ w a",
+    "ォ/ o",
+    # Symbols
+    "、/ ,",
+    "。/ .",
+    "!/ !",
+    "?/ ?",
+    "・/ ,",
+]
+
+_COLON_RX = re.compile(":+")
+_REJECT_RX = re.compile("[^ a-zA-Z:,.?]")
+
+
+def _makerulemap():
+    l = [tuple(x.split("/")) for x in _CONVRULES]
+    return tuple({k: v for k, v in l if len(k) == i} for i in (1, 2))
+
+
+_RULEMAP1, _RULEMAP2 = _makerulemap()
+
+
+def kata2phoneme(text: str) -> str:
+    """Convert katakana text to phonemes."""
+    text = text.strip()
+    res = ""
+    while text:
+        if len(text) >= 2:
+            x = _RULEMAP2.get(text[:2])
+            if x is not None:
+                text = text[2:]
+                res += x
+                continue
+        x = _RULEMAP1.get(text[0])
+        if x is not None:
+            text = text[1:]
+            res += x
+            continue
+        res += " " + text[0]
+        text = text[1:]
+    res = _COLON_RX.sub(":", res)
+    return res[1:]
+
+
+_KATAKANA = "".join(chr(ch) for ch in range(ord("ァ"), ord("ン") + 1))
+_HIRAGANA = "".join(chr(ch) for ch in range(ord("ぁ"), ord("ん") + 1))
+_HIRA2KATATRANS = str.maketrans(_HIRAGANA, _KATAKANA)
+
+
+def hira2kata(text: str) -> str:
+    text = text.translate(_HIRA2KATATRANS)
+    return text.replace("う゛", "ヴ")
+
+
+_SYMBOL_TOKENS = set(list("・、。?!"))
+_NO_YOMI_TOKENS = set(list("「」『』―()[][] …"))
+_TAGGER = MeCab.Tagger()
+
+
+def text2kata(text: str) -> str:
+    parsed = _TAGGER.parse(text)
+    res = []
+    for line in parsed.split("\n"):
+        if line == "EOS":
+            break
+        parts = line.split("\t")
+
+        word, yomi = parts[0], parts[1]
+        if yomi:
+            res.append(yomi)
+        else:
+            if word in _SYMBOL_TOKENS:
+                res.append(word)
+            elif word in ("っ", "ッ"):
+                res.append("ッ")
+            elif word in _NO_YOMI_TOKENS:
+                pass
+            else:
+                res.append(word)
+    return hira2kata("".join(res))
+
+
+def japanese_text_to_phonemes(text: str) -> str:
+    """Convert Japanese text to phonemes."""
+    res = text2kata(text)
+    res = kata2phoneme(res)
+    return res.replace(" ", "")

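Aside: a small usage sketch for the new phonemizer module. kata2phoneme is a deterministic rule lookup on katakana, while japanese_text_to_phonemes additionally runs MeCab, so it needs the mecab-python3 and unidic-lite dependencies added further down:

    from TTS.tts.utils.text.japanese.phonemizer import japanese_text_to_phonemes, kata2phoneme

    # pure table lookup; the long-vowel mark maps to ":"
    assert kata2phoneme("トーキョー") == "t o: ky o:"

    # full pipeline: MeCab reading -> katakana -> phonemes, with spaces stripped
    print(japanese_text_to_phonemes("こんにちは"))
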
@@ -152,6 +152,7 @@ def process_args(args):
     experiment_path = create_experiment_folder(config.output_path, config.run_name, args.debug)
     audio_path = os.path.join(experiment_path, "test_audios")
     # setup rank 0 process in distributed training
+    tb_logger = None
     if args.rank == 0:
         os.makedirs(audio_path, exist_ok=True)
         new_fields = {}

@@ -149,7 +149,7 @@ class ModelManager(object):
     def _download_zip_file(file_url, output):
         """Download the github releases"""
         r = requests.get(file_url)
-        z = zipfile.ZipFile(io.BytesIO(r.content))
+        with zipfile.ZipFile(io.BytesIO(r.content)) as z:
             z.extractall(output)
         for file_path in z.namelist()[1:]:
             src_path = os.path.join(output, file_path)

@@ -0,0 +1,23 @@
+#!/bin/bash
+# take the scripts's parent's directory to prefix all the output paths.
+RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
+CORPUS=kokoro-speech-v1_1-small
+echo $RUN_DIR
+if [ \! -d $RUN_DIR/$CORPUS ] ; then
+  echo "$RUN_DIR/$CORPUS doesn't exist."
+  echo "Follow the instruction of https://github.com/kaiidams/Kokoro-Speech-Dataset to make the corpus."
+  exit 1
+fi
+# create train-val splits
+shuf $RUN_DIR/$CORPUS/metadata.csv > $RUN_DIR/$CORPUS/metadata_shuf.csv
+head -n 8000 $RUN_DIR/$CORPUS/metadata_shuf.csv > $RUN_DIR/$CORPUS/metadata_train.csv
+tail -n 812 $RUN_DIR/$CORPUS/metadata_shuf.csv > $RUN_DIR/$CORPUS/metadata_val.csv
+# compute dataset mean and variance for normalization
+python TTS/bin/compute_statistics.py $RUN_DIR/tacotron2-DDC.json $RUN_DIR/scale_stats.npy --data_path $RUN_DIR/$CORPUS/wavs/
+# training ....
+# change the GPU id if needed
+CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tacotron.py --config_path $RUN_DIR/tacotron2-DDC.json \
+    --coqpit.output_path $RUN_DIR \
+    --coqpit.datasets.0.path $RUN_DIR/$CORPUS \
+    --coqpit.audio.stats_path $RUN_DIR/scale_stats.npy \
+    --coqpit.phoneme_cache_path $RUN_DIR/phoneme_cache \

@@ -0,0 +1,125 @@
+{
+    "datasets": [
+        {
+            "name": "kokoro",
+            "path": "DEFINE THIS",
+            "meta_file_train": "metadata.csv",
+            "meta_file_val": null
+        }
+    ],
+    "audio": {
+        "fft_size": 1024,
+        "win_length": 1024,
+        "hop_length": 256,
+        "frame_length_ms": null,
+        "frame_shift_ms": null,
+        "sample_rate": 22050,
+        "preemphasis": 0.0,
+        "ref_level_db": 20,
+        "do_trim_silence": true,
+        "trim_db": 60,
+        "power": 1.5,
+        "griffin_lim_iters": 60,
+        "num_mels": 80,
+        "mel_fmin": 50.0,
+        "mel_fmax": 7600.0,
+        "spec_gain": 1,
+        "signal_norm": true,
+        "min_level_db": -100,
+        "symmetric_norm": true,
+        "max_norm": 4.0,
+        "clip_norm": true,
+        "stats_path": "scale_stats.npy"
+    },
+    "gst":{
+        "gst_style_input": null,
+        "gst_embedding_dim": 512,
+        "gst_num_heads": 4,
+        "gst_style_tokens": 10,
+        "gst_use_speaker_embedding": false
+    },
+    "model": "Tacotron2",
+    "run_name": "kokoro-ddc",
+    "run_description": "tacotron2 with DDC and differential spectral loss.",
+    "batch_size": 32,
+    "eval_batch_size": 16,
+    "mixed_precision": true,
+    "distributed": {
+        "backend": "nccl",
+        "url": "tcp:\/\/localhost:54321"
+    },
+    "reinit_layers": [],
+    "loss_masking": true,
+    "decoder_loss_alpha": 0.5,
+    "postnet_loss_alpha": 0.25,
+    "postnet_diff_spec_alpha": 0.25,
+    "decoder_diff_spec_alpha": 0.25,
+    "decoder_ssim_alpha": 0.5,
+    "postnet_ssim_alpha": 0.25,
+    "ga_alpha": 5.0,
+    "stopnet_pos_weight": 15.0,
+    "run_eval": true,
+    "test_delay_epochs": 10,
+    "test_sentences_file": null,
+    "noam_schedule": false,
+    "grad_clip": 1.0,
+    "epochs": 1000,
+    "lr": 0.0001,
+    "wd": 0.000001,
+    "warmup_steps": 4000,
+    "seq_len_norm": false,
+    "memory_size": -1,
+    "prenet_type": "original",
+    "prenet_dropout": true,
+    "attention_type": "original",
+    "windowing": false,
+    "use_forward_attn": false,
+    "forward_attn_mask": false,
+    "transition_agent": false,
+    "location_attn": true,
+    "bidirectional_decoder": false,
+    "double_decoder_consistency": true,
+    "ddc_r": 7,
+    "attention_heads": 4,
+    "attention_norm": "sigmoid",
+    "r": 7,
+    "gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]],
+    "stopnet": true,
+    "separate_stopnet": true,
+    "print_step": 25,
+    "tb_plot_step": 100,
+    "print_eval": false,
+    "save_step": 10000,
+    "checkpoint": true,
+    "keep_all_best": false,
+    "keep_after": 10000,
+    "tb_model_param_stats": false,
+    "text_cleaner": "basic_cleaners",
+    "enable_eos_bos_chars": false,
+    "num_loader_workers": 4,
+    "num_val_loader_workers": 4,
+    "batch_group_size": 4,
+    "min_seq_len": 6,
+    "max_seq_len": 153,
+    "compute_input_seq_cache": false,
+    "use_noise_augment": true,
+    "output_path": "DEFINE THIS",
+    "phoneme_cache_path": "DEFINE THIS",
+    "use_phonemes": true,
+    "phoneme_language": "ja-jp",
+    "characters": {
+        "pad": "_",
+        "eos": "~",
+        "bos": "^",
+        "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
+        "punctuations": "!'(),-.:;? ",
+        "phonemes": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+    },
+    "use_speaker_embedding": false,
+    "use_gst": false,
+    "use_external_speaker_embedding_file": false,
+    "external_speaker_embedding_file": "../../speakers-vctk-en.json"
+}

@@ -2,4 +2,4 @@ black
 coverage
 isort
 nose
-pylint==2.7.4
+pylint==2.8.3

@@ -17,5 +17,8 @@ torch>=1.7
 tqdm
 numba==0.52
 umap-learn==0.4.6
-unidecode==0.4.20
+anyascii
 coqpit
+# japanese g2p deps
+mecab-python3==1.0.3
+unidic-lite==1.0.8

setup.py
@@ -4,7 +4,6 @@ import os
 import subprocess
 import sys
 from distutils.version import LooseVersion
-from TTS._version import __version__

 import numpy
 import setuptools.command.build_py

@@ -12,82 +11,85 @@ import setuptools.command.develop
 from Cython.Build import cythonize
 from setuptools import Extension, find_packages, setup
 
 
 if LooseVersion(sys.version) < LooseVersion("3.6") or LooseVersion(sys.version) > LooseVersion("3.9"):
-    raise RuntimeError(
-        "TTS requires python >= 3.6 and <3.9 "
-        "but your Python version is {}".format(sys.version)
-    )
+    raise RuntimeError("TTS requires python >= 3.6 and <3.9 " "but your Python version is {}".format(sys.version))
 
 
-version = __version__
 cwd = os.path.dirname(os.path.abspath(__file__))
+with open(os.path.join(cwd, "TTS", "VERSION")) as fin:
+    version = fin.read().strip()
 
 
 class build_py(setuptools.command.build_py.build_py):  # pylint: disable=too-many-ancestors
     def run(self):
-        self.create_version_file()
         setuptools.command.build_py.build_py.run(self)
 
-    @staticmethod
-    def create_version_file():
-        print('-- Building version ' + version)
-        version_path = os.path.join(cwd, 'version.py')
-        with open(version_path, 'w') as f:
-            f.write("__version__ = '{}'\n".format(version))
 
 class develop(setuptools.command.develop.develop):
     def run(self):
-        build_py.create_version_file()
         setuptools.command.develop.develop.run(self)
 
 
 # The documentation for this feature is in server/README.md
-package_data = ['TTS/server/templates/*']
+package_data = ["TTS/server/templates/*"]
 
 
 def pip_install(package_name):
-    subprocess.call([sys.executable, '-m', 'pip', 'install', package_name])
+    subprocess.call([sys.executable, "-m", "pip", "install", package_name])
 
 
-requirements = open(os.path.join(cwd, 'requirements.txt'), 'r').readlines()
-with open(os.path.join(cwd, 'requirements.notebooks.txt'), 'r') as f:
+requirements = open(os.path.join(cwd, "requirements.txt"), "r").readlines()
+with open(os.path.join(cwd, "requirements.notebooks.txt"), "r") as f:
     requirements_notebooks = f.readlines()
-with open(os.path.join(cwd, 'requirements.dev.txt'), 'r') as f:
+with open(os.path.join(cwd, "requirements.dev.txt"), "r") as f:
     requirements_dev = f.readlines()
-with open(os.path.join(cwd, 'requirements.tf.txt'), 'r') as f:
+with open(os.path.join(cwd, "requirements.tf.txt"), "r") as f:
     requirements_tf = f.readlines()
 requirements_all = requirements_dev + requirements_notebooks + requirements_tf
 
-with open('README.md', "r", encoding="utf-8") as readme_file:
+with open("README.md", "r", encoding="utf-8") as readme_file:
     README = readme_file.read()
 
-exts = [Extension(name='TTS.tts.layers.glow_tts.monotonic_align.core',
-                  sources=["TTS/tts/layers/glow_tts/monotonic_align/core.pyx"])]
+exts = [
+    Extension(
+        name="TTS.tts.layers.glow_tts.monotonic_align.core",
+        sources=["TTS/tts/layers/glow_tts/monotonic_align/core.pyx"],
+    )
+]
 setup(
-    name='TTS',
+    name="TTS",
     version=version,
-    url='https://github.com/coqui-ai/TTS',
-    author='Eren Gölge',
-    author_email='egolge@coqui.ai',
-    description='Deep learning for Text to Speech by Coqui.',
+    url="https://github.com/coqui-ai/TTS",
+    author="Eren Gölge",
+    author_email="egolge@coqui.ai",
+    description="Deep learning for Text to Speech by Coqui.",
     long_description=README,
     long_description_content_type="text/markdown",
-    license='MPL-2.0',
+    license="MPL-2.0",
     # cython
     include_dirs=numpy.get_include(),
     ext_modules=cythonize(exts, language_level=3),
     # ext_modules=find_cython_extensions(),
     # package
     include_package_data=True,
-    packages=find_packages(include=['TTS*']),
+    packages=find_packages(include=["TTS*"]),
+    package_data={
+        "TTS": [
+            "VERSION",
+        ]
+    },
     project_urls={
-        'Documentation': 'https://github.com/coqui-ai/TTS/wiki',
-        'Tracker': 'https://github.com/coqui-ai/TTS/issues',
-        'Repository': 'https://github.com/coqui-ai/TTS',
-        'Discussions': 'https://github.com/coqui-ai/TTS/discussions',
+        "Documentation": "https://github.com/coqui-ai/TTS/wiki",
+        "Tracker": "https://github.com/coqui-ai/TTS/issues",
+        "Repository": "https://github.com/coqui-ai/TTS",
+        "Discussions": "https://github.com/coqui-ai/TTS/discussions",
     },
     cmdclass={
-        'build_py': build_py,
-        'develop': develop,
+        "build_py": build_py,
+        "develop": develop,
         # 'build_ext': build_ext
     },
     install_requires=requirements,
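Note on the versioning change above: the generated version.py and the TTS._version import are replaced by a plain TTS/VERSION file that setup.py reads at build time and ships inside the package via package_data. A hedged sketch of how the installed package could resolve its own version under this layout (the actual TTS/__init__.py may do this differently):

import os

def get_version() -> str:
    # TTS/VERSION sits next to the package's __init__.py once installed,
    # because package_data={"TTS": ["VERSION"]} copies it into the wheel.
    here = os.path.dirname(os.path.abspath(__file__))
    with open(os.path.join(here, "VERSION")) as f:
        return f.read().strip()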
@@ -97,30 +99,25 @@ setup(
         "notebooks": requirements_notebooks,
         "tf": requirements_tf,
     },
-    python_requires='>=3.6.0, <3.9',
+    python_requires=">=3.6.0, <3.9",
-    entry_points={
-        'console_scripts': [
-            'tts=TTS.bin.synthesize:main',
-            'tts-server = TTS.server.server:main'
-        ]
-    },
+    entry_points={"console_scripts": ["tts=TTS.bin.synthesize:main", "tts-server = TTS.server.server:main"]},
     classifiers=[
         "Programming Language :: Python",
         "Programming Language :: Python :: 3",
         "Programming Language :: Python :: 3.6",
         "Programming Language :: Python :: 3.7",
         "Programming Language :: Python :: 3.8",
-        'Development Status :: 3 - Alpha',
+        "Development Status :: 3 - Alpha",
         "Intended Audience :: Science/Research",
         "Intended Audience :: Developers",
         "Operating System :: POSIX :: Linux",
-        'License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)',
+        "License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)",
         "Topic :: Software Development",
         "Topic :: Software Development :: Libraries :: Python Modules",
         "Topic :: Multimedia :: Sound/Audio :: Speech",
         "Topic :: Multimedia :: Sound/Audio",
         "Topic :: Multimedia",
-        "Topic :: Scientific/Engineering :: Artificial Intelligence"
+        "Topic :: Scientific/Engineering :: Artificial Intelligence",
     ],
-    zip_safe=False
+    zip_safe=False,
 )
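The collapsed entry_points line keeps the same two console scripts. What a console_scripts entry means, roughly: pip generates a small launcher that imports the target module and calls the named function. A hand-rolled equivalent of the "tts" entry (a sketch, not how pip actually writes launchers):

import sys

from TTS.bin.synthesize import main

if __name__ == "__main__":
    sys.exit(main())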
@@ -6,6 +6,7 @@ from tests import get_tests_input_path
 from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss
 from TTS.speaker_encoder.models.lstm import LSTMSpeakerEncoder
 from TTS.speaker_encoder.models.resnet import ResNetSpeakerEncoder
 
 file_path = get_tests_input_path()
 
 
@@ -39,6 +40,7 @@ class LSTMSpeakerEncoderTests(unittest.TestCase):
         assert output.shape[1] == 256
         assert len(output.shape) == 2
 
 
 class ResNetSpeakerEncoderTests(unittest.TestCase):
     # pylint: disable=R0201
     def test_in_out(self):
@@ -65,6 +67,7 @@ class ResNetSpeakerEncoderTests(unittest.TestCase):
         assert output.shape[1] == 256
         assert len(output.shape) == 2
 
 
 class GE2ELossTests(unittest.TestCase):
     # pylint: disable=R0201
     def test_in_out(self):
@@ -92,6 +95,7 @@ class GE2ELossTests(unittest.TestCase):
         output = loss.forward(dummy_input)
         assert output.item() < 0.005
 
 
 class AngleProtoLossTests(unittest.TestCase):
     # pylint: disable=R0201
     def test_in_out(self):
@@ -121,6 +125,7 @@ class AngleProtoLossTests(unittest.TestCase):
         output = loss.forward(dummy_input)
         assert output.item() < 0.005
 
 
 class SoftmaxAngleProtoLossTests(unittest.TestCase):
     # pylint: disable=R0201
     def test_in_out(self):
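The shape assertions in these tests pin down the speaker encoder contract: a batch of mel-spectrogram frames in, one fixed 256-dimensional embedding per utterance out. A hedged sketch of that contract (constructor arguments and input layout are assumptions, not necessarily the library's exact API):

import torch

from TTS.speaker_encoder.models.lstm import LSTMSpeakerEncoder

# proj_dim=256 matches the embedding size the tests assert on.
model = LSTMSpeakerEncoder(input_dim=80, proj_dim=256, lstm_dim=768, num_lstm_layers=3)
dummy_input = torch.rand(4, 20, 80)  # (batch, time, mel_dim), assumed layout
output = model.forward(dummy_input)
assert output.shape[1] == 256  # one 256-d embedding per utterance
assert len(output.shape) == 2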
@@ -46,7 +46,7 @@ run_cli(command_train)
 shutil.rmtree(continue_path)
 
 # test resnet speaker encoder
-config.model_params['model_name'] = "resnet"
+config.model_params["model_name"] = "resnet"
 config.save_json(config_path)
 
 # train the model for one epoch
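The resnet switch works because the test mutates the in-memory config and rewrites the JSON that the next training invocation loads. A sketch of that round trip, using only the names visible in the diff above:

# Flip the encoder variant, then persist it for the next CLI run.
config.model_params["model_name"] = "resnet"
config.save_json(config_path)  # the next run_cli(command_train) reads this file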
@@ -0,0 +1,24 @@
+import unittest
+
+from TTS.tts.utils.text.japanese.phonemizer import japanese_text_to_phonemes
+
+_TEST_CASES = """
+どちらに行きますか?/dochiraniikimasuka?
+今日は温泉に、行きます。/kyo:waoNseNni,ikimasu.
+「A」から「Z」までです。/AkaraZmadedesu.
+そうですね!/so:desune!
+クジラは哺乳類です。/kujirawahonyu:ruidesu.
+ヴィディオを見ます。/bidioomimasu.
+ky o: w a o N s e N n i , i k i m a s u ./kyo:waoNseNni,ikimasu.
+"""
+
+
+class TestText(unittest.TestCase):
+    def test_japanese_text_to_phonemes(self):
+        for line in _TEST_CASES.strip().split("\n"):
+            text, phone = line.split("/")
+            self.assertEqual(japanese_text_to_phonemes(text), phone)
+
+
+if __name__ == "__main__":
+    unittest.main()
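The test data doubles as documentation of the phonemizer's conventions: "N" is the moraic nasal (oNseN for 温泉), ":" marks a long vowel (kyo:), and already space-separated phoneme input (the last case) is normalized back to the same phoneme string. A quick interactive check through the same API the test exercises:

from TTS.tts.utils.text.japanese.phonemizer import japanese_text_to_phonemes

print(japanese_text_to_phonemes("今日は温泉に、行きます。"))  # kyo:waoNseNni,ikimasu.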
@@ -17,7 +17,7 @@ config = GlowTTSConfig(
     text_cleaner="english_cleaners",
     use_phonemes=True,
     phoneme_language="zh-CN",
-    phoneme_cache_path='tests/data/ljspeech/phoneme_cache/',
+    phoneme_cache_path="tests/data/ljspeech/phoneme_cache/",
     run_eval=True,
     test_delay_epochs=-1,
     epochs=1,
@@ -17,7 +17,7 @@ config = SpeedySpeechConfig(
     text_cleaner="english_cleaners",
     use_phonemes=True,
     phoneme_language="zh-CN",
-    phoneme_cache_path='tests/data/ljspeech/phoneme_cache/',
+    phoneme_cache_path="tests/data/ljspeech/phoneme_cache/",
     run_eval=True,
     test_delay_epochs=-1,
     epochs=1,
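Both training-test configs above share the same phoneme setup; only the quote style changes. For context, a hedged sketch of the pattern they follow (the import path is assumed for this era of the codebase):

from TTS.tts.configs import GlowTTSConfig  # assumed import location

config = GlowTTSConfig(
    text_cleaner="english_cleaners",
    use_phonemes=True,
    phoneme_language="zh-CN",
    phoneme_cache_path="tests/data/ljspeech/phoneme_cache/",  # phonemized text is cached and reused across runs
    run_eval=True,
    test_delay_epochs=-1,
    epochs=1,  # single epoch keeps the smoke test fast
)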
@@ -20,6 +20,7 @@ config = FullbandMelganConfig(
     eval_split_size=1,
     print_step=1,
     print_eval=True,
+    discriminator_model_params={"base_channels": 16, "max_channels": 256, "downsample_factors": [4, 4, 4]},
     data_path="tests/data/ljspeech",
     output_path=output_path,
 )
|
@ -19,6 +19,7 @@ config = MelganConfig(
|
||||||
seq_len=2048,
|
seq_len=2048,
|
||||||
eval_split_size=1,
|
eval_split_size=1,
|
||||||
print_step=1,
|
print_step=1,
|
||||||
|
discriminator_model_params={"base_channels": 16, "max_channels": 256, "downsample_factors": [4, 4, 4]},
|
||||||
print_eval=True,
|
print_eval=True,
|
||||||
data_path="tests/data/ljspeech",
|
data_path="tests/data/ljspeech",
|
||||||
output_path=output_path,
|
output_path=output_path,
|
||||||
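The added discriminator_model_params shrink the MelGAN discriminator so these one-epoch smoke tests stay fast: three strided stages of factor 4 give an overall downsampling of 4 * 4 * 4 = 64, with channel width capped at 256 (smaller than typical defaults, which is an assumption here). The same dict built programmatically, values taken straight from the diff:

import math

downsample_factors = [4, 4, 4]
discriminator_model_params = {
    "base_channels": 16,
    "max_channels": 256,
    "downsample_factors": downsample_factors,
}
# overall temporal downsampling across the discriminator's strided stages
assert math.prod(downsample_factors) == 64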