mirror of https://github.com/coqui-ai/TTS.git
Merge pull request #135 from idiap/mas
Use external package for monotonic alignment search
This commit is contained in:
commit
e18f7da973
|
@ -7,8 +7,7 @@ defaults:
|
|||
shell:
|
||||
bash
|
||||
jobs:
|
||||
build-sdist:
|
||||
name: Build source distribution
|
||||
build:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
@ -20,37 +19,29 @@ jobs:
|
|||
if [[ "v$version" != "$tag" ]]; then
|
||||
exit 1
|
||||
fi
|
||||
- uses: actions/setup-python@v5
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v3
|
||||
with:
|
||||
python-version: 3.9
|
||||
- run: |
|
||||
python -m pip install -U pip setuptools build
|
||||
- run: |
|
||||
python -m build
|
||||
- run: |
|
||||
pip install dist/*.tar.gz
|
||||
version: "0.4.27"
|
||||
enable-cache: true
|
||||
cache-dependency-glob: "**/pyproject.toml"
|
||||
- name: Set up Python
|
||||
run: uv python install 3.12
|
||||
- name: Build sdist and wheel
|
||||
run: uv build
|
||||
- name: Test installation of sdist and wheel
|
||||
run: |
|
||||
uv venv --no-project
|
||||
uv pip install dist/*.tar.gz
|
||||
uv pip install dist/*.whl
|
||||
- uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: build-sdist
|
||||
path: dist/*.tar.gz
|
||||
build-wheels:
|
||||
name: Build wheels on ${{ matrix.os }}
|
||||
runs-on: ${{ matrix.os }}
|
||||
strategy:
|
||||
matrix:
|
||||
os: [ubuntu-latest, windows-latest, macos-latest]
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Build wheels
|
||||
uses: pypa/cibuildwheel@v2.21.1
|
||||
- uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: build-wheels-${{ matrix.os }}
|
||||
path: ./wheelhouse/*.whl
|
||||
name: build
|
||||
path: dist/*
|
||||
publish-artifacts:
|
||||
name: Publish to PyPI
|
||||
runs-on: ubuntu-latest
|
||||
needs: [build-sdist, build-wheels]
|
||||
needs: [build]
|
||||
environment:
|
||||
name: release
|
||||
url: https://pypi.org/p/coqui-tts
|
||||
|
@ -60,8 +51,7 @@ jobs:
|
|||
- uses: actions/download-artifact@v4
|
||||
with:
|
||||
path: dist
|
||||
pattern: build-*
|
||||
merge-multiple: true
|
||||
pattern: build
|
||||
- run: |
|
||||
ls -lh dist/
|
||||
- name: Publish package distributions to PyPI
|
||||
|
|
|
@ -1,75 +0,0 @@
|
|||
TTS code owners / governance system
|
||||
==========================================
|
||||
|
||||
TTS is run under a governance system inspired by (and partially copied from) the `Mozilla module ownership system <https://www.mozilla.org/about/governance/policies/module-ownership/>`_. The project is roughly divided into modules, and each module has its owners, who are responsible for reviewing pull requests and deciding on technical direction for their modules. Module ownership authority is given to people who have worked extensively on areas of the project.
|
||||
|
||||
Module owners also have the authority to name other module owners or appoint module peers, who are people with authority to review pull requests in that module. They can also sub-divide their module into sub-modules with their own owners.
|
||||
|
||||
Module owners are not tyrants. They are chartered to make decisions with input from the community and in the best interest of the community. Module owners are not required to make code changes or additions solely because the community wants them to do so. (Like anyone else, the module owners may write code because they want to, because their employers want them to, because the community wants them to, or for some other reason.) Module owners do need to pay attention to patches submitted to that module. However, “pay attention” does not mean agreeing to every patch. Some patches may not make sense for the TTS project; some may be poorly implemented. Module owners have the authority to decline a patch; this is a necessary part of the role. We ask the module owners to describe in the relevant issue their reasons for wanting changes to a patch, for declining it altogether, or for postponing review for some period. We don’t ask or expect them to rewrite patches to make them acceptable. Similarly, module owners may need to delay review of a promising patch due to an upcoming deadline. For example, a patch may be of interest, but not for the next milestone. In such a case it may make sense for the module owner to postpone review of a patch until after matters needed for a milestone have been finalized. Again, we expect this to be described in the relevant issue. And of course, it shouldn’t go on very often or for very long or escalation and review is likely.
|
||||
|
||||
The work of the various module owners and peers is overseen by the global owners, who are responsible for making final decisions in case of conflict between owners, as well as for setting the direction for the project as a whole.
|
||||
|
||||
This file describes module owners who are active on the project and which parts of the code they have expertise in (and interest in). If you're making changes to the code and are wondering who's an appropriate person to talk to, this list will tell you who to ping.
|
||||
|
||||
There's overlap in the areas of expertise of each owner, and in particular when looking at which files are covered by each area, there is a lot of overlap. Don't worry about getting it exactly right when requesting review; any code owner will be happy to redirect the request to a more appropriate person.
|
||||
|
||||
Global owners
|
||||
----------------
|
||||
|
||||
These are people who have worked on the project extensively and are familiar with all or most parts of it. Their expertise and review guidance are trusted by other code owners to cover their own areas of expertise. In case of conflicting opinions from other owners, global owners will make a final decision.
|
||||
|
||||
- Eren Gölge (@erogol)
|
||||
- Reuben Morais (@reuben)
|
||||
|
||||
Training, feeding
|
||||
-----------------
|
||||
|
||||
- Eren Gölge (@erogol)
|
||||
|
||||
Model exporting
|
||||
---------------
|
||||
|
||||
- Eren Gölge (@erogol)
|
||||
|
||||
Multi-Speaker TTS
|
||||
-----------------
|
||||
|
||||
- Eren Gölge (@erogol)
|
||||
- Edresson Casanova (@edresson)
|
||||
|
||||
TTS
|
||||
---
|
||||
|
||||
- Eren Gölge (@erogol)
|
||||
|
||||
Vocoders
|
||||
--------
|
||||
|
||||
- Eren Gölge (@erogol)
|
||||
|
||||
Speaker Encoder
|
||||
---------------
|
||||
|
||||
- Eren Gölge (@erogol)
|
||||
|
||||
Testing & CI
|
||||
------------
|
||||
|
||||
- Eren Gölge (@erogol)
|
||||
- Reuben Morais (@reuben)
|
||||
|
||||
Python bindings
|
||||
---------------
|
||||
|
||||
- Eren Gölge (@erogol)
|
||||
- Reuben Morais (@reuben)
|
||||
|
||||
Documentation
|
||||
-------------
|
||||
|
||||
- Eren Gölge (@erogol)
|
||||
|
||||
Third party bindings
|
||||
--------------------
|
||||
|
||||
Owned by the author.
|
10
MANIFEST.in
10
MANIFEST.in
|
@ -1,10 +0,0 @@
|
|||
include README.md
|
||||
include LICENSE.txt
|
||||
include *.cff
|
||||
recursive-include TTS *.json
|
||||
recursive-include TTS *.html
|
||||
recursive-include TTS *.png
|
||||
recursive-include TTS *.md
|
||||
recursive-include TTS *.py
|
||||
recursive-include TTS *.pyx
|
||||
recursive-include images *.png
|
|
@ -5,6 +5,7 @@ from typing import Callable, Dict, Tuple
|
|||
import torch
|
||||
import torch.nn.functional as F
|
||||
from coqpit import Coqpit
|
||||
from monotonic_alignment_search import maximum_path
|
||||
from torch import nn
|
||||
|
||||
from TTS.tts.layers.delightful_tts.conformer import Conformer
|
||||
|
@ -19,7 +20,7 @@ from TTS.tts.layers.delightful_tts.phoneme_prosody_predictor import PhonemeProso
|
|||
from TTS.tts.layers.delightful_tts.pitch_adaptor import PitchAdaptor
|
||||
from TTS.tts.layers.delightful_tts.variance_predictor import VariancePredictor
|
||||
from TTS.tts.layers.generic.aligner import AlignmentNetwork
|
||||
from TTS.tts.utils.helpers import generate_path, maximum_path, sequence_mask
|
||||
from TTS.tts.utils.helpers import generate_path, sequence_mask
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
|
|
@ -3,6 +3,7 @@ from typing import Dict, List, Union
|
|||
|
||||
import torch
|
||||
from coqpit import Coqpit
|
||||
from monotonic_alignment_search import maximum_path
|
||||
from torch import nn
|
||||
from trainer.io import load_fsspec
|
||||
|
||||
|
@ -12,7 +13,7 @@ from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor
|
|||
from TTS.tts.layers.feed_forward.encoder import Encoder
|
||||
from TTS.tts.layers.generic.pos_encoding import PositionalEncoding
|
||||
from TTS.tts.models.base_tts import BaseTTS
|
||||
from TTS.tts.utils.helpers import generate_path, maximum_path, sequence_mask
|
||||
from TTS.tts.utils.helpers import generate_path, sequence_mask
|
||||
from TTS.tts.utils.speakers import SpeakerManager
|
||||
from TTS.tts.utils.text.tokenizer import TTSTokenizer
|
||||
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
|
||||
|
|
|
@ -4,6 +4,7 @@ from typing import Dict, List, Tuple, Union
|
|||
|
||||
import torch
|
||||
from coqpit import Coqpit
|
||||
from monotonic_alignment_search import maximum_path
|
||||
from torch import nn
|
||||
from torch.cuda.amp.autocast_mode import autocast
|
||||
from trainer.io import load_fsspec
|
||||
|
@ -14,7 +15,7 @@ from TTS.tts.layers.generic.aligner import AlignmentNetwork
|
|||
from TTS.tts.layers.generic.pos_encoding import PositionalEncoding
|
||||
from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor
|
||||
from TTS.tts.models.base_tts import BaseTTS
|
||||
from TTS.tts.utils.helpers import average_over_durations, generate_path, maximum_path, sequence_mask
|
||||
from TTS.tts.utils.helpers import average_over_durations, generate_path, sequence_mask
|
||||
from TTS.tts.utils.speakers import SpeakerManager
|
||||
from TTS.tts.utils.text.tokenizer import TTSTokenizer
|
||||
from TTS.tts.utils.visual import plot_alignment, plot_avg_energy, plot_avg_pitch, plot_spectrogram
|
||||
|
|
|
@ -4,6 +4,7 @@ from typing import Dict, List, Tuple, Union
|
|||
|
||||
import torch
|
||||
from coqpit import Coqpit
|
||||
from monotonic_alignment_search import maximum_path
|
||||
from torch import nn
|
||||
from torch.cuda.amp.autocast_mode import autocast
|
||||
from torch.nn import functional as F
|
||||
|
@ -13,7 +14,7 @@ from TTS.tts.configs.glow_tts_config import GlowTTSConfig
|
|||
from TTS.tts.layers.glow_tts.decoder import Decoder
|
||||
from TTS.tts.layers.glow_tts.encoder import Encoder
|
||||
from TTS.tts.models.base_tts import BaseTTS
|
||||
from TTS.tts.utils.helpers import generate_path, maximum_path, sequence_mask
|
||||
from TTS.tts.utils.helpers import generate_path, sequence_mask
|
||||
from TTS.tts.utils.speakers import SpeakerManager
|
||||
from TTS.tts.utils.synthesis import synthesis
|
||||
from TTS.tts.utils.text.tokenizer import TTSTokenizer
|
||||
|
|
|
@ -11,6 +11,7 @@ import torch.distributed as dist
|
|||
import torchaudio
|
||||
from coqpit import Coqpit
|
||||
from librosa.filters import mel as librosa_mel_fn
|
||||
from monotonic_alignment_search import maximum_path
|
||||
from torch import nn
|
||||
from torch.cuda.amp.autocast_mode import autocast
|
||||
from torch.nn import functional as F
|
||||
|
@ -28,7 +29,7 @@ from TTS.tts.layers.vits.networks import PosteriorEncoder, ResidualCouplingBlock
|
|||
from TTS.tts.layers.vits.stochastic_duration_predictor import StochasticDurationPredictor
|
||||
from TTS.tts.models.base_tts import BaseTTS
|
||||
from TTS.tts.utils.fairseq import rehash_fairseq_vits_checkpoint
|
||||
from TTS.tts.utils.helpers import generate_path, maximum_path, rand_segments, segment, sequence_mask
|
||||
from TTS.tts.utils.helpers import generate_path, rand_segments, segment, sequence_mask
|
||||
from TTS.tts.utils.languages import LanguageManager
|
||||
from TTS.tts.utils.speakers import SpeakerManager
|
||||
from TTS.tts.utils.synthesis import synthesis
|
||||
|
|
|
@ -3,13 +3,6 @@ import torch
|
|||
from scipy.stats import betabinom
|
||||
from torch.nn import functional as F
|
||||
|
||||
try:
|
||||
from TTS.tts.utils.monotonic_align.core import maximum_path_c
|
||||
|
||||
CYTHON = True
|
||||
except ModuleNotFoundError:
|
||||
CYTHON = False
|
||||
|
||||
|
||||
class StandardScaler:
|
||||
"""StandardScaler for mean-scale normalization with the given mean and scale values."""
|
||||
|
@ -168,73 +161,6 @@ def generate_path(duration, mask):
|
|||
return path
|
||||
|
||||
|
||||
def maximum_path(value, mask):
    """Run monotonic alignment search with the fastest available backend.

    Uses ``maximum_path_cython`` when the module-level ``CYTHON`` flag is set
    and ``maximum_path_numpy`` otherwise. Both arguments are forwarded as-is.
    """
    # The conditional expression only evaluates the selected name, so the
    # Cython wrapper is never touched when the extension is unavailable.
    search = maximum_path_cython if CYTHON else maximum_path_numpy
    return search(value, mask)
|
||||
|
||||
|
||||
def maximum_path_cython(value, mask):
    """Cython optimised version.

    Masks ``value``, hands it to the compiled ``maximum_path_c`` kernel and
    returns the path the kernel wrote.

    Shapes:
        - value: :math:`[B, T_en, T_de]`
        - mask: :math:`[B, T_en, T_de]`

    Returns:
        Tensor with the shape, device and dtype of ``value * mask``.
    """
    value = value * mask
    device = value.device
    dtype = value.dtype

    # ``astype`` copies by default, so the kernel can overwrite this buffer
    # without touching the caller's tensor. ``order="C"`` guarantees the
    # C-contiguous layout the kernel's typed buffers require, even when the
    # incoming tensor is a transposed / strided view.
    value = value.detach().cpu().numpy().astype(np.float32, order="C")
    path = np.zeros(value.shape, dtype=np.int32)
    mask = mask.detach().cpu().numpy()

    # Per-item valid lengths: mask summed over the T_en axis (resp. T_de axis),
    # read at the first position of the remaining axis.
    t_x_max = mask.sum(1)[:, 0].astype(np.int32)
    t_y_max = mask.sum(2)[:, 0].astype(np.int32)
    maximum_path_c(path, value, t_x_max, t_y_max)
    return torch.from_numpy(path).to(device=device, dtype=dtype)
|
||||
|
||||
|
||||
def maximum_path_numpy(value, mask, max_neg_val=None):
    """Monotonic alignment search algorithm, NumPy implementation.

    Noted by the original author to be about 4 times faster than the torch
    version.

    Shapes:
        - value: :math:`[B, T_x, T_y]`
        - mask: :math:`[B, T_x, T_y]`

    Args:
        value: Score tensor.
        mask: Tensor whose non-zero entries mark valid positions.
        max_neg_val: Score assigned to unreachable cells. Defaults to ``-inf``.

    Returns:
        Tensor of zeros and ones with the shape, device and dtype of
        ``value * mask``.
    """
    if max_neg_val is None:
        max_neg_val = -np.inf  # Patch for Sphinx complaint
    value = value * mask

    device = value.device
    dtype = value.dtype
    # NumPy cannot represent bfloat16, so ``Tensor.numpy()`` would raise for
    # such inputs. Upcasting half-precision values to float32 is lossless and
    # leaves the result for every other dtype unchanged.
    if dtype in (torch.float16, torch.bfloat16):
        value = value.float()
    value = value.detach().cpu().numpy()
    # Converting to bool on the torch side (non-zero -> True) is equivalent to
    # ``.numpy().astype(bool)`` and also works for bfloat16 masks.
    mask = mask.detach().cpu().bool().numpy()

    b, t_x, t_y = value.shape
    direction = np.zeros(value.shape, dtype=np.int64)
    v = np.zeros((b, t_x), dtype=np.float32)
    x_range = np.arange(t_x, dtype=np.float32).reshape(1, -1)

    # Forward pass: accumulate the best score reaching each row at column j
    # and record whether it came from the same row (1) or the row above (0).
    for j in range(t_y):
        # Scores shifted down by one row; the first row gets ``max_neg_val``.
        v0 = np.pad(v, [[0, 0], [1, 0]], mode="constant", constant_values=max_neg_val)[:, :-1]
        v1 = v
        max_mask = v1 >= v0
        v_max = np.where(max_mask, v1, v0)
        direction[:, :, j] = max_mask

        # Row x cannot be reached before column x.
        index_mask = x_range <= j
        v = np.where(index_mask, v_max + value[:, :, j], max_neg_val)
    direction = np.where(mask, direction, 1)

    # Backward pass: start from the last valid row of every batch item and
    # follow the recorded directions from the last column to the first.
    path = np.zeros(value.shape, dtype=np.float32)
    index = mask[:, :, 0].sum(1).astype(np.int64) - 1
    index_range = np.arange(b)
    for j in reversed(range(t_y)):
        path[index_range, index, j] = 1
        index = index + direction[index_range, index, j] - 1
    path = path * mask.astype(np.float32)
    path = torch.from_numpy(path).to(device=device, dtype=dtype)
    return path
|
||||
|
||||
|
||||
def beta_binomial_prior_distribution(phoneme_count, mel_count, scaling_factor=1.0):
|
||||
P, M = phoneme_count, mel_count
|
||||
x = np.arange(0, P)
|
||||
|
|
|
@ -1,47 +0,0 @@
|
|||
import numpy as np
|
||||
|
||||
cimport cython
|
||||
cimport numpy as np
|
||||
|
||||
from cython.parallel import prange
|
||||
|
||||
|
||||
@cython.boundscheck(False)
@cython.wraparound(False)
cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_x, int t_y, float max_neg_val) noexcept nogil:
    # Monotonic alignment search for a single item.
    #
    # ``value`` is overwritten in place with cumulative scores; ``path`` gets a
    # 1 written at the selected row of every column in ``[0, t_y)``.
    # ``noexcept`` lets the function be called without the GIL and without
    # per-call exception checks (nothing in here can raise).
    cdef int x
    cdef int y
    cdef float v_prev
    cdef float v_cur
    cdef int index = t_x - 1

    # With no valid rows ``index`` would start at -1 and, because bounds
    # checking and wraparound are disabled, ``path[index, y]`` would write
    # outside the buffer.
    if t_x <= 0:
        return

    # Forward pass: best cumulative score for every reachable cell.
    for y in range(t_y):
        for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)):
            # Score when staying on the same row (impossible on the diagonal).
            if x == y:
                v_cur = max_neg_val
            else:
                v_cur = value[x, y-1]
            # Score when arriving from the previous row.
            if x == 0:
                if y == 0:
                    v_prev = 0.
                else:
                    v_prev = max_neg_val
            else:
                v_prev = value[x-1, y-1]
            value[x, y] = max(v_cur, v_prev) + value[x, y]

    # Backward pass: walk from the last column to the first, moving up one row
    # whenever the predecessor on the previous row has the higher score.
    for y in range(t_y - 1, -1, -1):
        path[index, y] = 1
        # ``y != 0`` keeps ``value[..., y-1]`` from indexing column -1; the
        # decision at the first column is never used afterwards.
        if y != 0 and index != 0 and (index == y or value[index, y-1] < value[index-1, y-1]):
            index = index - 1
|
||||
|
||||
|
||||
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_xs, int[::1] t_ys, float max_neg_val=-1e9) nogil:
    # Batched entry point: runs ``maximum_path_each`` once per item along the
    # first axis of ``values``, passing that item's slice of ``paths`` and
    # ``values`` together with its ``t_xs`` / ``t_ys`` entry.
    cdef int batch_size = values.shape[0]
    cdef int item

    # Items are processed in a ``prange`` loop with the GIL released.
    for item in prange(batch_size, nogil=True):
        maximum_path_each(paths[item], values[item], t_xs[item], t_ys[item], max_neg_val)
|
|
@ -1,14 +1,27 @@
|
|||
[build-system]
|
||||
requires = [
|
||||
"setuptools",
|
||||
"setuptools-scm",
|
||||
"cython>=3.0.0",
|
||||
"numpy>=2.0.0",
|
||||
]
|
||||
build-backend = "setuptools.build_meta"
|
||||
# ,*++++++*, ,*++++++*,
|
||||
# *++. .+++ *++. .++*
|
||||
# *+* ,++++* *+* *+* ,++++, *+*
|
||||
# ,+, .++++++++++* ,++,,,,*+, ,++++++++++. *+,
|
||||
# *+. .++++++++++++..++ *+.,++++++++++++. .+*
|
||||
# .+* ++++++++++++.*+, .+*.++++++++++++ *+,
|
||||
# .++ *++++++++* ++, .++.*++++++++* ++,
|
||||
# ,+++*. . .*++, ,++*. .*+++*
|
||||
# *+, .,*++**. .**++**. ,+*
|
||||
# .+* *+,
|
||||
# *+. Coqui .+*
|
||||
# *+* +++ TTS +++ *+*
|
||||
# .+++*. . . *+++.
|
||||
# ,+* *+++*... ...*+++* *+,
|
||||
# .++. .""""+++++++****+++++++"""". ++.
|
||||
# ,++. .++,
|
||||
# .++* *++.
|
||||
# *+++, ,+++*
|
||||
# .,*++++::::::++++*,.
|
||||
# ``````
|
||||
|
||||
[tool.setuptools.packages.find]
|
||||
include = ["TTS*"]
|
||||
[build-system]
|
||||
requires = ["hatchling"]
|
||||
build-backend = "hatchling.build"
|
||||
|
||||
[project]
|
||||
name = "coqui-tts"
|
||||
|
@ -64,6 +77,7 @@ dependencies = [
|
|||
# Coqui stack
|
||||
"coqui-tts-trainer>=0.1.4,<0.2.0",
|
||||
"coqpit>=0.0.16",
|
||||
"monotonic-alignment-search>=0.1.0",
|
||||
# Gruut + supported languages
|
||||
"gruut[de,es,fr]>=2.4.0",
|
||||
# Tortoise
|
||||
|
@ -151,6 +165,22 @@ tts-server = "TTS.server.server:main"
|
|||
[tool.uv]
|
||||
constraint-dependencies = ["numba>0.58.0"]
|
||||
|
||||
[tool.hatch.build]
|
||||
exclude = [
|
||||
"/.github",
|
||||
"/.gitignore",
|
||||
"/.pre-commit-config.yaml",
|
||||
"/.readthedocs.yml",
|
||||
"/Makefile",
|
||||
"/dockerfiles",
|
||||
"/run_bash_tests.sh",
|
||||
"/scripts",
|
||||
"/tests",
|
||||
]
|
||||
|
||||
[tool.hatch.build.targets.wheel]
|
||||
packages = ["TTS"]
|
||||
|
||||
[tool.ruff]
|
||||
line-length = 120
|
||||
extend-exclude = ["*.ipynb"]
|
||||
|
|
37
setup.py
37
setup.py
|
@ -1,37 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
# ,*++++++*, ,*++++++*,
|
||||
# *++. .+++ *++. .++*
|
||||
# *+* ,++++* *+* *+* ,++++, *+*
|
||||
# ,+, .++++++++++* ,++,,,,*+, ,++++++++++. *+,
|
||||
# *+. .++++++++++++..++ *+.,++++++++++++. .+*
|
||||
# .+* ++++++++++++.*+, .+*.++++++++++++ *+,
|
||||
# .++ *++++++++* ++, .++.*++++++++* ++,
|
||||
# ,+++*. . .*++, ,++*. .*+++*
|
||||
# *+, .,*++**. .**++**. ,+*
|
||||
# .+* *+,
|
||||
# *+. Coqui .+*
|
||||
# *+* +++ TTS +++ *+*
|
||||
# .+++*. . . *+++.
|
||||
# ,+* *+++*... ...*+++* *+,
|
||||
# .++. .""""+++++++****+++++++"""". ++.
|
||||
# ,++. .++,
|
||||
# .++* *++.
|
||||
# *+++, ,+++*
|
||||
# .,*++++::::::++++*,.
|
||||
# ``````
|
||||
|
||||
import numpy
|
||||
from Cython.Build import cythonize
|
||||
from setuptools import Extension, setup
|
||||
|
||||
# Compiled extension providing the monotonic alignment search kernel.
# The NumPy headers are attached to the extension itself, as a list, because
# ``include_dirs`` is documented as a list of directories.
exts = [
    Extension(
        name="TTS.tts.utils.monotonic_align.core",
        sources=["TTS/tts/utils/monotonic_align/core.pyx"],
        include_dirs=[numpy.get_include()],
    )
]
setup(
    ext_modules=cythonize(exts, language_level=3),
    zip_safe=False,
)
|
Loading…
Reference in New Issue