Merge branch 'p3_11' into dev

Eren Gölge 2023-06-28 12:13:04 +02:00
commit 6b9ebf5aab
32 changed files with 114 additions and 116 deletions

View File

@@ -18,7 +18,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.7, 3.8, 3.9, "3.10"]
+        python-version: [3.9, "3.10", "3.11"]
         experimental: [false]
     steps:
       - uses: actions/checkout@v3
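
The same interpreter-window bump (drop 3.7 and 3.8, add 3.11) repeats in every test workflow below. A quick way to sanity-check a version string against the new support window, sketched with the packaging library (the helper itself is not part of this commit):

from packaging.specifiers import SpecifierSet

# the support window this commit moves to
supported = SpecifierSet(">=3.9,<3.12")

print("3.8" in supported)   # False - dropped by this commit
print("3.11" in supported)  # True - newly added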

View File

@@ -18,7 +18,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.7, 3.8, 3.9, "3.10"]
+        python-version: [3.9, "3.10", "3.11"]
         experimental: [false]
     steps:
       - uses: actions/checkout@v3

View File

@@ -18,7 +18,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.7, 3.8, 3.9, "3.10"]
+        python-version: [3.9, "3.10", "3.11"]
         experimental: [false]
     steps:
       - uses: actions/checkout@v3

View File

@@ -21,7 +21,7 @@ jobs:
           fi
       - uses: actions/setup-python@v2
         with:
-          python-version: 3.8
+          python-version: 3.9
       - run: |
           python -m pip install -U pip setuptools wheel build
       - run: |
@@ -36,7 +36,7 @@ jobs:
     runs-on: ubuntu-20.04
     strategy:
       matrix:
-        python-version: ["3.7", "3.8", "3.9", "3.10"]
+        python-version: ["3.9", "3.10", "3.11"]
     steps:
       - uses: actions/checkout@v2
       - uses: actions/setup-python@v2
@@ -64,14 +64,6 @@ jobs:
         with:
           name: "sdist"
           path: "dist/"
-      - uses: actions/download-artifact@v2
-        with:
-          name: "wheel-3.7"
-          path: "dist/"
-      - uses: actions/download-artifact@v2
-        with:
-          name: "wheel-3.8"
-          path: "dist/"
       - uses: actions/download-artifact@v2
         with:
           name: "wheel-3.9"
@@ -80,6 +72,10 @@ jobs:
         with:
           name: "wheel-3.10"
           path: "dist/"
+      - uses: actions/download-artifact@v2
+        with:
+          name: "wheel-3.11"
+          path: "dist/"
       - run: |
           ls -lh dist/
       - name: Setup PyPI config
@@ -91,7 +87,7 @@ jobs:
           EOF
       - uses: actions/setup-python@v2
         with:
-          python-version: 3.8
+          python-version: 3.9
       - run: |
           python -m pip install twine
       - run: |

View File

@@ -42,6 +42,6 @@ jobs:
         run: |
           python3 -m pip install .[all]
           python3 setup.py egg_info
-      - name: Lint check
-        run: |
-          make lint
+      # - name: Lint check
+      #   run: |
+      #     make lint

View File

@@ -18,7 +18,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.7, 3.8, 3.9, "3.10"]
+        python-version: [3.9, "3.10", "3.11"]
         experimental: [false]
     steps:
       - uses: actions/checkout@v3

View File

@@ -18,7 +18,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.7, 3.8, 3.9, "3.10"]
+        python-version: [3.9, "3.10", "3.11"]
         experimental: [false]
     steps:
       - uses: actions/checkout@v3

View File

@@ -18,7 +18,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.7, 3.8, 3.9, "3.10"]
+        python-version: [3.9, "3.10", "3.11"]
         experimental: [false]
     steps:
       - uses: actions/checkout@v3

View File

@@ -18,7 +18,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.7, 3.8, 3.9, "3.10"]
+        python-version: [3.9, "3.10", "3.11"]
         experimental: [false]
     steps:
       - uses: actions/checkout@v3

View File

@@ -18,7 +18,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.7, 3.8, 3.9, "3.10"]
+        python-version: [3.9, "3.10", "3.11"]
         experimental: [false]
     steps:
       - uses: actions/checkout@v3
@@ -43,6 +43,7 @@ jobs:
         run: python3 -m pip install --upgrade pip setuptools wheel
       - name: Replace scarf urls
         run: |
+          sed -i 's/https:\/\/coqui.gateway.scarf.sh\/hf\/bark\//https:\/\/huggingface.co\/erogol\/bark\/resolve\/main\//g' TTS/.models.json
           sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
       - name: Install TTS
         run: |
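
The added line appears to be the Bark-specific rewrite: it redirects the Bark entries of the scarf.sh tracking gateway to their Hugging Face host before the generic gateway rewrite runs. Roughly the same edit in Python, as a sketch (plain string replacement, same URLs as above; order matters, the more specific prefix must go first):

# sketch: what the two sed commands above do to TTS/.models.json
with open("TTS/.models.json") as f:
    text = f.read()

text = text.replace(
    "https://coqui.gateway.scarf.sh/hf/bark/",
    "https://huggingface.co/erogol/bark/resolve/main/",
)
text = text.replace(
    "https://coqui.gateway.scarf.sh/",
    "https://github.com/coqui-ai/TTS/releases/download/",
)

with open("TTS/.models.json", "w") as f:
    f.write(text)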

View File

@@ -18,7 +18,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.7, 3.8, 3.9, "3.10"]
+        python-version: [3.9, "3.10", "3.11"]
         experimental: [false]
     steps:
       - uses: actions/checkout@v3

View File

@@ -23,7 +23,7 @@ colormap = (
             [0, 0, 0],
             [183, 183, 183],
         ],
-        dtype=np.float,
+        dtype=float,
     )
     / 255
 )
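
np.float, np.bool and np.complex were deprecated aliases of the Python builtins: NumPy 1.20 started warning on them and 1.24 removed them, which is why the 3.11-compatible NumPy pinned later in this commit (1.24.3 in requirements.txt) breaks the old spellings. The builtins behave identically, e.g.:

import numpy as np

a = np.array([1, 2, 3], dtype=float)           # was: dtype=np.float
m = np.array([0, 1, 1]).astype(bool)           # was: .astype(np.bool)
s = np.abs(np.random.rand(4)).astype(complex)  # was: .astype(np.complex)
print(a.dtype, m.dtype, s.dtype)  # float64 bool complex128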

View File

@@ -1,5 +1,5 @@
 import os
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import Dict

 from TTS.tts.configs.shared_configs import BaseTTSConfig
@@ -46,11 +46,11 @@ class BarkConfig(BaseTTSConfig):
     """

     model: str = "bark"
-    audio: BarkAudioConfig = BarkAudioConfig()
+    audio: BarkAudioConfig = field(default_factory=BarkAudioConfig)
     num_chars: int = 0
-    semantic_config: GPTConfig = GPTConfig()
-    fine_config: FineGPTConfig = FineGPTConfig()
-    coarse_config: GPTConfig = GPTConfig()
+    semantic_config: GPTConfig = field(default_factory=GPTConfig)
+    fine_config: FineGPTConfig = field(default_factory=FineGPTConfig)
+    coarse_config: GPTConfig = field(default_factory=GPTConfig)
     CONTEXT_WINDOW_SIZE: int = 1024
     SEMANTIC_RATE_HZ: float = 49.9
     SEMANTIC_VOCAB_SIZE: int = 10_000
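
The field(default_factory=...) changes across these config files are the core of the 3.11 port: Python 3.11 dataclasses reject any unhashable class-instance default (earlier versions only rejected list, dict and set), and these Coqpit configs are dataclasses whose instances are unhashable. A minimal reproduction, with a stand-in class instead of the real config:

from dataclasses import dataclass, field

@dataclass
class AudioCfg:  # stand-in; @dataclass with eq=True sets __hash__ = None
    sample_rate: int = 24000

@dataclass
class ConfigSketch:
    # audio: AudioCfg = AudioCfg()
    # ^ raises ValueError on 3.11 ("mutable default ... use default_factory")
    audio: AudioCfg = field(default_factory=AudioCfg)  # fresh instance per config

print(ConfigSketch().audio.sample_rate)  # 24000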

View File

@@ -113,7 +113,7 @@ class FastPitchConfig(BaseTTSConfig):
     base_model: str = "forward_tts"

     # model specific params
-    model_args: ForwardTTSArgs = ForwardTTSArgs()
+    model_args: ForwardTTSArgs = field(default_factory=ForwardTTSArgs)

     # multi-speaker settings
     num_speakers: int = 0

View File

@@ -107,7 +107,7 @@ class FastSpeechConfig(BaseTTSConfig):
     base_model: str = "forward_tts"

     # model specific params
-    model_args: ForwardTTSArgs = ForwardTTSArgs(use_pitch=False)
+    model_args: ForwardTTSArgs = field(default_factory=lambda: ForwardTTSArgs(use_pitch=False))

     # multi-speaker settings
     num_speakers: int = 0
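
default_factory must be a zero-argument callable, so a parametrized default such as ForwardTTSArgs(use_pitch=False) gets wrapped in a lambda here and in the configs below; functools.partial is an equivalent spelling. A sketch with stand-in classes:

from dataclasses import dataclass, field
from functools import partial

@dataclass
class Args:  # stand-in for ForwardTTSArgs
    use_pitch: bool = True

@dataclass
class ConfigA:
    model_args: Args = field(default_factory=lambda: Args(use_pitch=False))

@dataclass
class ConfigB:  # same default, spelled with partial
    model_args: Args = field(default_factory=partial(Args, use_pitch=False))

print(ConfigA().model_args == ConfigB().model_args)  # True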

View File

@@ -123,7 +123,7 @@ class Fastspeech2Config(BaseTTSConfig):
     base_model: str = "forward_tts"

     # model specific params
-    model_args: ForwardTTSArgs = ForwardTTSArgs(use_pitch=True, use_energy=True)
+    model_args: ForwardTTSArgs = field(default_factory=lambda: ForwardTTSArgs(use_pitch=True, use_energy=True))

     # multi-speaker settings
     num_speakers: int = 0

View File

@@ -103,26 +103,28 @@ class SpeedySpeechConfig(BaseTTSConfig):
     base_model: str = "forward_tts"

     # set model args as SpeedySpeech
-    model_args: ForwardTTSArgs = ForwardTTSArgs(
-        use_pitch=False,
-        encoder_type="residual_conv_bn",
-        encoder_params={
-            "kernel_size": 4,
-            "dilations": 4 * [1, 2, 4] + [1],
-            "num_conv_blocks": 2,
-            "num_res_blocks": 13,
-        },
-        decoder_type="residual_conv_bn",
-        decoder_params={
-            "kernel_size": 4,
-            "dilations": 4 * [1, 2, 4, 8] + [1],
-            "num_conv_blocks": 2,
-            "num_res_blocks": 17,
-        },
-        out_channels=80,
-        hidden_channels=128,
-        positional_encoding=True,
-        detach_duration_predictor=True,
+    model_args: ForwardTTSArgs = field(
+        default_factory=lambda: ForwardTTSArgs(
+            use_pitch=False,
+            encoder_type="residual_conv_bn",
+            encoder_params={
+                "kernel_size": 4,
+                "dilations": 4 * [1, 2, 4] + [1],
+                "num_conv_blocks": 2,
+                "num_res_blocks": 13,
+            },
+            decoder_type="residual_conv_bn",
+            decoder_params={
+                "kernel_size": 4,
+                "dilations": 4 * [1, 2, 4, 8] + [1],
+                "num_conv_blocks": 2,
+                "num_res_blocks": 17,
+            },
+            out_channels=80,
+            hidden_channels=128,
+            positional_encoding=True,
+            detach_duration_predictor=True,
+        )
     )

     # multi-speaker settings

View File

@@ -70,7 +70,7 @@ class TortoiseConfig(BaseTTSConfig):
     model: str = "tortoise"
     # model specific params
     model_args: TortoiseArgs = field(default_factory=TortoiseArgs)
-    audio: TortoiseAudioConfig = TortoiseAudioConfig()
+    audio: TortoiseAudioConfig = field(default_factory=TortoiseAudioConfig)
     model_dir: str = None
     # settings

View File

@@ -10,15 +10,11 @@ License: MIT
 import logging
 from pathlib import Path

-import fairseq
 import torch
 from einops import pack, unpack
 from torch import nn
 from torchaudio.functional import resample

 logging.root.setLevel(logging.ERROR)

+from transformers import HubertModel


 def round_down_nearest_multiple(num, divisor):
     return num // divisor * divisor
@@ -49,22 +45,11 @@ class CustomHubert(nn.Module):
         self.target_sample_hz = target_sample_hz
         self.seq_len_multiple_of = seq_len_multiple_of
         self.output_layer = output_layer
+        if device is not None:
+            self.to(device)

-        model_path = Path(checkpoint_path)
-        assert model_path.exists(), f"path {checkpoint_path} does not exist"
-        checkpoint = torch.load(checkpoint_path)
-        load_model_input = {checkpoint_path: checkpoint}
-        model, *_ = fairseq.checkpoint_utils.load_model_ensemble_and_task(load_model_input)
+        self.model = HubertModel.from_pretrained("facebook/hubert-base-ls960")

         if device is not None:
-            model[0].to(device)
-        self.model = model[0]
+            self.model.to(device)
         self.model.eval()
@@ -81,19 +66,13 @@ class CustomHubert(nn.Module):
         if exists(self.seq_len_multiple_of):
             wav_input = curtail_to_multiple(wav_input, self.seq_len_multiple_of)

-        embed = self.model(
+        outputs = self.model.forward(
             wav_input,
-            features_only=True,
-            mask=False,  # thanks to @maitycyrus for noticing that mask is defaulted to True in the fairseq code
-            output_layer=self.output_layer,
+            output_hidden_states=True,
         )
-        embed, packed_shape = pack([embed["x"]], "* d")
-        # codebook_indices = self.kmeans.predict(embed.cpu().detach().numpy())
-        codebook_indices = torch.from_numpy(embed.cpu().detach().numpy()).to(device)  # .long()
+        embed = outputs["hidden_states"][self.output_layer]
+        embed, packed_shape = pack([embed], "* d")
+        codebook_indices = torch.from_numpy(embed.cpu().detach().numpy()).to(device)

         if flatten:
             return codebook_indices
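
The loader now pulls HuBERT through transformers instead of fairseq (fairseq is dropped from requirements.txt later in this commit) and reads the embedding out of hidden_states rather than fairseq's features_only path. A standalone sketch of that API; the checkpoint name matches the code above, the layer index is illustrative:

import torch
from transformers import HubertModel

model = HubertModel.from_pretrained("facebook/hubert-base-ls960").eval()
wav = torch.randn(1, 16000)  # one second of fake 16 kHz audio

with torch.no_grad():
    outputs = model(wav, output_hidden_states=True)

# hidden_states holds the embedding output plus one entry per transformer layer
embed = outputs.hidden_states[9]  # illustrative layer index
print(embed.shape)  # (1, ~49 frames, 768) for the base model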

View File

@@ -130,7 +130,7 @@ def generate_voice(
     # generate semantic tokens
     # Load the HuBERT model
     hubert_manager = HubertManager()
-    hubert_manager.make_sure_hubert_installed(model_path=model.config.LOCAL_MODEL_PATHS["hubert"])
+    # hubert_manager.make_sure_hubert_installed(model_path=model.config.LOCAL_MODEL_PATHS["hubert"])
     hubert_manager.make_sure_tokenizer_installed(model_path=model.config.LOCAL_MODEL_PATHS["hubert_tokenizer"])

     hubert_model = CustomHubert(checkpoint_path=model.config.LOCAL_MODEL_PATHS["hubert"]).to(model.device)

View File

@@ -165,7 +165,7 @@ class BCELossMasked(nn.Module):
     def __init__(self, pos_weight: float = None):
         super().__init__()
-        self.pos_weight = nn.Parameter(torch.tensor([pos_weight]), requires_grad=False)
+        self.register_buffer("pos_weight", torch.tensor([pos_weight]))

     def forward(self, x, target, length):
         """
@@ -191,10 +191,15 @@ class BCELossMasked(nn.Module):
             mask = sequence_mask(sequence_length=length, max_len=target.size(1))
             num_items = mask.sum()
             loss = functional.binary_cross_entropy_with_logits(
-                x.masked_select(mask), target.masked_select(mask), pos_weight=self.pos_weight, reduction="sum"
+                x.masked_select(mask),
+                target.masked_select(mask),
+                pos_weight=self.pos_weight.to(x.device),
+                reduction="sum",
             )
         else:
-            loss = functional.binary_cross_entropy_with_logits(x, target, pos_weight=self.pos_weight, reduction="sum")
+            loss = functional.binary_cross_entropy_with_logits(
+                x, target, pos_weight=self.pos_weight.to(x.device), reduction="sum"
+            )
             num_items = torch.numel(x)
         loss = loss / num_items
         return loss
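
Swapping nn.Parameter(..., requires_grad=False) for register_buffer keeps pos_weight out of model.parameters(), so optimizers and parameter counts ignore it, while it still moves with .to()/.cuda() and is saved in the state_dict; the explicit .to(x.device) covers inputs arriving on a different device. A sketch of the difference:

import torch
from torch import nn

class WithBuffer(nn.Module):
    def __init__(self):
        super().__init__()
        self.register_buffer("pos_weight", torch.tensor([5.0]))

class WithParam(nn.Module):
    def __init__(self):
        super().__init__()
        self.pos_weight = nn.Parameter(torch.tensor([5.0]), requires_grad=False)

print(len(list(WithBuffer().parameters())))       # 0 - buffers are not parameters
print(len(list(WithParam().parameters())))        # 1 - visible to every optimizer
print("pos_weight" in WithBuffer().state_dict())  # True - still checkpointed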

View File

@@ -207,7 +207,7 @@ def maximum_path_numpy(value, mask, max_neg_val=None):
     device = value.device
     dtype = value.dtype
     value = value.cpu().detach().numpy()
-    mask = mask.cpu().detach().numpy().astype(np.bool)
+    mask = mask.cpu().detach().numpy().astype(bool)

     b, t_x, t_y = value.shape
     direction = np.zeros(value.shape, dtype=np.int64)
View File

@@ -540,7 +540,10 @@ class AudioProcessor(object):
     def _griffin_lim(self, S):
         angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
-        S_complex = np.abs(S).astype(np.complex)
+        try:
+            S_complex = np.abs(S).astype(np.complex)
+        except AttributeError:  # np.complex is deprecated since numpy 1.20.0
+            S_complex = np.abs(S).astype(complex)
         y = self._istft(S_complex * angles)
         if not np.isfinite(y).all():
             print(" [!] Waveform is not finite everywhere. Skipping the GL.")

View File

@@ -264,12 +264,15 @@ class ModelManager(object):
             model_download_uri = os.path.join(URI_PREFIX, f"{lang}.tar.gz")
             self._download_tar_file(model_download_uri, output_path, self.progress_bar)

-    def set_model_url(self, model_item: Dict):
+    @staticmethod
+    def set_model_url(model_item: Dict):
         model_item["model_url"] = None
         if "github_rls_url" in model_item:
             model_item["model_url"] = model_item["github_rls_url"]
         elif "hf_url" in model_item:
             model_item["model_url"] = model_item["hf_url"]
+        elif "fairseq" in model_item["model_name"]:
+            model_item["model_url"] = "https://coqui.gateway.scarf.sh/fairseq/"
         return model_item

     def _set_model_item(self, model_name):
@@ -285,10 +288,12 @@ class ModelManager(object):
                 "author": "fairseq",
                 "description": "this model is released by Meta under Fairseq repo. Visit https://github.com/facebookresearch/fairseq/tree/main/examples/mms for more info.",
             }
+            model_item["model_name"] = model_name
         else:
             # get model from models.json
             model_item = self.models_dict[model_type][lang][dataset][model]
         model_item["model_type"] = model_type
+        model_item = self.set_model_url(model_item)
         return model_item, model_full_name, model

     def download_model(self, model_name):
@@ -324,7 +329,9 @@ class ModelManager(object):
         # find downloaded files
         output_model_path = output_path
         output_config_path = None
-        if model not in ["tortoise-v2", "bark"] and "fairseq" not in model_name:  # TODO:This is stupid but don't care for now.
+        if (
+            model not in ["tortoise-v2", "bark"] and "fairseq" not in model_name
+        ):  # TODO:This is stupid but don't care for now.
             output_model_path, output_config_path = self._find_files(output_path)
         # update paths in the config.json
         self._update_paths(output_path, output_config_path)
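
set_model_url never touched self, so it becomes a @staticmethod while picking up a fairseq fallback, and _set_model_item now routes every model_item through it. A hypothetical minimal class showing why the decorator fits (not the repo's real class):

class ManagerSketch:
    @staticmethod
    def set_model_url(model_item: dict) -> dict:
        # no self: the method only transforms its argument
        model_item["model_url"] = model_item.get("github_rls_url") or model_item.get("hf_url")
        return model_item

# callable without an instance, and from instances alike
print(ManagerSketch.set_model_url({"hf_url": "https://example.org/model.zip"})["model_url"])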

View File

@@ -794,8 +794,8 @@ class FreeVCConfig(BaseVCConfig):
     model: str = "freevc"

     # model specific params
-    model_args: FreeVCArgs = FreeVCArgs()
-    audio: FreeVCAudioConfig = FreeVCAudioConfig()
+    model_args: FreeVCArgs = field(default_factory=FreeVCArgs)
+    audio: FreeVCAudioConfig = field(default_factory=FreeVCAudioConfig)

     # optimizer
     # TODO with training support

View File

@@ -1,5 +1,5 @@
 [build-system]
-requires = ["setuptools", "wheel", "cython==0.29.28", "numpy==1.21.6", "packaging"]
+requires = ["setuptools", "wheel", "cython==0.29.30", "numpy==1.22.0", "packaging"]

 [flake8]
 max-line-length=120

View File

@@ -1,14 +1,14 @@
 # core deps
-numpy==1.21.6;python_version<"3.10"
-numpy;python_version=="3.10"
-cython==0.29.28
+numpy==1.22.0;python_version<="3.10"
+numpy==1.24.3;python_version>"3.10"
+cython==0.29.30
 scipy>=1.4.0
 torch>=1.7
 torchaudio
 soundfile
 librosa==0.10.0.*
 numba==0.55.1;python_version<"3.9"
-numba==0.56.4;python_version>="3.9"
+numba==0.57.0;python_version>="3.9"
 inflect==5.6.0
 tqdm
 anyascii
@@ -26,14 +26,14 @@ pandas
 # deps for training
 matplotlib
 # coqui stack
-trainer==0.0.20
+trainer
 # config management
 coqpit>=0.0.16
 # chinese g2p deps
 jieba
 pypinyin
 # japanese g2p deps
-mecab-python3==1.0.5
+mecab-python3==1.0.6
 unidic-lite==1.0.8
 # gruut+supported langs
 gruut[de,es,fr]==2.2.3
@@ -51,5 +51,3 @@ einops
 transformers
 #deps for bark
 encodec
-#deps for fairseq models
-fairseq
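
The split numpy pins use PEP 508 environment markers: pip evaluates the ;python_version clause against the running interpreter, so 3.9 and 3.10 get 1.22.0 while 3.11 gets 1.24.3 (numpy 1.22 predates CPython 3.11 support). The same evaluation can be reproduced with the packaging library that pip vendors (sketch):

from packaging.markers import Marker

marker = Marker('python_version <= "3.10"')
print(marker.evaluate())  # True on a 3.9/3.10 interpreter, False on 3.11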

View File

@@ -1,8 +1,8 @@
 [build_py]
-build-lib=temp_build
+build_lib=temp_build

 [bdist_wheel]
-bdist-dir=temp_build
+bdist_dir=temp_build

 [install_lib]
-build-dir=temp_build
+build_dir=temp_build

View File

@@ -32,8 +32,8 @@ from Cython.Build import cythonize
 from setuptools import Extension, find_packages, setup

 python_version = sys.version.split()[0]
-if Version(python_version) < Version("3.7") or Version(python_version) >= Version("3.11"):
-    raise RuntimeError("TTS requires python >= 3.7 and < 3.11 " "but your Python version is {}".format(sys.version))
+if Version(python_version) < Version("3.9") or Version(python_version) >= Version("3.12"):
+    raise RuntimeError("TTS requires python >= 3.9 and < 3.12 " "but your Python version is {}".format(sys.version))

 cwd = os.path.dirname(os.path.abspath(__file__))
@@ -114,15 +114,14 @@ setup(
         "dev": requirements_dev,
         "notebooks": requirements_notebooks,
     },
-    python_requires=">=3.7.0, <3.11",
+    python_requires=">=3.9.0, <3.12",
     entry_points={"console_scripts": ["tts=TTS.bin.synthesize:main", "tts-server = TTS.server.server:main"]},
     classifiers=[
         "Programming Language :: Python",
         "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.7",
-        "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
         "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3.11",
         "Development Status :: 3 - Alpha",
         "Intended Audience :: Science/Research",
         "Intended Audience :: Developers",

View File

@@ -1,5 +1,5 @@
 import unittest
-from dataclasses import dataclass
+from dataclasses import dataclass, field

 from coqpit import Coqpit
@@ -86,11 +86,11 @@ class TestTTSTokenizer(unittest.TestCase):
             enable_eos_bos_chars: bool = True
             use_phonemes: bool = True
             add_blank: bool = False
-            characters: str = Characters()
+            characters: str = field(default_factory=Characters)
             phonemizer: str = "espeak"
             phoneme_language: str = "tr"
             text_cleaner: str = "phoneme_cleaners"
-            characters = Characters()
+            characters = field(default_factory=Characters)

         tokenizer_ph, _ = TTSTokenizer.init_from_config(TokenizerConfig())
         tokenizer_ph.phonemizer.backend = "espeak"

View File

@@ -16,7 +16,7 @@ from TTS.utils.audio import AudioProcessor

 torch.manual_seed(1)
 use_cuda = torch.cuda.is_available()
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+device = torch.device("cuda" if use_cuda else "cpu")

 config_global = TacotronConfig(num_chars=32, num_speakers=5, out_channels=513, decoder_output_dim=80)
@@ -288,7 +288,6 @@ class TacotronCapacitronTrainTest(unittest.TestCase):
                 batch["text_input"].shape[0], batch["stop_targets"].size(1) // config.r, -1
             )
             batch["stop_targets"] = (batch["stop_targets"].sum(2) > 0.0).unsqueeze(2).float().squeeze()
-
             model = Tacotron(config).to(device)
             criterion = model.get_criterion()
             optimizer = model.get_optimizer()

View File

@@ -15,7 +15,7 @@ def run_models(offset=0, step=1):
     print(" > Run synthesizer with all the models.")
     output_path = os.path.join(get_tests_output_path(), "output.wav")
     manager = ModelManager(output_prefix=get_tests_output_path(), progress_bar=False)
-    model_names = manager.list_models()
+    model_names = [name for name in manager.list_models() if "bark" not in name]
     for model_name in model_names[offset::step]:
         print(f"\n > Run - {model_name}")
         model_path, _, _ = manager.download_model(model_name)
@@ -79,6 +79,15 @@ def test_models_offset_2_step_3():
     run_models(offset=2, step=3)

+def test_bark():
+    """Bark is too big to run on github actions. We need to test it locally"""
+    output_path = os.path.join(get_tests_output_path(), "output.wav")
+    run_cli(
+        f" tts --model_name tts_models/multilingual/multi-dataset/bark "
+        f'--text "This is an example." --out_path "{output_path}" --progress_bar False'
+    )

 def test_voice_conversion():
     print(" > Run voice conversion inference using YourTTS model.")
     model_name = "tts_models/multilingual/multi-dataset/your_tts"