mirror of https://github.com/coqui-ai/TTS.git
Merge branch 'p3_11' into dev
This commit is contained in:
commit
6b9ebf5aab
|
@ -18,7 +18,7 @@ jobs:
|
||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
python-version: [3.7, 3.8, 3.9, "3.10"]
|
python-version: [3.9, "3.10", "3.11"]
|
||||||
experimental: [false]
|
experimental: [false]
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v3
|
- uses: actions/checkout@v3
|
||||||
|
|
|
@ -18,7 +18,7 @@ jobs:
|
||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
python-version: [3.7, 3.8, 3.9, "3.10"]
|
python-version: [3.9, "3.10", "3.11"]
|
||||||
experimental: [false]
|
experimental: [false]
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v3
|
- uses: actions/checkout@v3
|
||||||
|
|
|
@ -18,7 +18,7 @@ jobs:
|
||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
python-version: [3.7, 3.8, 3.9, "3.10"]
|
python-version: [3.9, "3.10", "3.11"]
|
||||||
experimental: [false]
|
experimental: [false]
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v3
|
- uses: actions/checkout@v3
|
||||||
|
|
|
@ -21,7 +21,7 @@ jobs:
|
||||||
fi
|
fi
|
||||||
- uses: actions/setup-python@v2
|
- uses: actions/setup-python@v2
|
||||||
with:
|
with:
|
||||||
python-version: 3.8
|
python-version: 3.9
|
||||||
- run: |
|
- run: |
|
||||||
python -m pip install -U pip setuptools wheel build
|
python -m pip install -U pip setuptools wheel build
|
||||||
- run: |
|
- run: |
|
||||||
|
@ -36,7 +36,7 @@ jobs:
|
||||||
runs-on: ubuntu-20.04
|
runs-on: ubuntu-20.04
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
python-version: ["3.7", "3.8", "3.9", "3.10"]
|
python-version: ["3.9", "3.10", "3.11"]
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v2
|
- uses: actions/checkout@v2
|
||||||
- uses: actions/setup-python@v2
|
- uses: actions/setup-python@v2
|
||||||
|
@ -64,14 +64,6 @@ jobs:
|
||||||
with:
|
with:
|
||||||
name: "sdist"
|
name: "sdist"
|
||||||
path: "dist/"
|
path: "dist/"
|
||||||
- uses: actions/download-artifact@v2
|
|
||||||
with:
|
|
||||||
name: "wheel-3.7"
|
|
||||||
path: "dist/"
|
|
||||||
- uses: actions/download-artifact@v2
|
|
||||||
with:
|
|
||||||
name: "wheel-3.8"
|
|
||||||
path: "dist/"
|
|
||||||
- uses: actions/download-artifact@v2
|
- uses: actions/download-artifact@v2
|
||||||
with:
|
with:
|
||||||
name: "wheel-3.9"
|
name: "wheel-3.9"
|
||||||
|
@ -80,6 +72,10 @@ jobs:
|
||||||
with:
|
with:
|
||||||
name: "wheel-3.10"
|
name: "wheel-3.10"
|
||||||
path: "dist/"
|
path: "dist/"
|
||||||
|
- uses: actions/download-artifact@v2
|
||||||
|
with:
|
||||||
|
name: "wheel-3.11"
|
||||||
|
path: "dist/"
|
||||||
- run: |
|
- run: |
|
||||||
ls -lh dist/
|
ls -lh dist/
|
||||||
- name: Setup PyPI config
|
- name: Setup PyPI config
|
||||||
|
@ -91,7 +87,7 @@ jobs:
|
||||||
EOF
|
EOF
|
||||||
- uses: actions/setup-python@v2
|
- uses: actions/setup-python@v2
|
||||||
with:
|
with:
|
||||||
python-version: 3.8
|
python-version: 3.9
|
||||||
- run: |
|
- run: |
|
||||||
python -m pip install twine
|
python -m pip install twine
|
||||||
- run: |
|
- run: |
|
||||||
|
|
|
@ -42,6 +42,6 @@ jobs:
|
||||||
run: |
|
run: |
|
||||||
python3 -m pip install .[all]
|
python3 -m pip install .[all]
|
||||||
python3 setup.py egg_info
|
python3 setup.py egg_info
|
||||||
- name: Lint check
|
# - name: Lint check
|
||||||
run: |
|
# run: |
|
||||||
make lint
|
# make lint
|
|
@ -18,7 +18,7 @@ jobs:
|
||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
python-version: [3.7, 3.8, 3.9, "3.10"]
|
python-version: [3.9, "3.10", "3.11"]
|
||||||
experimental: [false]
|
experimental: [false]
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v3
|
- uses: actions/checkout@v3
|
||||||
|
|
|
@ -18,7 +18,7 @@ jobs:
|
||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
python-version: [3.7, 3.8, 3.9, "3.10"]
|
python-version: [3.9, "3.10", "3.11"]
|
||||||
experimental: [false]
|
experimental: [false]
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v3
|
- uses: actions/checkout@v3
|
||||||
|
|
|
@ -18,7 +18,7 @@ jobs:
|
||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
python-version: [3.7, 3.8, 3.9, "3.10"]
|
python-version: [3.9, "3.10", "3.11"]
|
||||||
experimental: [false]
|
experimental: [false]
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v3
|
- uses: actions/checkout@v3
|
||||||
|
|
|
@ -18,7 +18,7 @@ jobs:
|
||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
python-version: [3.7, 3.8, 3.9, "3.10"]
|
python-version: [3.9, "3.10", "3.11"]
|
||||||
experimental: [false]
|
experimental: [false]
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v3
|
- uses: actions/checkout@v3
|
||||||
|
|
|
@ -18,7 +18,7 @@ jobs:
|
||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
python-version: [3.7, 3.8, 3.9, "3.10"]
|
python-version: [3.9, "3.10", "3.11"]
|
||||||
experimental: [false]
|
experimental: [false]
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v3
|
- uses: actions/checkout@v3
|
||||||
|
@ -43,6 +43,7 @@ jobs:
|
||||||
run: python3 -m pip install --upgrade pip setuptools wheel
|
run: python3 -m pip install --upgrade pip setuptools wheel
|
||||||
- name: Replace scarf urls
|
- name: Replace scarf urls
|
||||||
run: |
|
run: |
|
||||||
|
sed -i 's/https:\/\/coqui.gateway.scarf.sh\/hf\/bark\//https:\/\/huggingface.co\/erogol\/bark\/resolve\/main\//g' TTS/.models.json
|
||||||
sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
|
sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
|
||||||
- name: Install TTS
|
- name: Install TTS
|
||||||
run: |
|
run: |
|
||||||
|
|
|
@ -18,7 +18,7 @@ jobs:
|
||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
python-version: [3.7, 3.8, 3.9, "3.10"]
|
python-version: [3.9, "3.10", "3.11"]
|
||||||
experimental: [false]
|
experimental: [false]
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v3
|
- uses: actions/checkout@v3
|
||||||
|
|
|
@ -23,7 +23,7 @@ colormap = (
|
||||||
[0, 0, 0],
|
[0, 0, 0],
|
||||||
[183, 183, 183],
|
[183, 183, 183],
|
||||||
],
|
],
|
||||||
dtype=np.float,
|
dtype=float,
|
||||||
)
|
)
|
||||||
/ 255
|
/ 255
|
||||||
)
|
)
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
import os
|
import os
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass, field
|
||||||
from typing import Dict
|
from typing import Dict
|
||||||
|
|
||||||
from TTS.tts.configs.shared_configs import BaseTTSConfig
|
from TTS.tts.configs.shared_configs import BaseTTSConfig
|
||||||
|
@ -46,11 +46,11 @@ class BarkConfig(BaseTTSConfig):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
model: str = "bark"
|
model: str = "bark"
|
||||||
audio: BarkAudioConfig = BarkAudioConfig()
|
audio: BarkAudioConfig = field(default_factory=BarkAudioConfig)
|
||||||
num_chars: int = 0
|
num_chars: int = 0
|
||||||
semantic_config: GPTConfig = GPTConfig()
|
semantic_config: GPTConfig = field(default_factory=GPTConfig)
|
||||||
fine_config: FineGPTConfig = FineGPTConfig()
|
fine_config: FineGPTConfig = field(default_factory=FineGPTConfig)
|
||||||
coarse_config: GPTConfig = GPTConfig()
|
coarse_config: GPTConfig = field(default_factory=GPTConfig)
|
||||||
CONTEXT_WINDOW_SIZE: int = 1024
|
CONTEXT_WINDOW_SIZE: int = 1024
|
||||||
SEMANTIC_RATE_HZ: float = 49.9
|
SEMANTIC_RATE_HZ: float = 49.9
|
||||||
SEMANTIC_VOCAB_SIZE: int = 10_000
|
SEMANTIC_VOCAB_SIZE: int = 10_000
|
||||||
|
|
|
@ -113,7 +113,7 @@ class FastPitchConfig(BaseTTSConfig):
|
||||||
base_model: str = "forward_tts"
|
base_model: str = "forward_tts"
|
||||||
|
|
||||||
# model specific params
|
# model specific params
|
||||||
model_args: ForwardTTSArgs = ForwardTTSArgs()
|
model_args: ForwardTTSArgs = field(default_factory=ForwardTTSArgs)
|
||||||
|
|
||||||
# multi-speaker settings
|
# multi-speaker settings
|
||||||
num_speakers: int = 0
|
num_speakers: int = 0
|
||||||
|
|
|
@ -107,7 +107,7 @@ class FastSpeechConfig(BaseTTSConfig):
|
||||||
base_model: str = "forward_tts"
|
base_model: str = "forward_tts"
|
||||||
|
|
||||||
# model specific params
|
# model specific params
|
||||||
model_args: ForwardTTSArgs = ForwardTTSArgs(use_pitch=False)
|
model_args: ForwardTTSArgs = field(default_factory=lambda: ForwardTTSArgs(use_pitch=False))
|
||||||
|
|
||||||
# multi-speaker settings
|
# multi-speaker settings
|
||||||
num_speakers: int = 0
|
num_speakers: int = 0
|
||||||
|
|
|
@ -123,7 +123,7 @@ class Fastspeech2Config(BaseTTSConfig):
|
||||||
base_model: str = "forward_tts"
|
base_model: str = "forward_tts"
|
||||||
|
|
||||||
# model specific params
|
# model specific params
|
||||||
model_args: ForwardTTSArgs = ForwardTTSArgs(use_pitch=True, use_energy=True)
|
model_args: ForwardTTSArgs = field(default_factory=lambda: ForwardTTSArgs(use_pitch=True, use_energy=True))
|
||||||
|
|
||||||
# multi-speaker settings
|
# multi-speaker settings
|
||||||
num_speakers: int = 0
|
num_speakers: int = 0
|
||||||
|
|
|
@ -103,26 +103,28 @@ class SpeedySpeechConfig(BaseTTSConfig):
|
||||||
base_model: str = "forward_tts"
|
base_model: str = "forward_tts"
|
||||||
|
|
||||||
# set model args as SpeedySpeech
|
# set model args as SpeedySpeech
|
||||||
model_args: ForwardTTSArgs = ForwardTTSArgs(
|
model_args: ForwardTTSArgs = field(
|
||||||
use_pitch=False,
|
default_factory=lambda: ForwardTTSArgs(
|
||||||
encoder_type="residual_conv_bn",
|
use_pitch=False,
|
||||||
encoder_params={
|
encoder_type="residual_conv_bn",
|
||||||
"kernel_size": 4,
|
encoder_params={
|
||||||
"dilations": 4 * [1, 2, 4] + [1],
|
"kernel_size": 4,
|
||||||
"num_conv_blocks": 2,
|
"dilations": 4 * [1, 2, 4] + [1],
|
||||||
"num_res_blocks": 13,
|
"num_conv_blocks": 2,
|
||||||
},
|
"num_res_blocks": 13,
|
||||||
decoder_type="residual_conv_bn",
|
},
|
||||||
decoder_params={
|
decoder_type="residual_conv_bn",
|
||||||
"kernel_size": 4,
|
decoder_params={
|
||||||
"dilations": 4 * [1, 2, 4, 8] + [1],
|
"kernel_size": 4,
|
||||||
"num_conv_blocks": 2,
|
"dilations": 4 * [1, 2, 4, 8] + [1],
|
||||||
"num_res_blocks": 17,
|
"num_conv_blocks": 2,
|
||||||
},
|
"num_res_blocks": 17,
|
||||||
out_channels=80,
|
},
|
||||||
hidden_channels=128,
|
out_channels=80,
|
||||||
positional_encoding=True,
|
hidden_channels=128,
|
||||||
detach_duration_predictor=True,
|
positional_encoding=True,
|
||||||
|
detach_duration_predictor=True,
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
# multi-speaker settings
|
# multi-speaker settings
|
||||||
|
|
|
@ -70,7 +70,7 @@ class TortoiseConfig(BaseTTSConfig):
|
||||||
model: str = "tortoise"
|
model: str = "tortoise"
|
||||||
# model specific params
|
# model specific params
|
||||||
model_args: TortoiseArgs = field(default_factory=TortoiseArgs)
|
model_args: TortoiseArgs = field(default_factory=TortoiseArgs)
|
||||||
audio: TortoiseAudioConfig = TortoiseAudioConfig()
|
audio: TortoiseAudioConfig = field(default_factory=TortoiseAudioConfig)
|
||||||
model_dir: str = None
|
model_dir: str = None
|
||||||
|
|
||||||
# settings
|
# settings
|
||||||
|
|
|
@ -10,15 +10,11 @@ License: MIT
|
||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import fairseq
|
|
||||||
import torch
|
import torch
|
||||||
from einops import pack, unpack
|
from einops import pack, unpack
|
||||||
from torch import nn
|
from torch import nn
|
||||||
from torchaudio.functional import resample
|
from torchaudio.functional import resample
|
||||||
|
from transformers import HubertModel
|
||||||
logging.root.setLevel(logging.ERROR)
|
|
||||||
|
|
||||||
|
|
||||||
def round_down_nearest_multiple(num, divisor):
|
def round_down_nearest_multiple(num, divisor):
|
||||||
return num // divisor * divisor
|
return num // divisor * divisor
|
||||||
|
|
||||||
|
@ -49,22 +45,11 @@ class CustomHubert(nn.Module):
|
||||||
self.target_sample_hz = target_sample_hz
|
self.target_sample_hz = target_sample_hz
|
||||||
self.seq_len_multiple_of = seq_len_multiple_of
|
self.seq_len_multiple_of = seq_len_multiple_of
|
||||||
self.output_layer = output_layer
|
self.output_layer = output_layer
|
||||||
|
|
||||||
if device is not None:
|
if device is not None:
|
||||||
self.to(device)
|
self.to(device)
|
||||||
|
self.model = HubertModel.from_pretrained("facebook/hubert-base-ls960")
|
||||||
model_path = Path(checkpoint_path)
|
|
||||||
|
|
||||||
assert model_path.exists(), f"path {checkpoint_path} does not exist"
|
|
||||||
|
|
||||||
checkpoint = torch.load(checkpoint_path)
|
|
||||||
load_model_input = {checkpoint_path: checkpoint}
|
|
||||||
model, *_ = fairseq.checkpoint_utils.load_model_ensemble_and_task(load_model_input)
|
|
||||||
|
|
||||||
if device is not None:
|
if device is not None:
|
||||||
model[0].to(device)
|
self.model.to(device)
|
||||||
|
|
||||||
self.model = model[0]
|
|
||||||
self.model.eval()
|
self.model.eval()
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -81,19 +66,13 @@ class CustomHubert(nn.Module):
|
||||||
if exists(self.seq_len_multiple_of):
|
if exists(self.seq_len_multiple_of):
|
||||||
wav_input = curtail_to_multiple(wav_input, self.seq_len_multiple_of)
|
wav_input = curtail_to_multiple(wav_input, self.seq_len_multiple_of)
|
||||||
|
|
||||||
embed = self.model(
|
outputs = self.model.forward(
|
||||||
wav_input,
|
wav_input,
|
||||||
features_only=True,
|
output_hidden_states=True,
|
||||||
mask=False, # thanks to @maitycyrus for noticing that mask is defaulted to True in the fairseq code
|
|
||||||
output_layer=self.output_layer,
|
|
||||||
)
|
)
|
||||||
|
embed = outputs["hidden_states"][self.output_layer]
|
||||||
embed, packed_shape = pack([embed["x"]], "* d")
|
embed, packed_shape = pack([embed], "* d")
|
||||||
|
codebook_indices = torch.from_numpy(embed.cpu().detach().numpy()).to(device)
|
||||||
# codebook_indices = self.kmeans.predict(embed.cpu().detach().numpy())
|
|
||||||
|
|
||||||
codebook_indices = torch.from_numpy(embed.cpu().detach().numpy()).to(device) # .long()
|
|
||||||
|
|
||||||
if flatten:
|
if flatten:
|
||||||
return codebook_indices
|
return codebook_indices
|
||||||
|
|
||||||
|
|
|
@ -130,7 +130,7 @@ def generate_voice(
|
||||||
# generate semantic tokens
|
# generate semantic tokens
|
||||||
# Load the HuBERT model
|
# Load the HuBERT model
|
||||||
hubert_manager = HubertManager()
|
hubert_manager = HubertManager()
|
||||||
hubert_manager.make_sure_hubert_installed(model_path=model.config.LOCAL_MODEL_PATHS["hubert"])
|
# hubert_manager.make_sure_hubert_installed(model_path=model.config.LOCAL_MODEL_PATHS["hubert"])
|
||||||
hubert_manager.make_sure_tokenizer_installed(model_path=model.config.LOCAL_MODEL_PATHS["hubert_tokenizer"])
|
hubert_manager.make_sure_tokenizer_installed(model_path=model.config.LOCAL_MODEL_PATHS["hubert_tokenizer"])
|
||||||
|
|
||||||
hubert_model = CustomHubert(checkpoint_path=model.config.LOCAL_MODEL_PATHS["hubert"]).to(model.device)
|
hubert_model = CustomHubert(checkpoint_path=model.config.LOCAL_MODEL_PATHS["hubert"]).to(model.device)
|
||||||
|
|
|
@ -165,7 +165,7 @@ class BCELossMasked(nn.Module):
|
||||||
|
|
||||||
def __init__(self, pos_weight: float = None):
|
def __init__(self, pos_weight: float = None):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.pos_weight = nn.Parameter(torch.tensor([pos_weight]), requires_grad=False)
|
self.register_buffer("pos_weight", torch.tensor([pos_weight]))
|
||||||
|
|
||||||
def forward(self, x, target, length):
|
def forward(self, x, target, length):
|
||||||
"""
|
"""
|
||||||
|
@ -191,10 +191,15 @@ class BCELossMasked(nn.Module):
|
||||||
mask = sequence_mask(sequence_length=length, max_len=target.size(1))
|
mask = sequence_mask(sequence_length=length, max_len=target.size(1))
|
||||||
num_items = mask.sum()
|
num_items = mask.sum()
|
||||||
loss = functional.binary_cross_entropy_with_logits(
|
loss = functional.binary_cross_entropy_with_logits(
|
||||||
x.masked_select(mask), target.masked_select(mask), pos_weight=self.pos_weight, reduction="sum"
|
x.masked_select(mask),
|
||||||
|
target.masked_select(mask),
|
||||||
|
pos_weight=self.pos_weight.to(x.device),
|
||||||
|
reduction="sum",
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
loss = functional.binary_cross_entropy_with_logits(x, target, pos_weight=self.pos_weight, reduction="sum")
|
loss = functional.binary_cross_entropy_with_logits(
|
||||||
|
x, target, pos_weight=self.pos_weight.to(x.device), reduction="sum"
|
||||||
|
)
|
||||||
num_items = torch.numel(x)
|
num_items = torch.numel(x)
|
||||||
loss = loss / num_items
|
loss = loss / num_items
|
||||||
return loss
|
return loss
|
||||||
|
|
|
@ -207,7 +207,7 @@ def maximum_path_numpy(value, mask, max_neg_val=None):
|
||||||
device = value.device
|
device = value.device
|
||||||
dtype = value.dtype
|
dtype = value.dtype
|
||||||
value = value.cpu().detach().numpy()
|
value = value.cpu().detach().numpy()
|
||||||
mask = mask.cpu().detach().numpy().astype(np.bool)
|
mask = mask.cpu().detach().numpy().astype(bool)
|
||||||
|
|
||||||
b, t_x, t_y = value.shape
|
b, t_x, t_y = value.shape
|
||||||
direction = np.zeros(value.shape, dtype=np.int64)
|
direction = np.zeros(value.shape, dtype=np.int64)
|
||||||
|
|
|
@ -540,7 +540,10 @@ class AudioProcessor(object):
|
||||||
|
|
||||||
def _griffin_lim(self, S):
|
def _griffin_lim(self, S):
|
||||||
angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
|
angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
|
||||||
S_complex = np.abs(S).astype(np.complex)
|
try:
|
||||||
|
S_complex = np.abs(S).astype(np.complex)
|
||||||
|
except AttributeError: # np.complex is deprecated since numpy 1.20.0
|
||||||
|
S_complex = np.abs(S).astype(complex)
|
||||||
y = self._istft(S_complex * angles)
|
y = self._istft(S_complex * angles)
|
||||||
if not np.isfinite(y).all():
|
if not np.isfinite(y).all():
|
||||||
print(" [!] Waveform is not finite everywhere. Skipping the GL.")
|
print(" [!] Waveform is not finite everywhere. Skipping the GL.")
|
||||||
|
|
|
@ -264,14 +264,17 @@ class ModelManager(object):
|
||||||
model_download_uri = os.path.join(URI_PREFIX, f"{lang}.tar.gz")
|
model_download_uri = os.path.join(URI_PREFIX, f"{lang}.tar.gz")
|
||||||
self._download_tar_file(model_download_uri, output_path, self.progress_bar)
|
self._download_tar_file(model_download_uri, output_path, self.progress_bar)
|
||||||
|
|
||||||
def set_model_url(self, model_item: Dict):
|
@staticmethod
|
||||||
|
def set_model_url(model_item: Dict):
|
||||||
model_item["model_url"] = None
|
model_item["model_url"] = None
|
||||||
if "github_rls_url" in model_item:
|
if "github_rls_url" in model_item:
|
||||||
model_item["model_url"] = model_item["github_rls_url"]
|
model_item["model_url"] = model_item["github_rls_url"]
|
||||||
elif "hf_url" in model_item:
|
elif "hf_url" in model_item:
|
||||||
model_item["model_url"] = model_item["hf_url"]
|
model_item["model_url"] = model_item["hf_url"]
|
||||||
|
elif "fairseq" in model_item["model_name"]:
|
||||||
|
model_item["model_url"] = "https://coqui.gateway.scarf.sh/fairseq/"
|
||||||
return model_item
|
return model_item
|
||||||
|
|
||||||
def _set_model_item(self, model_name):
|
def _set_model_item(self, model_name):
|
||||||
# fetch model info from the dict
|
# fetch model info from the dict
|
||||||
model_type, lang, dataset, model = model_name.split("/")
|
model_type, lang, dataset, model = model_name.split("/")
|
||||||
|
@ -285,10 +288,12 @@ class ModelManager(object):
|
||||||
"author": "fairseq",
|
"author": "fairseq",
|
||||||
"description": "this model is released by Meta under Fairseq repo. Visit https://github.com/facebookresearch/fairseq/tree/main/examples/mms for more info.",
|
"description": "this model is released by Meta under Fairseq repo. Visit https://github.com/facebookresearch/fairseq/tree/main/examples/mms for more info.",
|
||||||
}
|
}
|
||||||
|
model_item["model_name"] = model_name
|
||||||
else:
|
else:
|
||||||
# get model from models.json
|
# get model from models.json
|
||||||
model_item = self.models_dict[model_type][lang][dataset][model]
|
model_item = self.models_dict[model_type][lang][dataset][model]
|
||||||
model_item["model_type"] = model_type
|
model_item["model_type"] = model_type
|
||||||
|
model_item = self.set_model_url(model_item)
|
||||||
return model_item, model_full_name, model
|
return model_item, model_full_name, model
|
||||||
|
|
||||||
def download_model(self, model_name):
|
def download_model(self, model_name):
|
||||||
|
@ -324,7 +329,9 @@ class ModelManager(object):
|
||||||
# find downloaded files
|
# find downloaded files
|
||||||
output_model_path = output_path
|
output_model_path = output_path
|
||||||
output_config_path = None
|
output_config_path = None
|
||||||
if model not in ["tortoise-v2", "bark"] and "fairseq" not in model_name: # TODO:This is stupid but don't care for now.
|
if (
|
||||||
|
model not in ["tortoise-v2", "bark"] and "fairseq" not in model_name
|
||||||
|
): # TODO:This is stupid but don't care for now.
|
||||||
output_model_path, output_config_path = self._find_files(output_path)
|
output_model_path, output_config_path = self._find_files(output_path)
|
||||||
# update paths in the config.json
|
# update paths in the config.json
|
||||||
self._update_paths(output_path, output_config_path)
|
self._update_paths(output_path, output_config_path)
|
||||||
|
|
|
@ -794,8 +794,8 @@ class FreeVCConfig(BaseVCConfig):
|
||||||
|
|
||||||
model: str = "freevc"
|
model: str = "freevc"
|
||||||
# model specific params
|
# model specific params
|
||||||
model_args: FreeVCArgs = FreeVCArgs()
|
model_args: FreeVCArgs = field(default_factory=FreeVCArgs)
|
||||||
audio: FreeVCAudioConfig = FreeVCAudioConfig()
|
audio: FreeVCAudioConfig = field(default_factory=FreeVCAudioConfig)
|
||||||
|
|
||||||
# optimizer
|
# optimizer
|
||||||
# TODO with training support
|
# TODO with training support
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
[build-system]
|
[build-system]
|
||||||
requires = ["setuptools", "wheel", "cython==0.29.28", "numpy==1.21.6", "packaging"]
|
requires = ["setuptools", "wheel", "cython==0.29.30", "numpy==1.22.0", "packaging"]
|
||||||
|
|
||||||
[flake8]
|
[flake8]
|
||||||
max-line-length=120
|
max-line-length=120
|
||||||
|
|
|
@ -1,14 +1,14 @@
|
||||||
# core deps
|
# core deps
|
||||||
numpy==1.21.6;python_version<"3.10"
|
numpy==1.22.0;python_version<="3.10"
|
||||||
numpy;python_version=="3.10"
|
numpy==1.24.3;python_version>"3.10"
|
||||||
cython==0.29.28
|
cython==0.29.30
|
||||||
scipy>=1.4.0
|
scipy>=1.4.0
|
||||||
torch>=1.7
|
torch>=1.7
|
||||||
torchaudio
|
torchaudio
|
||||||
soundfile
|
soundfile
|
||||||
librosa==0.10.0.*
|
librosa==0.10.0.*
|
||||||
numba==0.55.1;python_version<"3.9"
|
numba==0.55.1;python_version<"3.9"
|
||||||
numba==0.56.4;python_version>="3.9"
|
numba==0.57.0;python_version>="3.9"
|
||||||
inflect==5.6.0
|
inflect==5.6.0
|
||||||
tqdm
|
tqdm
|
||||||
anyascii
|
anyascii
|
||||||
|
@ -26,14 +26,14 @@ pandas
|
||||||
# deps for training
|
# deps for training
|
||||||
matplotlib
|
matplotlib
|
||||||
# coqui stack
|
# coqui stack
|
||||||
trainer==0.0.20
|
trainer
|
||||||
# config management
|
# config management
|
||||||
coqpit>=0.0.16
|
coqpit>=0.0.16
|
||||||
# chinese g2p deps
|
# chinese g2p deps
|
||||||
jieba
|
jieba
|
||||||
pypinyin
|
pypinyin
|
||||||
# japanese g2p deps
|
# japanese g2p deps
|
||||||
mecab-python3==1.0.5
|
mecab-python3==1.0.6
|
||||||
unidic-lite==1.0.8
|
unidic-lite==1.0.8
|
||||||
# gruut+supported langs
|
# gruut+supported langs
|
||||||
gruut[de,es,fr]==2.2.3
|
gruut[de,es,fr]==2.2.3
|
||||||
|
@ -51,5 +51,3 @@ einops
|
||||||
transformers
|
transformers
|
||||||
#deps for bark
|
#deps for bark
|
||||||
encodec
|
encodec
|
||||||
#deps for fairseq models
|
|
||||||
fairseq
|
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
[build_py]
|
[build_py]
|
||||||
build-lib=temp_build
|
build_lib=temp_build
|
||||||
|
|
||||||
[bdist_wheel]
|
[bdist_wheel]
|
||||||
bdist-dir=temp_build
|
bdist_dir=temp_build
|
||||||
|
|
||||||
[install_lib]
|
[install_lib]
|
||||||
build-dir=temp_build
|
build_dir=temp_build
|
||||||
|
|
9
setup.py
9
setup.py
|
@ -32,8 +32,8 @@ from Cython.Build import cythonize
|
||||||
from setuptools import Extension, find_packages, setup
|
from setuptools import Extension, find_packages, setup
|
||||||
|
|
||||||
python_version = sys.version.split()[0]
|
python_version = sys.version.split()[0]
|
||||||
if Version(python_version) < Version("3.7") or Version(python_version) >= Version("3.11"):
|
if Version(python_version) < Version("3.9") or Version(python_version) >= Version("3.12"):
|
||||||
raise RuntimeError("TTS requires python >= 3.7 and < 3.11 " "but your Python version is {}".format(sys.version))
|
raise RuntimeError("TTS requires python >= 3.9 and < 3.12 " "but your Python version is {}".format(sys.version))
|
||||||
|
|
||||||
|
|
||||||
cwd = os.path.dirname(os.path.abspath(__file__))
|
cwd = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
@ -114,15 +114,14 @@ setup(
|
||||||
"dev": requirements_dev,
|
"dev": requirements_dev,
|
||||||
"notebooks": requirements_notebooks,
|
"notebooks": requirements_notebooks,
|
||||||
},
|
},
|
||||||
python_requires=">=3.7.0, <3.11",
|
python_requires=">=3.9.0, <3.12",
|
||||||
entry_points={"console_scripts": ["tts=TTS.bin.synthesize:main", "tts-server = TTS.server.server:main"]},
|
entry_points={"console_scripts": ["tts=TTS.bin.synthesize:main", "tts-server = TTS.server.server:main"]},
|
||||||
classifiers=[
|
classifiers=[
|
||||||
"Programming Language :: Python",
|
"Programming Language :: Python",
|
||||||
"Programming Language :: Python :: 3",
|
"Programming Language :: Python :: 3",
|
||||||
"Programming Language :: Python :: 3.7",
|
|
||||||
"Programming Language :: Python :: 3.8",
|
|
||||||
"Programming Language :: Python :: 3.9",
|
"Programming Language :: Python :: 3.9",
|
||||||
"Programming Language :: Python :: 3.10",
|
"Programming Language :: Python :: 3.10",
|
||||||
|
"Programming Language :: Python :: 3.11",
|
||||||
"Development Status :: 3 - Alpha",
|
"Development Status :: 3 - Alpha",
|
||||||
"Intended Audience :: Science/Research",
|
"Intended Audience :: Science/Research",
|
||||||
"Intended Audience :: Developers",
|
"Intended Audience :: Developers",
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
import unittest
|
import unittest
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
from coqpit import Coqpit
|
from coqpit import Coqpit
|
||||||
|
|
||||||
|
@ -86,11 +86,11 @@ class TestTTSTokenizer(unittest.TestCase):
|
||||||
enable_eos_bos_chars: bool = True
|
enable_eos_bos_chars: bool = True
|
||||||
use_phonemes: bool = True
|
use_phonemes: bool = True
|
||||||
add_blank: bool = False
|
add_blank: bool = False
|
||||||
characters: str = Characters()
|
characters: str = field(default_factory=Characters)
|
||||||
phonemizer: str = "espeak"
|
phonemizer: str = "espeak"
|
||||||
phoneme_language: str = "tr"
|
phoneme_language: str = "tr"
|
||||||
text_cleaner: str = "phoneme_cleaners"
|
text_cleaner: str = "phoneme_cleaners"
|
||||||
characters = Characters()
|
characters = field(default_factory=Characters)
|
||||||
|
|
||||||
tokenizer_ph, _ = TTSTokenizer.init_from_config(TokenizerConfig())
|
tokenizer_ph, _ = TTSTokenizer.init_from_config(TokenizerConfig())
|
||||||
tokenizer_ph.phonemizer.backend = "espeak"
|
tokenizer_ph.phonemizer.backend = "espeak"
|
||||||
|
|
|
@ -16,7 +16,7 @@ from TTS.utils.audio import AudioProcessor
|
||||||
|
|
||||||
torch.manual_seed(1)
|
torch.manual_seed(1)
|
||||||
use_cuda = torch.cuda.is_available()
|
use_cuda = torch.cuda.is_available()
|
||||||
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
device = torch.device("cuda" if use_cuda else "cpu")
|
||||||
|
|
||||||
config_global = TacotronConfig(num_chars=32, num_speakers=5, out_channels=513, decoder_output_dim=80)
|
config_global = TacotronConfig(num_chars=32, num_speakers=5, out_channels=513, decoder_output_dim=80)
|
||||||
|
|
||||||
|
@ -288,7 +288,6 @@ class TacotronCapacitronTrainTest(unittest.TestCase):
|
||||||
batch["text_input"].shape[0], batch["stop_targets"].size(1) // config.r, -1
|
batch["text_input"].shape[0], batch["stop_targets"].size(1) // config.r, -1
|
||||||
)
|
)
|
||||||
batch["stop_targets"] = (batch["stop_targets"].sum(2) > 0.0).unsqueeze(2).float().squeeze()
|
batch["stop_targets"] = (batch["stop_targets"].sum(2) > 0.0).unsqueeze(2).float().squeeze()
|
||||||
|
|
||||||
model = Tacotron(config).to(device)
|
model = Tacotron(config).to(device)
|
||||||
criterion = model.get_criterion()
|
criterion = model.get_criterion()
|
||||||
optimizer = model.get_optimizer()
|
optimizer = model.get_optimizer()
|
||||||
|
|
|
@ -15,7 +15,7 @@ def run_models(offset=0, step=1):
|
||||||
print(" > Run synthesizer with all the models.")
|
print(" > Run synthesizer with all the models.")
|
||||||
output_path = os.path.join(get_tests_output_path(), "output.wav")
|
output_path = os.path.join(get_tests_output_path(), "output.wav")
|
||||||
manager = ModelManager(output_prefix=get_tests_output_path(), progress_bar=False)
|
manager = ModelManager(output_prefix=get_tests_output_path(), progress_bar=False)
|
||||||
model_names = manager.list_models()
|
model_names = [name for name in manager.list_models() if "bark" not in name]
|
||||||
for model_name in model_names[offset::step]:
|
for model_name in model_names[offset::step]:
|
||||||
print(f"\n > Run - {model_name}")
|
print(f"\n > Run - {model_name}")
|
||||||
model_path, _, _ = manager.download_model(model_name)
|
model_path, _, _ = manager.download_model(model_name)
|
||||||
|
@ -79,6 +79,15 @@ def test_models_offset_2_step_3():
|
||||||
run_models(offset=2, step=3)
|
run_models(offset=2, step=3)
|
||||||
|
|
||||||
|
|
||||||
|
def test_bark():
|
||||||
|
"""Bark is too big to run on github actions. We need to test it locally"""
|
||||||
|
output_path = os.path.join(get_tests_output_path(), "output.wav")
|
||||||
|
run_cli(
|
||||||
|
f" tts --model_name tts_models/multilingual/multi-dataset/bark "
|
||||||
|
f'--text "This is an example." --out_path "{output_path}" --progress_bar False'
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_voice_conversion():
|
def test_voice_conversion():
|
||||||
print(" > Run voice conversion inference using YourTTS model.")
|
print(" > Run voice conversion inference using YourTTS model.")
|
||||||
model_name = "tts_models/multilingual/multi-dataset/your_tts"
|
model_name = "tts_models/multilingual/multi-dataset/your_tts"
|
||||||
|
|
Loading…
Reference in New Issue