mirror of https://github.com/coqui-ai/TTS.git
Compare commits
9 Commits
Author | SHA1 | Date |
---|---|---|
|
dbf1a08a0d | |
|
5dcc16d193 | |
|
55c7063724 | |
|
99fee6f5ad | |
|
186cafb34c | |
|
3991d83b2c | |
|
0859e9f252 | |
|
9f325b1f6c | |
|
fc099218df |
|
@ -7,9 +7,6 @@
|
|||
- 📣 [🐶Bark](https://github.com/suno-ai/bark) is now available for inference with unconstrained voice cloning. [Docs](https://tts.readthedocs.io/en/dev/models/bark.html)
|
||||
- 📣 You can use [~1100 Fairseq models](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) with 🐸TTS.
|
||||
- 📣 🐸TTS now supports 🐢Tortoise with faster inference. [Docs](https://tts.readthedocs.io/en/dev/models/tortoise.html)
|
||||
- 📣 Voice generation with prompts - **Prompt to Voice** - is live on [**Coqui Studio**](https://app.coqui.ai/auth/signin)!! - [Blog Post](https://coqui.ai/blog/tts/prompt-to-voice)
|
||||
- 📣 Voice generation with fusion - **Voice fusion** - is live on [**Coqui Studio**](https://app.coqui.ai/auth/signin).
|
||||
- 📣 Voice cloning is live on [**Coqui Studio**](https://app.coqui.ai/auth/signin).
|
||||
|
||||
<div align="center">
|
||||
<img src="https://static.scarf.sh/a.png?x-pxid=cf317fe7-2188-4721-bc01-124bb5d5dbb2" />
|
||||
|
|
|
@ -125,7 +125,7 @@ def evaluation(model, criterion, data_loader, global_step):
|
|||
|
||||
def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, global_step):
|
||||
model.train()
|
||||
best_loss = float("inf")
|
||||
best_loss = {"train_loss": None, "eval_loss": float("inf")}
|
||||
avg_loader_time = 0
|
||||
end_time = time.time()
|
||||
for epoch in range(c.epochs):
|
||||
|
@ -248,7 +248,7 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader,
|
|||
)
|
||||
# save the best checkpoint
|
||||
best_loss = save_best_model(
|
||||
eval_loss,
|
||||
{"train_loss": None, "eval_loss": eval_loss},
|
||||
best_loss,
|
||||
c,
|
||||
model,
|
||||
|
|
|
@ -13,6 +13,8 @@ from TTS.tts.utils.data import prepare_data, prepare_stop_target, prepare_tensor
|
|||
from TTS.utils.audio import AudioProcessor
|
||||
from TTS.utils.audio.numpy_transforms import compute_energy as calculate_energy
|
||||
|
||||
import mutagen
|
||||
|
||||
# to prevent too many open files error as suggested here
|
||||
# https://github.com/pytorch/pytorch/issues/11201#issuecomment-421146936
|
||||
torch.multiprocessing.set_sharing_strategy("file_system")
|
||||
|
@ -42,6 +44,15 @@ def string2filename(string):
|
|||
return filename
|
||||
|
||||
|
||||
def get_audio_size(audiopath):
|
||||
extension = audiopath.rpartition(".")[-1].lower()
|
||||
if extension not in {"mp3", "wav", "flac"}:
|
||||
raise RuntimeError(f"The audio format {extension} is not supported, please convert the audio files to mp3, flac, or wav format!")
|
||||
|
||||
audio_info = mutagen.File(audiopath).info
|
||||
return int(audio_info.length * audio_info.sample_rate)
|
||||
|
||||
|
||||
class TTSDataset(Dataset):
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -176,7 +187,7 @@ class TTSDataset(Dataset):
|
|||
lens = []
|
||||
for item in self.samples:
|
||||
_, wav_file, *_ = _parse_sample(item)
|
||||
audio_len = os.path.getsize(wav_file) / 16 * 8 # assuming 16bit audio
|
||||
audio_len = get_audio_size(wav_file)
|
||||
lens.append(audio_len)
|
||||
return lens
|
||||
|
||||
|
@ -295,7 +306,7 @@ class TTSDataset(Dataset):
|
|||
def _compute_lengths(samples):
|
||||
new_samples = []
|
||||
for item in samples:
|
||||
audio_length = os.path.getsize(item["audio_file"]) / 16 * 8 # assuming 16bit audio
|
||||
audio_length = get_audio_size(item["audio_file"])
|
||||
text_lenght = len(item["text"])
|
||||
item["audio_length"] = audio_length
|
||||
item["text_length"] = text_lenght
|
||||
|
|
|
@ -756,11 +756,13 @@ class Xtts(BaseTTS):
|
|||
|
||||
model_path = checkpoint_path or os.path.join(checkpoint_dir, "model.pth")
|
||||
vocab_path = vocab_path or os.path.join(checkpoint_dir, "vocab.json")
|
||||
speaker_file_path = speaker_file_path or os.path.join(checkpoint_dir, "speakers_xtts.pth")
|
||||
|
||||
if speaker_file_path is None and checkpoint_dir is not None:
|
||||
speaker_file_path = os.path.join(checkpoint_dir, "speakers_xtts.pth")
|
||||
|
||||
self.language_manager = LanguageManager(config)
|
||||
self.speaker_manager = None
|
||||
if os.path.exists(speaker_file_path):
|
||||
if speaker_file_path is not None and os.path.exists(speaker_file_path):
|
||||
self.speaker_manager = SpeakerManager(speaker_file_path)
|
||||
|
||||
if os.path.exists(vocab_path):
|
||||
|
|
|
@ -36,9 +36,7 @@ def get_git_branch():
|
|||
current.replace("* ", "")
|
||||
except subprocess.CalledProcessError:
|
||||
current = "inside_docker"
|
||||
except FileNotFoundError:
|
||||
current = "unknown"
|
||||
except StopIteration:
|
||||
except (FileNotFoundError, StopIteration) as e:
|
||||
current = "unknown"
|
||||
return current
|
||||
|
||||
|
|
|
@ -17,6 +17,7 @@ pyyaml>=6.0
|
|||
fsspec>=2023.6.0 # <= 2023.9.1 makes aux tests fail
|
||||
aiohttp>=3.8.1
|
||||
packaging>=23.1
|
||||
mutagen==1.47.0
|
||||
# deps for examples
|
||||
flask>=2.0.1
|
||||
# deps for inference
|
||||
|
@ -27,7 +28,7 @@ pandas>=1.4,<2.0
|
|||
# deps for training
|
||||
matplotlib>=3.7.0
|
||||
# coqui stack
|
||||
trainer>=0.0.32
|
||||
trainer>=0.0.36
|
||||
# config management
|
||||
coqpit>=0.0.16
|
||||
# chinese g2p deps
|
||||
|
|
|
@ -0,0 +1,9 @@
|
|||
audio_file|text|transcription|speaker_name
|
||||
wavs/LJ001-0001.flac|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|ljspeech-0
|
||||
wavs/LJ001-0002.flac|in being comparatively modern.|in being comparatively modern.|ljspeech-0
|
||||
wavs/LJ001-0003.flac|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|ljspeech-1
|
||||
wavs/LJ001-0004.flac|produced the block books, which were the immediate predecessors of the true printed book,|produced the block books, which were the immediate predecessors of the true printed book,|ljspeech-1
|
||||
wavs/LJ001-0005.flac|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|ljspeech-2
|
||||
wavs/LJ001-0006.flac|And it is worth mention in passing that, as an example of fine typography,|And it is worth mention in passing that, as an example of fine typography,|ljspeech-2
|
||||
wavs/LJ001-0007.flac|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about 1455,|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about fourteen fifty-five,|ljspeech-3
|
||||
wavs/LJ001-0008.flac|has never been surpassed.|has never been surpassed.|ljspeech-3
|
Can't render this file because it contains an unexpected character in line 8 and column 86.
|
|
@ -0,0 +1,9 @@
|
|||
audio_file|text|transcription|speaker_name
|
||||
wavs/LJ001-0001.mp3|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|ljspeech-0
|
||||
wavs/LJ001-0002.mp3|in being comparatively modern.|in being comparatively modern.|ljspeech-0
|
||||
wavs/LJ001-0003.mp3|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|ljspeech-1
|
||||
wavs/LJ001-0004.mp3|produced the block books, which were the immediate predecessors of the true printed book,|produced the block books, which were the immediate predecessors of the true printed book,|ljspeech-1
|
||||
wavs/LJ001-0005.mp3|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|ljspeech-2
|
||||
wavs/LJ001-0006.mp3|And it is worth mention in passing that, as an example of fine typography,|And it is worth mention in passing that, as an example of fine typography,|ljspeech-2
|
||||
wavs/LJ001-0007.mp3|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about 1455,|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about fourteen fifty-five,|ljspeech-3
|
||||
wavs/LJ001-0008.mp3|has never been surpassed.|has never been surpassed.|ljspeech-3
|
Can't render this file because it contains an unexpected character in line 8 and column 85.
|
|
@ -0,0 +1,9 @@
|
|||
audio_file|text|transcription|speaker_name
|
||||
wavs/LJ001-0001.wav|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|ljspeech-0
|
||||
wavs/LJ001-0002.wav|in being comparatively modern.|in being comparatively modern.|ljspeech-0
|
||||
wavs/LJ001-0003.wav|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|ljspeech-1
|
||||
wavs/LJ001-0004.wav|produced the block books, which were the immediate predecessors of the true printed book,|produced the block books, which were the immediate predecessors of the true printed book,|ljspeech-1
|
||||
wavs/LJ001-0005.wav|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|ljspeech-2
|
||||
wavs/LJ001-0006.wav|And it is worth mention in passing that, as an example of fine typography,|And it is worth mention in passing that, as an example of fine typography,|ljspeech-2
|
||||
wavs/LJ001-0007.wav|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about 1455,|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about fourteen fifty-five,|ljspeech-3
|
||||
wavs/LJ001-0008.wav|has never been surpassed.|has never been surpassed.|ljspeech-3
|
Can't render this file because it contains an unexpected character in line 8 and column 85.
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -21,15 +21,30 @@ os.makedirs(OUTPATH, exist_ok=True)
|
|||
c = BaseTTSConfig(text_cleaner="english_cleaners", num_loader_workers=0, batch_size=2, use_noise_augment=False)
|
||||
c.r = 5
|
||||
c.data_path = os.path.join(get_tests_data_path(), "ljspeech/")
|
||||
ok_ljspeech = os.path.exists(c.data_path)
|
||||
|
||||
dataset_config = BaseDatasetConfig(
|
||||
formatter="ljspeech_test", # ljspeech_test to multi-speaker
|
||||
meta_file_train="metadata.csv",
|
||||
dataset_config_wav = BaseDatasetConfig(
|
||||
formatter="coqui", # ljspeech_test to multi-speaker
|
||||
meta_file_train="metadata_wav.csv",
|
||||
meta_file_val=None,
|
||||
path=c.data_path,
|
||||
language="en",
|
||||
)
|
||||
dataset_config_mp3 = BaseDatasetConfig(
|
||||
formatter="coqui", # ljspeech_test to multi-speaker
|
||||
meta_file_train="metadata_mp3.csv",
|
||||
meta_file_val=None,
|
||||
path=c.data_path,
|
||||
language="en",
|
||||
)
|
||||
dataset_config_flac = BaseDatasetConfig(
|
||||
formatter="coqui", # ljspeech_test to multi-speaker
|
||||
meta_file_train="metadata_flac.csv",
|
||||
meta_file_val=None,
|
||||
path=c.data_path,
|
||||
language="en",
|
||||
)
|
||||
|
||||
dataset_configs = [dataset_config_wav, dataset_config_mp3, dataset_config_flac]
|
||||
|
||||
DATA_EXIST = True
|
||||
if not os.path.exists(c.data_path):
|
||||
|
@ -44,11 +59,10 @@ class TestTTSDataset(unittest.TestCase):
|
|||
self.max_loader_iter = 4
|
||||
self.ap = AudioProcessor(**c.audio)
|
||||
|
||||
def _create_dataloader(self, batch_size, r, bgs, start_by_longest=False):
|
||||
def _create_dataloader(self, batch_size, r, bgs, dataset_config, start_by_longest=False, preprocess_samples=False):
|
||||
# load dataset
|
||||
meta_data_train, meta_data_eval = load_tts_samples(dataset_config, eval_split=True, eval_split_size=0.2)
|
||||
items = meta_data_train + meta_data_eval
|
||||
|
||||
tokenizer, _ = TTSTokenizer.init_from_config(c)
|
||||
dataset = TTSDataset(
|
||||
outputs_per_step=r,
|
||||
|
@ -64,6 +78,11 @@ class TestTTSDataset(unittest.TestCase):
|
|||
max_audio_len=c.max_audio_len,
|
||||
start_by_longest=start_by_longest,
|
||||
)
|
||||
|
||||
# add preprocess to force the length computation
|
||||
if preprocess_samples:
|
||||
dataset.preprocess_samples()
|
||||
|
||||
dataloader = DataLoader(
|
||||
dataset,
|
||||
batch_size=batch_size,
|
||||
|
@ -75,9 +94,8 @@ class TestTTSDataset(unittest.TestCase):
|
|||
return dataloader, dataset
|
||||
|
||||
def test_loader(self):
|
||||
if ok_ljspeech:
|
||||
dataloader, dataset = self._create_dataloader(1, 1, 0)
|
||||
|
||||
for dataset_config in dataset_configs:
|
||||
dataloader, _ = self._create_dataloader(1, 1, 0, dataset_config, preprocess_samples=True)
|
||||
for i, data in enumerate(dataloader):
|
||||
if i == self.max_loader_iter:
|
||||
break
|
||||
|
@ -104,8 +122,6 @@ class TestTTSDataset(unittest.TestCase):
|
|||
|
||||
# make sure that the computed mels and the waveform match and correctly computed
|
||||
mel_new = self.ap.melspectrogram(wavs[0].squeeze().numpy())
|
||||
# remove padding in mel-spectrogram
|
||||
mel_dataloader = mel_input[0].T.numpy()[:, : mel_lengths[0]]
|
||||
# guarantee that both mel-spectrograms have the same size and that we will remove waveform padding
|
||||
mel_new = mel_new[:, : mel_lengths[0]]
|
||||
ignore_seg = -(1 + c.audio.win_length // c.audio.hop_length)
|
||||
|
@ -124,8 +140,7 @@ class TestTTSDataset(unittest.TestCase):
|
|||
self.assertGreaterEqual(mel_input.min(), 0)
|
||||
|
||||
def test_batch_group_shuffle(self):
|
||||
if ok_ljspeech:
|
||||
dataloader, dataset = self._create_dataloader(2, c.r, 16)
|
||||
dataloader, dataset = self._create_dataloader(2, c.r, 16, dataset_config_wav)
|
||||
last_length = 0
|
||||
frames = dataset.samples
|
||||
for i, data in enumerate(dataloader):
|
||||
|
@ -147,8 +162,7 @@ class TestTTSDataset(unittest.TestCase):
|
|||
|
||||
Ther first item of the fist batch must be longer than all the other items.
|
||||
"""
|
||||
if ok_ljspeech:
|
||||
dataloader, _ = self._create_dataloader(2, c.r, 0, True)
|
||||
dataloader, _ = self._create_dataloader(2, c.r, 0, dataset_config_wav, start_by_longest=True)
|
||||
dataloader.dataset.preprocess_samples()
|
||||
for i, data in enumerate(dataloader):
|
||||
if i == self.max_loader_iter:
|
||||
|
@ -172,8 +186,7 @@ class TestTTSDataset(unittest.TestCase):
|
|||
self.assertEqual(mel_lengths[idx], linear_input[idx].shape[0])
|
||||
self.assertEqual(mel_lengths[idx], mel_input[idx].shape[0])
|
||||
|
||||
if ok_ljspeech:
|
||||
dataloader, _ = self._create_dataloader(1, 1, 0)
|
||||
dataloader, _ = self._create_dataloader(1, 1, 0, dataset_config_wav)
|
||||
|
||||
for i, data in enumerate(dataloader):
|
||||
if i == self.max_loader_iter:
|
||||
|
@ -210,7 +223,7 @@ class TestTTSDataset(unittest.TestCase):
|
|||
check_conditions(0, linear_input, mel_input, stop_target, mel_lengths)
|
||||
|
||||
# Test for batch size 2
|
||||
dataloader, _ = self._create_dataloader(2, 1, 0)
|
||||
dataloader, _ = self._create_dataloader(2, 1, 0, dataset_config_wav)
|
||||
|
||||
for i, data in enumerate(dataloader):
|
||||
if i == self.max_loader_iter:
|
||||
|
|
Loading…
Reference in New Issue