Update ljspeech recipes

Eren Gölge 2021-12-07 12:58:41 +00:00
parent 38a0b3b6c7
commit 6d9879bf66
9 changed files with 155 additions and 80 deletions

View File

@@ -1,9 +1,11 @@
 import os

 from TTS.trainer import Trainer, TrainingArgs
-from TTS.tts.configs.align_tts_config import AlignTTSConfig, BaseDatasetConfig
+from TTS.tts.configs.align_tts_config import AlignTTSConfig
+from TTS.tts.configs.shared_configs import BaseDatasetConfig
 from TTS.tts.datasets import load_tts_samples
 from TTS.tts.models.align_tts import AlignTTS
+from TTS.tts.utils.text.tokenizer import TTSTokenizer
 from TTS.utils.audio import AudioProcessor

 output_path = os.path.dirname(os.path.abspath(__file__))
@@ -31,23 +33,32 @@ config = AlignTTSConfig(
     datasets=[dataset_config],
 )

-# init audio processor
-ap = AudioProcessor(**config.audio.to_dict())
+# INITIALIZE THE AUDIO PROCESSOR
+# Audio processor is used for feature extraction and audio I/O.
+# It mainly serves to the dataloader and the training loggers.
+ap = AudioProcessor.init_from_config(config)

-# load training samples
+# INITIALIZE THE TOKENIZER
+# Tokenizer is used to convert text to sequences of token IDs.
+# If characters are not defined in the config, default characters are passed to the config
+tokenizer, config = TTSTokenizer.init_from_config(config)
+
+# LOAD DATA SAMPLES
+# Each sample is a list of ```[text, audio_file_path, speaker_name]```
+# You can define your custom sample loader returning the list of samples.
+# Or define your custom formatter and pass it to the `load_tts_samples`.
+# Check `TTS.tts.datasets.load_tts_samples` for more details.
 train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)

 # init model
-model = AlignTTS(config)
+model = AlignTTS(config, ap, tokenizer)

-# init the trainer and 🚀
+# INITIALIZE THE TRAINER
+# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training,
+# distributed training, etc.
 trainer = Trainer(
-    TrainingArgs(),
-    config,
-    output_path,
-    model=model,
-    train_samples=train_samples,
-    eval_samples=eval_samples,
-    training_assets={"audio_processor": ap},
+    TrainingArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
 )
+
+# AND... 3,2,1... 🚀
 trainer.fit()
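
Note: the pattern above repeats across every recipe touched by this commit — the model now owns its AudioProcessor and TTSTokenizer instead of receiving them through the Trainer's `training_assets`. A minimal sketch of the shared skeleton after the change (all names taken from the diff above; an illustration, not a verbatim copy of the file):

    ap = AudioProcessor.init_from_config(config)  # feature extraction and audio I/O
    # init_from_config returns the tokenizer AND a config possibly updated with default characters
    tokenizer, config = TTSTokenizer.init_from_config(config)
    train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
    model = AlignTTS(config, ap, tokenizer)  # the model owns its processor and tokenizer
    trainer = Trainer(
        TrainingArgs(), config, output_path, model=model,
        train_samples=train_samples, eval_samples=eval_samples,
    )
    trainer.fit()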

View File

@@ -5,6 +5,7 @@ from TTS.trainer import Trainer, TrainingArgs
 from TTS.tts.configs.fast_pitch_config import FastPitchConfig
 from TTS.tts.datasets import load_tts_samples
 from TTS.tts.models.forward_tts import ForwardTTS
+from TTS.tts.utils.text.tokenizer import TTSTokenizer
 from TTS.utils.audio import AudioProcessor
 from TTS.utils.manage import ModelManager

@@ -46,9 +47,9 @@ config = FastPitchConfig(
     epochs=1000,
     text_cleaner="english_cleaners",
     use_phonemes=True,
-    use_espeak_phonemes=False,
     phoneme_language="en-us",
     phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
+    precompute_num_workers=4,
     print_step=50,
     print_eval=False,
     mixed_precision=False,
@@ -67,23 +68,28 @@ if not config.model_args.use_aligner:
         f"python TTS/bin/compute_attention_masks.py --model_path {model_path} --config_path {config_path} --dataset ljspeech --dataset_metafile metadata.csv --data_path ./recipes/ljspeech/LJSpeech-1.1/ --use_cuda true"
     )

-# init audio processor
-ap = AudioProcessor(**config.audio)
+# INITIALIZE THE AUDIO PROCESSOR
+# Audio processor is used for feature extraction and audio I/O.
+# It mainly serves to the dataloader and the training loggers.
+ap = AudioProcessor.init_from_config(config)

-# load training samples
+# INITIALIZE THE TOKENIZER
+# Tokenizer is used to convert text to sequences of token IDs.
+# If characters are not defined in the config, default characters are passed to the config
+tokenizer, config = TTSTokenizer.init_from_config(config)
+
+# LOAD DATA SAMPLES
+# Each sample is a list of ```[text, audio_file_path, speaker_name]```
+# You can define your custom sample loader returning the list of samples.
+# Or define your custom formatter and pass it to the `load_tts_samples`.
+# Check `TTS.tts.datasets.load_tts_samples` for more details.
 train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)

 # init the model
-model = ForwardTTS(config)
+model = ForwardTTS(config, ap, tokenizer, speaker_manager=None)

 # init the trainer and 🚀
 trainer = Trainer(
-    TrainingArgs(),
-    config,
-    output_path,
-    model=model,
-    train_samples=train_samples,
-    eval_samples=eval_samples,
-    training_assets={"audio_processor": ap},
+    TrainingArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
 )
 trainer.fit()
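
Note: two config changes ride along here. `use_espeak_phonemes` is dropped from the recipe, and the new `precompute_num_workers=4` presumably tells the dataloader to fill `phoneme_cache_path` with several worker processes before training starts instead of lazily during the first epoch. A sketch of the relevant fields — that interaction is an assumption worth verifying against the dataset code:

    config = FastPitchConfig(
        use_phonemes=True,
        phoneme_language="en-us",
        # phonemes land in this cache ...
        phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
        # ... assumed to be pre-computed up front by 4 workers when this is set
        precompute_num_workers=4,
    )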

View File

@@ -5,6 +5,7 @@ from TTS.trainer import Trainer, TrainingArgs
 from TTS.tts.configs.fast_speech_config import FastSpeechConfig
 from TTS.tts.datasets import load_tts_samples
 from TTS.tts.models.forward_tts import ForwardTTS
+from TTS.tts.utils.text.tokenizer import TTSTokenizer
 from TTS.utils.audio import AudioProcessor
 from TTS.utils.manage import ModelManager

@@ -45,9 +46,9 @@ config = FastSpeechConfig(
     epochs=1000,
     text_cleaner="english_cleaners",
     use_phonemes=True,
-    use_espeak_phonemes=False,
     phoneme_language="en-us",
     phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
+    precompute_num_workers=8,
     print_step=50,
     print_eval=False,
     mixed_precision=False,
@@ -66,23 +67,28 @@ if not config.model_args.use_aligner:
         f"python TTS/bin/compute_attention_masks.py --model_path {model_path} --config_path {config_path} --dataset ljspeech --dataset_metafile metadata.csv --data_path ./recipes/ljspeech/LJSpeech-1.1/ --use_cuda true"
     )

-# init audio processor
-ap = AudioProcessor(**config.audio)
+# INITIALIZE THE AUDIO PROCESSOR
+# Audio processor is used for feature extraction and audio I/O.
+# It mainly serves to the dataloader and the training loggers.
+ap = AudioProcessor.init_from_config(config)

-# load training samples
+# INITIALIZE THE TOKENIZER
+# Tokenizer is used to convert text to sequences of token IDs.
+# If characters are not defined in the config, default characters are passed to the config
+tokenizer, config = TTSTokenizer.init_from_config(config)
+
+# LOAD DATA SAMPLES
+# Each sample is a list of ```[text, audio_file_path, speaker_name]```
+# You can define your custom sample loader returning the list of samples.
+# Or define your custom formatter and pass it to the `load_tts_samples`.
+# Check `TTS.tts.datasets.load_tts_samples` for more details.
 train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)

 # init the model
-model = ForwardTTS(config)
+model = ForwardTTS(config, ap, tokenizer)

 # init the trainer and 🚀
 trainer = Trainer(
-    TrainingArgs(),
-    config,
-    output_path,
-    model=model,
-    train_samples=train_samples,
-    eval_samples=eval_samples,
-    training_assets={"audio_processor": ap},
+    TrainingArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
 )
 trainer.fit()

View File

@@ -52,7 +52,8 @@ ap = AudioProcessor.init_from_config(config)

 # INITIALIZE THE TOKENIZER
 # Tokenizer is used to convert text to sequences of token IDs.
-tokenizer = TTSTokenizer.init_from_config(config)
+# If characters are not defined in the config, default characters are passed to the config
+tokenizer, config = TTSTokenizer.init_from_config(config)

 # LOAD DATA SAMPLES
 # Each sample is a list of ```[text, audio_file_path, speaker_name]```
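
Note: this two-line hunk is a bug fix, not just a comment change. `TTSTokenizer.init_from_config` returns a `(tokenizer, config)` tuple — the config may come back with default characters injected — so the old single-name assignment silently bound the whole tuple to `tokenizer`:

    # before: "tokenizer" is bound to the whole (tokenizer, config) tuple, breaking later calls
    tokenizer = TTSTokenizer.init_from_config(config)

    # after: unpack both values and keep the possibly-updated config
    tokenizer, config = TTSTokenizer.init_from_config(config)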

View File

@@ -5,6 +5,7 @@ from TTS.trainer import Trainer, TrainingArgs
 from TTS.tts.configs.speedy_speech_config import SpeedySpeechConfig
 from TTS.tts.datasets import load_tts_samples
 from TTS.tts.models.forward_tts import ForwardTTS
+from TTS.tts.utils.text.tokenizer import TTSTokenizer
 from TTS.utils.audio import AudioProcessor

 output_path = os.path.dirname(os.path.abspath(__file__))
@@ -38,9 +39,9 @@ config = SpeedySpeechConfig(
     epochs=1000,
     text_cleaner="english_cleaners",
     use_phonemes=True,
-    use_espeak_phonemes=False,
     phoneme_language="en-us",
     phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
+    precompute_num_workers=4,
     print_step=50,
     print_eval=False,
     mixed_precision=False,
@@ -50,14 +51,22 @@ config = SpeedySpeechConfig(
     datasets=[dataset_config],
 )

-# # compute alignments
-# if not config.model_args.use_aligner:
-#     manager = ModelManager()
-#     model_path, config_path, _ = manager.download_model("tts_models/en/ljspeech/tacotron2-DCA")
-#     # TODO: make compute_attention python callable
-#     os.system(
-#         f"python TTS/bin/compute_attention_masks.py --model_path {model_path} --config_path {config_path} --dataset ljspeech --dataset_metafile metadata.csv --data_path ./recipes/ljspeech/LJSpeech-1.1/ --use_cuda true"
-#     )
+# INITIALIZE THE AUDIO PROCESSOR
+# Audio processor is used for feature extraction and audio I/O.
+# It mainly serves to the dataloader and the training loggers.
+ap = AudioProcessor.init_from_config(config)
+
+# INITIALIZE THE TOKENIZER
+# Tokenizer is used to convert text to sequences of token IDs.
+# If characters are not defined in the config, default characters are passed to the config
+tokenizer, config = TTSTokenizer.init_from_config(config)
+
+# LOAD DATA SAMPLES
+# Each sample is a list of ```[text, audio_file_path, speaker_name]```
+# You can define your custom sample loader returning the list of samples.
+# Or define your custom formatter and pass it to the `load_tts_samples`.
+# Check `TTS.tts.datasets.load_tts_samples` for more details.
+train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)

 # init audio processor
 ap = AudioProcessor(**config.audio.to_dict())
@@ -66,16 +75,14 @@ ap = AudioProcessor(**config.audio.to_dict())
 train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)

 # init model
-model = ForwardTTS(config)
+model = ForwardTTS(config, ap, tokenizer)

-# init the trainer and 🚀
+# INITIALIZE THE TRAINER
+# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training,
+# distributed training, etc.
 trainer = Trainer(
-    TrainingArgs(),
-    config,
-    output_path,
-    model=model,
-    train_samples=train_samples,
-    eval_samples=eval_samples,
-    training_assets={"audio_processor": ap},
+    TrainingArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
 )
+
+# AND... 3,2,1... 🚀
 trainer.fit()
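
Note: the new comments invite a custom formatter for `load_tts_samples`. A hedged sketch of what one could look like for an LJSpeech-style `metadata.csv`; `my_formatter` is a hypothetical name, and the `(root_path, meta_file)` signature plus the `formatter=` keyword are assumptions modeled on the built-in formatters:

    import os

    def my_formatter(root_path, meta_file, **kwargs):
        # Parse "wav_id|transcript" lines into [text, audio_file_path, speaker_name] samples.
        samples = []
        with open(os.path.join(root_path, meta_file), encoding="utf-8") as f:
            for line in f:
                wav_id, text = line.strip().split("|", 1)
                samples.append([text, os.path.join(root_path, "wavs", wav_id + ".wav"), "ljspeech"])
        return samples

    train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, formatter=my_formatter)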

View File

@@ -6,6 +6,7 @@ from TTS.tts.configs.shared_configs import BaseDatasetConfig
 from TTS.tts.configs.tacotron2_config import Tacotron2Config
 from TTS.tts.datasets import load_tts_samples
 from TTS.tts.models.tacotron2 import Tacotron2
+from TTS.tts.utils.text.tokenizer import TTSTokenizer
 from TTS.utils.audio import AudioProcessor

 # from TTS.tts.datasets.tokenizer import Tokenizer
@@ -60,23 +61,35 @@ config = Tacotron2Config( # This is the config that is saved for the future use
     datasets=[dataset_config],
 )

-# init audio processor
-ap = AudioProcessor(**config.audio.to_dict())
+# INITIALIZE THE AUDIO PROCESSOR
+# Audio processor is used for feature extraction and audio I/O.
+# It mainly serves to the dataloader and the training loggers.
+ap = AudioProcessor.init_from_config(config)

-# load training samples
+# INITIALIZE THE TOKENIZER
+# Tokenizer is used to convert text to sequences of token IDs.
+# If characters are not defined in the config, default characters are passed to the config
+tokenizer, config = TTSTokenizer.init_from_config(config)
+
+# LOAD DATA SAMPLES
+# Each sample is a list of ```[text, audio_file_path, speaker_name]```
+# You can define your custom sample loader returning the list of samples.
+# Or define your custom formatter and pass it to the `load_tts_samples`.
+# Check `TTS.tts.datasets.load_tts_samples` for more details.
 train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)

-# init model
-model = Tacotron2(config)
+# INITIALIZE THE MODEL
+# Models take a config object and a speaker manager as input
+# Config defines the details of the model like the number of layers, the size of the embedding, etc.
+# Speaker manager is used by multi-speaker models.
+model = Tacotron2(config, ap, tokenizer)

-# init the trainer and 🚀
+# INITIALIZE THE TRAINER
+# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training,
+# distributed training, etc.
 trainer = Trainer(
-    TrainingArgs(),
-    config,
-    output_path,
-    model=model,
-    train_samples=train_samples,
-    eval_samples=eval_samples,
-    training_assets={"audio_processor": ap},
+    TrainingArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
 )
+
+# AND... 3,2,1... 🚀
 trainer.fit()
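
Note: per the new comment, models also accept a speaker manager; the single-speaker LJSpeech recipes simply omit it (or pass `speaker_manager=None`, as the next Tacotron2 recipe does). Contrast with the multi-speaker wiring in the VITS recipe at the end of this commit — condensed here as a sketch, with the bare `SpeakerManager()` construction assumed from elsewhere in the codebase:

    # single speaker (this file)
    model = Tacotron2(config, ap, tokenizer)

    # multi speaker (pattern from the last file in this commit)
    speaker_manager = SpeakerManager()
    speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples)
    model = Tacotron2(config, ap, tokenizer, speaker_manager=speaker_manager)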

View File

@@ -6,6 +6,7 @@ from TTS.tts.configs.shared_configs import BaseDatasetConfig
 from TTS.tts.configs.tacotron2_config import Tacotron2Config
 from TTS.tts.datasets import load_tts_samples
 from TTS.tts.models.tacotron2 import Tacotron2
+from TTS.tts.utils.text.tokenizer import TTSTokenizer
 from TTS.utils.audio import AudioProcessor

 # from TTS.tts.datasets.tokenizer import Tokenizer
@@ -46,6 +47,7 @@ config = Tacotron2Config( # This is the config that is saved for the future use
     use_phonemes=True,
     phoneme_language="en-us",
     phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
+    precompute_num_workers=8,
     print_step=25,
     print_eval=True,
     mixed_precision=False,
@@ -56,11 +58,28 @@ config = Tacotron2Config( # This is the config that is saved for the future use
 # init audio processor
 ap = AudioProcessor(**config.audio.to_dict())

-# load training samples
+# INITIALIZE THE AUDIO PROCESSOR
+# Audio processor is used for feature extraction and audio I/O.
+# It mainly serves to the dataloader and the training loggers.
+ap = AudioProcessor.init_from_config(config)
+
+# INITIALIZE THE TOKENIZER
+# Tokenizer is used to convert text to sequences of token IDs.
+# If characters are not defined in the config, default characters are passed to the config
+tokenizer, config = TTSTokenizer.init_from_config(config)
+
+# LOAD DATA SAMPLES
+# Each sample is a list of ```[text, audio_file_path, speaker_name]```
+# You can define your custom sample loader returning the list of samples.
+# Or define your custom formatter and pass it to the `load_tts_samples`.
+# Check `TTS.tts.datasets.load_tts_samples` for more details.
 train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)

-# init model
-model = Tacotron2(config)
+# INITIALIZE THE MODEL
+# Models take a config object and a speaker manager as input
+# Config defines the details of the model like the number of layers, the size of the embedding, etc.
+# Speaker manager is used by multi-speaker models.
+model = Tacotron2(config, ap, tokenizer, speaker_manager=None)

 # init the trainer and 🚀
 trainer = Trainer(

View File

@@ -33,7 +33,7 @@ audio_config = BaseAudioConfig(
 config = VitsConfig(
     audio=audio_config,
     run_name="vits_ljspeech",
-    batch_size=48,
+    batch_size=16,
     eval_batch_size=16,
     batch_group_size=5,
     num_loader_workers=0,
@@ -48,7 +48,7 @@ config = VitsConfig(
     compute_input_seq_cache=True,
     print_step=25,
     print_eval=True,
-    mixed_precision=True,
+    mixed_precision=False,
     max_seq_len=500000,
     output_path=output_path,
     datasets=[dataset_config],
@@ -61,7 +61,8 @@ ap = AudioProcessor.init_from_config(config)

 # INITIALIZE THE TOKENIZER
 # Tokenizer is used to convert text to sequences of token IDs.
-tokenizer = TTSTokenizer.init_from_config(config)
+# config is updated with the default characters if not defined in the config.
+tokenizer, config = TTSTokenizer.init_from_config(config)

 # LOAD DATA SAMPLES
 # Each sample is a list of ```[text, audio_file_path, speaker_name]```
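
Note: this recipe also dials two defaults down — `batch_size` 48 → 16 and `mixed_precision` True → False — trading speed for out-of-the-box memory headroom and numerical stability. Configs are attribute-assignable, so restoring the old values on a larger GPU is a one-liner each (a sketch, not part of the diff):

    config.batch_size = 48         # if GPU memory allows
    config.mixed_precision = True  # faster, but can be less stable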

View File

@@ -7,6 +7,7 @@ from TTS.tts.configs.vits_config import VitsConfig
 from TTS.tts.datasets import load_tts_samples
 from TTS.tts.models.vits import Vits, VitsArgs
 from TTS.tts.utils.speakers import SpeakerManager
+from TTS.tts.utils.text.tokenizer import TTSTokenizer
 from TTS.utils.audio import AudioProcessor

 output_path = os.path.dirname(os.path.abspath(__file__))
@@ -63,10 +64,21 @@ config = VitsConfig(
     datasets=[dataset_config],
 )

-# init audio processor
-ap = AudioProcessor(**config.audio.to_dict())
+# INITIALIZE THE AUDIO PROCESSOR
+# Audio processor is used for feature extraction and audio I/O.
+# It mainly serves to the dataloader and the training loggers.
+ap = AudioProcessor.init_from_config(config)

-# load training samples
+# INITIALIZE THE TOKENIZER
+# Tokenizer is used to convert text to sequences of token IDs.
+# config is updated with the default characters if not defined in the config.
+tokenizer, config = TTSTokenizer.init_from_config(config)
+
+# LOAD DATA SAMPLES
+# Each sample is a list of ```[text, audio_file_path, speaker_name]```
+# You can define your custom sample loader returning the list of samples.
+# Or define your custom formatter and pass it to the `load_tts_samples`.
+# Check `TTS.tts.datasets.load_tts_samples` for more details.
 train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)

 # init speaker manager for multi-speaker training
@@ -76,7 +88,7 @@ speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples)
 config.model_args.num_speakers = speaker_manager.num_speakers

 # init model
-model = Vits(config, speaker_manager)
+model = Vits(config, ap, tokenizer, speaker_manager)

 # init the trainer and 🚀
 trainer = Trainer(
@@ -86,6 +98,5 @@ trainer = Trainer(
     model=model,
     train_samples=train_samples,
     eval_samples=eval_samples,
-    training_assets={"audio_processor": ap},
 )
 trainer.fit()
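
Note: for reference, the multi-speaker path now threads four objects into the model. Condensed from the context lines above (the no-argument `SpeakerManager()` construction is an assumption; only the two calls below it appear in this diff):

    speaker_manager = SpeakerManager()
    speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples)
    config.model_args.num_speakers = speaker_manager.num_speakers

    model = Vits(config, ap, tokenizer, speaker_manager)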