coqui-tts/recipes/vctk/resnet_speaker_encoder/train_encoder.py

import os

from TTS.encoder.configs.speaker_encoder_config import SpeakerEncoderConfig

# from TTS.encoder.configs.emotion_encoder_config import EmotionEncoderConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig

CURRENT_PATH = os.getcwd()
# change the root path to the TTS root path
os.chdir("../../../")

### Definitions ###
# dataset
VCTK_PATH = "/raid/datasets/VCTK_NEW_16khz_removed_silence_silero_vad/"  # download:  https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zipdddddddddd
RIR_SIMULATED_PATH = "/raid/datasets/DA/RIRS_NOISES/simulated_rirs/"  # download: https://www.openslr.org/17/
MUSAN_PATH = "/raid/datasets/DA/musan/"  # download: https://www.openslr.org/17/

# training
OUTPUT_PATH = os.path.join(
    CURRENT_PATH, "resnet_speaker_encoder_training_output/"
)  # path to save the train logs and checkpoint
CONFIG_OUT_PATH = os.path.join(OUTPUT_PATH, "config_se.json")
RESTORE_PATH = None  # Checkpoint to use for transfer learning if None ignore

# instance the config
# to speaker encoder
config = SpeakerEncoderConfig()
# to emotion encoder
# config = EmotionEncoderConfig()


#### DATASET CONFIG ####
# The formatter need to return the key "speaker_name"  for the speaker encoder and the "emotion_name" for the emotion encoder
dataset_config = BaseDatasetConfig(formatter="vctk", meta_file_train="", language="en-us", path=VCTK_PATH)

# add the dataset to the config
config.datasets = [dataset_config]


#### TRAINING CONFIG ####
# The encoder data loader balancer the dataset item equally to guarantee better training and to attend the losses requirements
# It have two parameters to control the final batch size the number total of speaker used in each batch and the number of samples for each speaker

# number total of speaker in batch in training
config.num_classes_in_batch = 100
# number of utterance per class/speaker in the batch in training
config.num_utter_per_class = 4
# final batch size = config.num_classes_in_batch * config.num_utter_per_class

# number total of speaker in batch in evaluation
config.eval_num_classes_in_batch = 100
# number of utterance per class/speaker in the batch in evaluation
config.eval_num_utter_per_class = 4

# number of data loader workers
config.num_loader_workers = 8
config.num_val_loader_workers = 8

# number of epochs
config.epochs = 10000
# loss to be used in training
config.loss = "softmaxproto"

# run eval
config.run_eval = False

# output path for the checkpoints
config.output_path = OUTPUT_PATH

# Save local checkpoint every save_step steps
config.save_step = 2000

### Model Config ###
config.model_params = {
    "model_name": "resnet",  # supported "lstm" and "resnet"
    "input_dim": 64,
    "use_torch_spec": True,
    "log_input": True,
    "proj_dim": 512,  # embedding dim
}

### Audio Config ###
# To fast train the model divides the audio in small parts. it parameter defines the length in seconds of these "parts"
config.voice_len = 2.0
# all others configs
config.audio = {
    "fft_size": 512,
    "win_length": 400,
    "hop_length": 160,
    "frame_shift_ms": None,
    "frame_length_ms": None,
    "stft_pad_mode": "reflect",
    "sample_rate": 16000,
    "resample": False,
    "preemphasis": 0.97,
    "ref_level_db": 20,
    "do_sound_norm": False,
    "do_trim_silence": False,
    "trim_db": 60,
    "power": 1.5,
    "griffin_lim_iters": 60,
    "num_mels": 64,
    "mel_fmin": 0.0,
    "mel_fmax": 8000.0,
    "spec_gain": 20,
    "signal_norm": False,
    "min_level_db": -100,
    "symmetric_norm": False,
    "max_norm": 4.0,
    "clip_norm": False,
    "stats_path": None,
    "do_rms_norm": True,
    "db_level": -27.0,
}


### Augmentation Config ###
config.audio_augmentation = {
    # additive noise and room impulse response (RIR) simulation similar to: https://arxiv.org/pdf/2009.14153.pdf
    "p": 0.5,  # probability to the use of one of the augmentation - 0 means disabled
    "rir": {"rir_path": RIR_SIMULATED_PATH, "conv_mode": "full"},  # download: https://www.openslr.org/17/
    "additive": {
        "sounds_path": MUSAN_PATH,
        "speech": {"min_snr_in_db": 13, "max_snr_in_db": 20, "min_num_noises": 1, "max_num_noises": 1},
        "noise": {"min_snr_in_db": 0, "max_snr_in_db": 15, "min_num_noises": 1, "max_num_noises": 1},
        "music": {"min_snr_in_db": 5, "max_snr_in_db": 15, "min_num_noises": 1, "max_num_noises": 1},
    },
    "gaussian": {"p": 0.7, "min_amplitude": 0.0, "max_amplitude": 1e-05},
}

config.save_json(CONFIG_OUT_PATH)

print(CONFIG_OUT_PATH)
if RESTORE_PATH is not None:
    command = f"python TTS/bin/train_encoder.py --config_path {CONFIG_OUT_PATH} --restore_path {RESTORE_PATH}"
else:
    command = f"python TTS/bin/train_encoder.py --config_path {CONFIG_OUT_PATH}"

os.system(command)