From 8b4a37d8cb2630710c219df833551fc4fa69ba03 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Thu, 30 Sep 2021 14:43:59 +0000
Subject: [PATCH] Add DeepSpeech LibriSpeech recipe

---
 .../stt/deep_speech/train_deep_speech.py | 91 +++++++++++++++++++
 1 file changed, 91 insertions(+)
 create mode 100644 recipes/librispeech/stt/deep_speech/train_deep_speech.py

diff --git a/recipes/librispeech/stt/deep_speech/train_deep_speech.py b/recipes/librispeech/stt/deep_speech/train_deep_speech.py
new file mode 100644
index 00000000..b691fc57
--- /dev/null
+++ b/recipes/librispeech/stt/deep_speech/train_deep_speech.py
@@ -0,0 +1,91 @@
+import os
+
+from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig
+from TTS.stt.configs import DeepSpeechConfig
+from TTS.stt.datasets import load_stt_samples
+from TTS.stt.datasets.downloaders import download_librispeech
+from TTS.stt.datasets.tokenizer import Tokenizer
+from TTS.stt.models.deep_speech import DeepSpeech
+from TTS.trainer import Trainer, TrainingArgs
+from TTS.utils.audio import AudioProcessor
+
+output_path = os.path.dirname(os.path.abspath(__file__))
+
+# download the LibriSpeech subsets if they are not on disk yet
+if not os.path.exists("/home/ubuntu/librispeech/LibriSpeech/train-clean-100"):
+    download_librispeech("/home/ubuntu/librispeech/", "train-clean-100")
+if not os.path.exists("/home/ubuntu/librispeech/LibriSpeech/dev-clean"):
+    download_librispeech("/home/ubuntu/librispeech/", "dev-clean")
+
+# train on train-clean-100 and evaluate on dev-clean
+train_dataset_config = BaseDatasetConfig(
+    name="librispeech", meta_file_train=None, path="/home/ubuntu/librispeech/LibriSpeech/train-clean-100"
+)
+
+eval_dataset_config = BaseDatasetConfig(
+    name="librispeech", meta_file_train=None, path="/home/ubuntu/librispeech/LibriSpeech/dev-clean"
+)
+
+# 16 kHz input features with 26 MFCC coefficients
+audio_config = BaseAudioConfig(
+    sample_rate=16000,
+    resample=False,
+    frame_shift_ms=20,
+    frame_length_ms=40,
+    num_mels=26,
+    num_mfcc=26,
+    do_trim_silence=False,
+    mel_fmin=0,
+    mel_fmax=8000,
+)
+
+config = DeepSpeechConfig(
+    audio=audio_config,
+    run_name="deepspeech_librispeech",
+    batch_size=128,
+    eval_batch_size=16,
+    batch_group_size=5,
+    num_loader_workers=4,
+    num_eval_loader_workers=4,
+    run_eval=True,
+    test_delay_epochs=-1,
+    epochs=1000,
+    print_step=25,
+    print_eval=False,
+    mixed_precision=True,
+    max_seq_len=500000,
+    output_path=output_path,
+    train_datasets=[train_dataset_config],
+    eval_datasets=[eval_dataset_config],
+)
+
+# init audio processor
+ap = AudioProcessor(**config.audio.to_dict())
+
+# load training and evaluation samples
+train_samples, _ = load_stt_samples(train_dataset_config, eval_split=False)
+eval_samples, _ = load_stt_samples(eval_dataset_config, eval_split=False)
+transcripts = [s["text"] for s in train_samples]
+
+# init the tokenizer from the training transcripts
+tokenizer = Tokenizer(transcripts=transcripts)
+n_tokens = tokenizer.vocab_size
+config.model_args.n_tokens = n_tokens
+config.vocabulary = tokenizer.vocab_dict
+
+# init model
+model = DeepSpeech(config)
+
+# init the trainer and kick it 🚀
+trainer = Trainer(
+    TrainingArgs(),
+    config,
+    output_path,
+    model=model,
+    tokenizer=tokenizer,
+    train_samples=train_samples,
+    eval_samples=eval_samples,
+    cudnn_benchmark=False,
+    training_assets={"tokenizer": tokenizer, "audio_processor": ap},
+)
+trainer.fit()
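
The frame settings in audio_config above are given in milliseconds. As a quick sanity check, the following minimal sketch shows the arithmetic that turns them into sample counts at the configured 16 kHz sample rate; it uses only the literal values from the recipe, and the variable names (hop_length, win_length) are illustrative rather than taken from the TTS code.

    # Illustrative arithmetic only; values copied from audio_config above.
    sample_rate = 16000      # Hz
    frame_shift_ms = 20      # hop between analysis frames
    frame_length_ms = 40     # length of each analysis window
    hop_length = sample_rate * frame_shift_ms // 1000   # 320 samples (20 ms)
    win_length = sample_rate * frame_length_ms // 1000  # 640 samples (40 ms)
    frames_per_second = sample_rate // hop_length        # 50 feature frames per second
    print(hop_length, win_length, frames_per_second)     # 320 640 50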