mirror of https://github.com/coqui-ai/TTS.git
Recipe for Belarusian TTS (#2756)
* Changes from jhlfrfufyfn <jhlfrfufyfn@gmail.com> * Recipe for Belarusian TTS --------- Co-authored-by: jhlfrfufyfn <jhlfrfufyfn@gmail.com>
parent c140df5a58
commit d124f78430
@@ -601,3 +601,15 @@ def kss(root_path, meta_file, **kwargs):  # pylint: disable=unused-argument
            text = cols[2]  # cols[1] => 6월, cols[2] => 유월
            items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
    return items


def bel_tts_formatter(root_path, meta_file, **kwargs):  # pylint: disable=unused-argument
    txt_file = os.path.join(root_path, meta_file)
    items = []
    speaker_name = "bel_tts"
    with open(txt_file, "r", encoding="utf-8") as ttf:
        for line in ttf:
            cols = line.split("|")
            wav_file = os.path.join(root_path, cols[0])
            text = cols[1]
            items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
    return items
@@ -163,3 +163,8 @@ def multilingual_cleaners(text):
    text = remove_aux_symbols(text)
    text = collapse_whitespace(text)
    return text


def no_cleaners(text):
    # remove newline characters
    text = text.replace("\n", "")
    return text
@@ -37,6 +37,8 @@ def get_git_branch():
        current = "inside_docker"
    except FileNotFoundError:
        current = "unknown"
    except StopIteration:
        current = "unknown"
    return current
@@ -0,0 +1 @@
/docker-prepare/*.txt
@@ -0,0 +1,62 @@
This description was created based on [jhlfrfufyfn/ml-bel-tts](https://github.com/jhlfrfufyfn/ml-bel-tts). Many thanks to jhlfrfufyfn for the advice, configuration, code and ideas.

# Training

This recipe uses the [CommonVoice](https://commonvoice.mozilla.org/en/datasets) dataset. The recordings are in mp3/32kHz/48kbps format and contain multiple speakers, because the corpus was created for speech recognition. It appears to be the best voice corpus of the Belarusian language available today, but producing better voice synthesis would require recording a dedicated corpus with good pronunciation and good recording quality.

For the Belarusian Common Voice corpus there is no point in training on the full dataset (90 hours). A 30-hour subset is enough and makes very good progress over 350 epochs (24,000 steps on a 24 GiB GPU). The quality of the dataset is more important than its size.
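
The subset size is chosen in `recipes/bel-alex73/choose_speaker.ipynb`. As a rough illustration of the arithmetic behind it (the average clip duration below is an assumed value; the notebook measures it from the data):

```python
# How many clips fit into a 30-hour budget, given an estimated average clip length.
limit_hours = 30
avg_duration_sec = 4.0  # assumption for illustration; measured in choose_speaker.ipynb
limit_files = round(limit_hours * 60 * 60 / avg_duration_sec)
print(limit_files)  # -> 27000 clips for a 4-second average
```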

To train a model, you need to:
- download the code and data
- prepare the training data and generate the scale_stats file
- change the configuration settings
- train the TTS model (GlowTTS in this example)
- train the vocoder model (HiFiGAN in this example)

We recommend preparing everything locally, then training the models on an external machine with a fast GPU. The text below describes all these steps.

## Download code and data

It is best to place everything in a local folder such as /mycomputer/. You need the following:

- Coqui-TTS: the code from this repository, for example in /mycomputer/TTS/. *Expected result: you have /mycomputer/TTS/setup.py and the other files from the repository.*
- The [Common Voice dataset](https://commonvoice.mozilla.org/en/datasets), unpacked into a cv-corpus/ directory next to Coqui-TTS. *Expected result: you have /mycomputer/cv-corpus/be/validated.tsv and more than 1 million .mp3 files in /mycomputer/cv-corpus/be/clips/.*
- The Belarusian text-to-phonemes converter: fanetyka.jar from [https://github.com/alex73/Software-Korpus/releases](https://github.com/alex73/Software-Korpus/releases), placed in a fanetyka/ directory next to Coqui-TTS. *Expected result: you have the file /mycomputer/fanetyka/fanetyka.jar.*

Prepared data will be stored in a storage/ directory next to Coqui-TTS, e.g. /mycomputer/storage/.
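
For orientation, the resulting local layout looks roughly like this (only the names mentioned above; everything else is illustrative):

```
/mycomputer/
    TTS/         # Coqui-TTS code (setup.py, recipes/, ...)
    cv-corpus/   # Common Voice data (be/validated.tsv, be/clips/*.mp3)
    fanetyka/    # fanetyka.jar
    storage/     # prepared dataset, scale_stats.npy and training output
```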

## Prepare for training - locally

A Docker container was created to simplify local preparation. Run `docker-prepare-start.sh` to start the environment. All commands below should be run in the Docker console.

* Start Jupyter with the command `jupyter notebook --no-browser --allow-root --port=2525 --ip=0.0.0.0`. It will print an http link. Open this link, then open the `recipes/bel-alex73/choose_speaker.ipynb` notebook. Run the cells one by one, listen to the different speakers and select the speaker you want to use. After running all commands in the notebook, press Ctrl+C in the Docker console to stop Jupyter. *Expected result: the directory /mycomputer/storage/filtered_dataset/ with a df_speaker.csv file and many *.wav files.*

* Convert the text to phonemes: `java -cp /a/fanetyka/fanetyka.jar org.alex73.fanetyka.impl.FanetykaTTSPrepare /storage/filtered_dataset/df_speaker.csv /storage/filtered_dataset/ipa_final_dataset.csv`. At the end it prints all characters that were used; you can use them to adjust the characters config in train_glowtts.py. *Expected result: the file /mycomputer/storage/filtered_dataset/ipa_final_dataset.csv.*

* Modify the configs (if needed) in train_glowtts.py and train_hifigan.py. Then export the config to the old JSON format, needed for creating scale_stats.npy, with the command `python3 recipes/bel-alex73/dump_config.py > recipes/bel-alex73/config.json`. *Expected result: the file /mycomputer/TTS/recipes/bel-alex73/config.json exists.*

* Create scale_stats.npy, which helps the model learn better: `mkdir -p /storage/TTS/; python3 TTS/bin/compute_statistics.py --config_path recipes/bel-alex73/config.json --out_path /storage/TTS/scale_stats.npy`. *Expected result: the file /mycomputer/storage/TTS/scale_stats.npy exists.*
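
A quick, optional sanity check of the generated statistics file (a sketch only; key names can differ between TTS versions):

```python
import numpy as np

# scale_stats.npy stores a dict of feature statistics used for normalization.
stats = np.load("/storage/TTS/scale_stats.npy", allow_pickle=True).item()
print(sorted(stats.keys()))  # expected to include mel/linear mean and std arrays
if "mel_mean" in stats:
    print(stats["mel_mean"].shape)  # one value per mel band
```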

## Training - with GPU

You need to upload Coqui-TTS (/mycomputer/TTS/) and the storage/ directory (/mycomputer/storage/) to a computer with a GPU. The cv-corpus/ and fanetyka/ directories are not needed for training. Install gcc, then run `pip install -e .[all,dev,notebooks]` to prepare the modules. The GlowTTS and HiFiGAN models are trained separately, based only on /storage/filtered_dataset, i.e. they do not depend on each other. `<devices>` below means a list of GPU ids starting from zero ("0,1,2,3" for a system with 4 GPUs). See the multi-GPU training details at https://tts.readthedocs.io/en/latest/tutorial_for_nervous_beginners.html.

The current setup was created for a 24 GiB GPU. You need to change batch_size if you have more or less GPU memory. You can also try lowering lr (the learning rate) towards the end of GlowTTS training.
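
For example, the relevant fields might be adjusted like this (a sketch only; the recipe's actual values live in train_glowtts.py, and `lr` is assumed to be exposed by GlowTTSConfig in your TTS version):

```python
from TTS.tts.configs.glow_tts_config import GlowTTSConfig

# Illustrative overrides: the recipe uses batch_size=96 on a 24 GiB GPU.
config = GlowTTSConfig(
    batch_size=48,  # roughly halve the batch for ~12 GiB of GPU memory
    lr=5e-4,        # try a lower learning rate near the end of training
)
```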

* Start GlowTTS model training with the command `OMP_NUM_THREADS=2 CUDA_VISIBLE_DEVICES=<devices> python3 -m trainer.distribute --script recipes/bel-alex73/train_glowtts.py`. It will write training output into the /storage/output-glowtts/ directory. Usually about 100,000 global steps are required. *Expected behavior: you will see /storage/output-glowtts/<start_date>/best_model_<step>.pth files.*

* Start HiFiGAN model training with the command `OMP_NUM_THREADS=2 CUDA_VISIBLE_DEVICES=<devices> python3 -m trainer.distribute --script recipes/bel-alex73/train_hifigan.py`. *Expected behavior: you will see /storage/output-hifigan/<start_date>/best_model_<step>.pth files.*

## How to monitor training

* Run `nvidia-smi` to make sure that training uses all GPUs and that GPU memory usage and utilization are above 90%.

* Run `tensorboard --logdir=/storage/output-<model>/` to see the alignment and avg_loss metrics and to check the audio evaluation. Only the events.out.tfevents.\* files are needed for that.

## Synthesizing speech

```
tts --text "<phonemes>" --out_path output.wav \
    --config_path /storage/output-glowtts/run/config.json \
    --model_path /storage/output-glowtts/run/best_model.pth \
    --vocoder_config_path /storage/output-hifigan/run/config.json \
    --vocoder_path /storage/output-hifigan/run/best_model.pth
```
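
The same can be done from Python; a minimal sketch using the repository's `Synthesizer` class (paths as in the CLI call above; the input text must already be phonemized):

```python
from TTS.utils.synthesizer import Synthesizer

# Load the trained GlowTTS acoustic model and the HiFiGAN vocoder.
synthesizer = Synthesizer(
    tts_checkpoint="/storage/output-glowtts/run/best_model.pth",
    tts_config_path="/storage/output-glowtts/run/config.json",
    vocoder_checkpoint="/storage/output-hifigan/run/best_model.pth",
    vocoder_config="/storage/output-hifigan/run/config.json",
    use_cuda=False,
)

wav = synthesizer.tts("<phonemes>")  # phonemized input, as produced by fanetyka.jar
synthesizer.save_wav(wav, "output.wav")
```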
@ -0,0 +1,346 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# This file was created by jhlfrfufyfn for choose speaker from the Belarusian Mozilla Voice corpus\n",
|
||||
"#\n",
|
||||
"#\n",
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"import seaborn as sns\n",
|
||||
"import os\n",
|
||||
"import librosa"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# unpackage tar gz file cv-corpus-12.0-2022-12-07-be.tar.gz\n",
|
||||
"# import tarfile\n",
|
||||
"# tar = tarfile.open(\"cv-corpus-12.0-2022-12-07-be.tar.gz\", \"r:gz\")\n",
|
||||
"# tar.extractall()\n",
|
||||
"# tar.close()\n",
|
||||
"\n",
|
||||
"corpuspath = '/a/cv-corpus'\n",
|
||||
"outputpath = '/storage/filtered_dataset'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# open validated.tsv\n",
|
||||
"df = pd.read_csv(corpuspath+'/be/validated.tsv', sep='\\t' ,low_memory=False)\n",
|
||||
"df"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# drop from df columns age, accents\n",
|
||||
"df = df.drop(['age', 'accents', 'gender', 'variant', 'locale', 'segment'], axis=1)\n",
|
||||
"df"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# count number of recordes with down_votes > 0\n",
|
||||
"df[df['down_votes'] > 0].count()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# count number of recordes with up_votes == 0\n",
|
||||
"df[df['up_votes'] == 0].count()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# drop all rows with down_votes > 0 and up_votes == 0\n",
|
||||
"df = df[df['down_votes'] == 0]\n",
|
||||
"df = df[df['up_votes'] > 0]\n",
|
||||
"df"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# drop column down_votes and up_votes\n",
|
||||
"df = df.drop(['down_votes', 'up_votes'], axis=1)\n",
|
||||
"df"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# sort by count\n",
|
||||
"df_sorted = df.groupby('client_id').count().sort_values(by='path', ascending=False)\n",
|
||||
"df_sorted"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# get top 10 speakers\n",
|
||||
"top_10_speakers = df_sorted.head(10)\n",
|
||||
"top_10_speakers"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# get for the first speaker ten random paths to audio files\n",
|
||||
"def get_speaker_audio_list(speaker_id, n=10):\n",
|
||||
" return df[df['client_id'] == speaker_id].sample(n)['path'].values.tolist()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# CHOOSE : which speaker will we use\n",
|
||||
"speaker_index = 0\n",
|
||||
"speaker_audio_list = get_speaker_audio_list(top_10_speakers.index[speaker_index])\n",
|
||||
"print(speaker_audio_list)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# open audio files from speaker_audio_list and play them\n",
|
||||
"# audio files lie in cv-corpus-12.0-2022-12-07/be/clips\n",
|
||||
"import IPython.display as ipd\n",
|
||||
"for audio in speaker_audio_list:\n",
|
||||
" audio = corpuspath+'/be/clips/' + audio\n",
|
||||
" audio_data = ipd.Audio(audio)\n",
|
||||
" display(audio_data)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# 0 is pretty good\n",
|
||||
"# 1 is bad\n",
|
||||
"# 2 is partly 0, other are different\n",
|
||||
"# 3 is bad\n",
|
||||
"# 4 is pretty fast and clear, but not good\n",
|
||||
"# 5 is echoing, sometimes mic cracks\n",
|
||||
"# 6 is really slow and clear, but accent?\n",
|
||||
"# 7 has a lot of intonation, but is pretty clear\n",
|
||||
"# 8 is clear and slow, sometimes little mic crack\n",
|
||||
"# 9 has background noise, whispering\n",
|
||||
"\n",
|
||||
"# options: 0, 6, 8"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# calculate speech rate in words per minute for each speaker\n",
|
||||
"def get_speech_rate(speaker_id):\n",
|
||||
" df_speaker = df[df['client_id'] == speaker_id]\n",
|
||||
" # get 1000 random samples to calculate speech rate\n",
|
||||
" df_speaker = df_speaker.sample(1000)\n",
|
||||
" # get duration of each audio file\n",
|
||||
" df_speaker['duration'] = df_speaker['path'].apply(lambda x: librosa.get_duration(path=corpuspath+'/be/clips/' + x))\n",
|
||||
" # get number of words in each audio file\n",
|
||||
" df_speaker['words'] = df_speaker['sentence'].apply(lambda x: len(x.split()))\n",
|
||||
" # calculate speech rate\n",
|
||||
" df_speaker['speech_rate'] = df_speaker['words'] / df_speaker['duration'] * 60\n",
|
||||
" # return mean speech rate\n",
|
||||
" return df_speaker['speech_rate'].mean()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# calculate speech rate for each speaker\n",
|
||||
"print(f'Speech rate for speaker {speaker_index}: ', get_speech_rate(top_10_speakers.index[speaker_index]))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def get_average_duration(df_speaker):\n",
|
||||
" # get 1000 random samples to calculate speech rate\n",
|
||||
" df_speaker = df_speaker.sample(1000)\n",
|
||||
" # get duration of each audio file\n",
|
||||
" df_speaker['duration'] = df_speaker['path'].apply(lambda x: librosa.get_duration(path=corpuspath+'/be/clips/' + x))\n",
|
||||
" return df_speaker['duration'].mean()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df_speaker = df[df['client_id'] == top_10_speakers.index[speaker_index]]\n",
|
||||
"\n",
|
||||
"avg_duration = get_average_duration(df_speaker)\n",
|
||||
"avg_total_duration = avg_duration * len(df_speaker.index)\n",
|
||||
"print(f'Average duration for speaker {speaker_index}: ', avg_duration, \", average total duration(hours): \",(avg_total_duration/60.0/60.0))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# get df with speaker_index speaker \n",
|
||||
"df_speaker = df[df['client_id'] == top_10_speakers.index[speaker_index]]\n",
|
||||
"df_speaker = df_speaker.drop(['client_id'], axis=1)\n",
|
||||
"\n",
|
||||
"# get only x latest hours\n",
|
||||
"limit_hours = 30\n",
|
||||
"limit_files = round(limit_hours*60*60 / avg_duration)\n",
|
||||
"df_speaker = df_speaker.tail(limit_files)\n",
|
||||
"\n",
|
||||
"df_speaker"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# # move all files of that speaker to another folder\n",
|
||||
"# # use multiprocessing to speed up\n",
|
||||
"# # add progress bar\n",
|
||||
"# from tqdm import tqdm\n",
|
||||
"# import multiprocessing\n",
|
||||
"# from multiprocessing import Pool\n",
|
||||
"# import shutil\n",
|
||||
"\n",
|
||||
"# def move_file(file):\n",
|
||||
"# shutil.move(corpuspath+'/be/clips/' + file, corpuspath+'/be/speaker_0/' + file)\n",
|
||||
"\n",
|
||||
"# # get list of files to move\n",
|
||||
"# files = df_speaker['path'].values.tolist()\n",
|
||||
"\n",
|
||||
"# # move files\n",
|
||||
"# with Pool(multiprocessing.cpu_count()) as p:\n",
|
||||
"# r = list(tqdm(p.imap(move_file, files), total=len(files)))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# cleanup output and save text lines to csv\n",
|
||||
"if os.path.isdir(outputpath):\n",
|
||||
" for file in os.scandir(outputpath):\n",
|
||||
" os.remove(file.path)\n",
|
||||
"else:\n",
|
||||
" os.mkdir(outputpath)\n",
|
||||
"\n",
|
||||
"df_speaker['path2'] = df_speaker['path'].str.replace('\\.mp3$','.wav', regex=True)\n",
|
||||
"df_speaker[['path2','sentence']].to_csv(outputpath+'/df_speaker.csv', sep='|', header=False, index=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# make rate=22050 of all mp3 files in speaker_0 folder with multiprocessing and tqdm\n",
|
||||
"import multiprocessing\n",
|
||||
"from multiprocessing import Pool\n",
|
||||
"from tqdm import tqdm\n",
|
||||
"from pydub import AudioSegment\n",
|
||||
"\n",
|
||||
"def convert_mp3_to_wav(file):\n",
|
||||
" sound = AudioSegment.from_mp3(corpuspath+'/be/clips/' + file)\n",
|
||||
" sound = sound.set_frame_rate(22050)\n",
|
||||
" sound.export(outputpath+'/' + file[:-4] + '.wav', format='wav')\n",
|
||||
"\n",
|
||||
"# get list of files to convert\n",
|
||||
"files = df_speaker['path'].values.tolist()\n",
|
||||
"\n",
|
||||
"# convert files\n",
|
||||
"with Pool(multiprocessing.cpu_count()) as p:\n",
|
||||
" r = list(tqdm(p.imap(convert_mp3_to_wav, files), total=len(files)))"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
|
@@ -0,0 +1,19 @@
#!/bin/bash
set -x

cd $( dirname -- "$0"; )

cp ../../requirements*.txt docker-prepare/

docker build -t tts-learn -f docker-prepare/Dockerfile docker-prepare/

mkdir -p ../../../storage
docker run --rm -it \
    -p 2525:2525 \
    --shm-size=256M \
    --name tts-learn-run \
    -v $(pwd)/../../:/a/TTS \
    -v $(pwd)/../../../cv-corpus:/a/cv-corpus \
    -v $(pwd)/../../../fanetyka/:/a/fanetyka/ \
    -v $(pwd)/../../../storage:/storage \
    tts-learn
@@ -0,0 +1,18 @@
FROM ubuntu:22.04

RUN apt -y update
RUN apt -y upgrade
RUN apt -y install --no-install-recommends pip ffmpeg openjdk-19-jre-headless

RUN mkdir /a/
ADD requirements*.txt /a/
WORKDIR /a/
RUN pip install -r requirements.txt -r requirements.dev.txt -r requirements.notebooks.txt
RUN pip install seaborn pydub notebook

RUN apt -y install --no-install-recommends gcc libpython3.10-dev

ADD runtime.sh /a/

WORKDIR /a/TTS/
CMD /a/runtime.sh
@@ -0,0 +1,6 @@
#!/bin/bash

cd /a/TTS
pip install -e .[all,dev,notebooks]

LANG=C.utf8 bash
@@ -0,0 +1,7 @@
from train_glowtts import config
import json
import re

s = json.dumps(config, default=vars, indent=2)
s = re.sub(r'"test_sentences":\s*\[\],', '', s)
print(s)
@@ -0,0 +1,111 @@
import os

# Trainer: Where the ✨️ happens.
# TrainingArgs: Defines the set of arguments of the Trainer.
from trainer import Trainer, TrainerArgs
from TTS.tts.configs.shared_configs import BaseAudioConfig

# GlowTTSConfig: all model related values for training, validating and testing.
from TTS.tts.configs.glow_tts_config import GlowTTSConfig

# BaseDatasetConfig: defines name, formatter and path of the dataset.
from TTS.tts.configs.shared_configs import BaseDatasetConfig, CharactersConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.glow_tts import GlowTTS
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor

# we use the same path as this script as our training folder.
output_path = '/storage/output-glowtts/'


# DEFINE DATASET CONFIG
# Set LJSpeech as our target dataset and define its path.
# You can also use a simple Dict to define the dataset and pass it to your custom formatter.
dataset_config = BaseDatasetConfig(
    formatter="bel_tts_formatter", meta_file_train="ipa_final_dataset.csv", path=os.path.join(output_path, "/storage/filtered_dataset/")
)

characters=CharactersConfig(
    characters_class="TTS.tts.utils.text.characters.Graphemes",
    pad="_",
    eos="~",
    bos="^",
    blank="@",
    characters="Iabdfgijklmnprstuvxzɔɛɣɨɫɱʂʐʲˈː̯͡β",
    punctuations="!,.?: -‒–—…",
)

audio_config = BaseAudioConfig(
    mel_fmin=50,
    mel_fmax=8000,
    hop_length=256,
    stats_path="/storage/TTS/scale_stats.npy",
)

# INITIALIZE THE TRAINING CONFIGURATION
# Configure the model. Every config class inherits the BaseTTSConfig.
config = GlowTTSConfig(
    batch_size=96,
    eval_batch_size=32,
    num_loader_workers=8,
    num_eval_loader_workers=8,
    use_noise_augment=True,
    run_eval=True,
    test_delay_epochs=-1,
    epochs=1000,
    print_step=50,
    print_eval=True,
    output_path=output_path,
    add_blank=True,
    datasets=[dataset_config],
    characters=characters,
    enable_eos_bos_chars=True,
    mixed_precision=False,
    save_step=10000,
    save_n_checkpoints=2,
    save_best_after=5000,
    text_cleaner="no_cleaners",
    audio=audio_config,
    test_sentences=[],
)

if __name__ == "__main__":

    # INITIALIZE THE AUDIO PROCESSOR
    # Audio processor is used for feature extraction and audio I/O.
    # It mainly serves to the dataloader and the training loggers.
    ap = AudioProcessor.init_from_config(config)

    # INITIALIZE THE TOKENIZER
    # Tokenizer is used to convert text to sequences of token IDs.
    # If characters are not defined in the config, default characters are passed to the config
    tokenizer, config = TTSTokenizer.init_from_config(config)

    # LOAD DATA SAMPLES
    # Each sample is a list of ```[text, audio_file_path, speaker_name]```
    # You can define your custom sample loader returning the list of samples.
    # Or define your custom formatter and pass it to the `load_tts_samples`.
    # Check `TTS.tts.datasets.load_tts_samples` for more details.
    train_samples, eval_samples = load_tts_samples(
        dataset_config,
        eval_split=True,
        eval_split_max_size=config.eval_split_max_size,
        eval_split_size=config.eval_split_size,
    )

    # INITIALIZE THE MODEL
    # Models take a config object and a speaker manager as input
    # Config defines the details of the model like the number of layers, the size of the embedding, etc.
    # Speaker manager is used by multi-speaker models.
    model = GlowTTS(config, ap, tokenizer, speaker_manager=None)

    # INITIALIZE THE TRAINER
    # Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training,
    # distributed training, etc.
    trainer = Trainer(
        TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
    )

    # AND... 3,2,1... 🚀
    trainer.fit()
@@ -0,0 +1,60 @@
import os

from trainer import Trainer, TrainerArgs
from TTS.tts.configs.shared_configs import BaseAudioConfig
from coqpit import Coqpit

from TTS.utils.audio import AudioProcessor
from TTS.vocoder.configs.hifigan_config import *;
from TTS.vocoder.datasets.preprocess import load_wav_data
from TTS.vocoder.models.gan import GAN

output_path = '/storage/output-hifigan/'

audio_config = BaseAudioConfig(
    mel_fmin=50,
    mel_fmax=8000,
    hop_length=256,
    stats_path="/storage/TTS/scale_stats.npy",
)

config = HifiganConfig(
    batch_size=74,
    eval_batch_size=16,
    num_loader_workers=8,
    num_eval_loader_workers=8,
    lr_disc=0.0002,
    lr_gen=0.0002,
    run_eval=True,
    test_delay_epochs=5,
    epochs=1000,
    use_noise_augment=True,
    seq_len=8192,
    pad_short=2000,
    save_step=5000,
    print_step=50,
    print_eval=True,
    mixed_precision=False,
    eval_split_size=30,
    save_n_checkpoints=2,
    save_best_after=5000,
    data_path="/storage/filtered_dataset",
    output_path=output_path,
    audio=audio_config,
)

# init audio processor
ap = AudioProcessor.init_from_config(config)

# load training samples
print("config.eval_split_size = ", config.eval_split_size)
eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)

# init model
model = GAN(config, ap)

# init the trainer and 🚀
trainer = Trainer(
    TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)
trainer.fit()