From 8310d19da8fe2de457cdcad787898d5e03711a38 Mon Sep 17 00:00:00 2001 From: Edresson Date: Thu, 12 Aug 2021 07:52:44 -0300 Subject: [PATCH 001/220] Save speakers embeddings/ids before starting training --- TTS/tts/models/vits.py | 1 - requirements.txt | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index bc459b7f..d1755b47 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -402,7 +402,6 @@ class Vits(BaseTTS): # speaker embedding if self.num_speakers > 1 and sid is not None: g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] - # posterior encoder z, m_q, logs_q, y_mask = self.posterior_encoder(y, y_lengths, g=g) diff --git a/requirements.txt b/requirements.txt index 3ec33ceb..d21a972f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,7 +13,7 @@ pyyaml scipy>=0.19.0 soundfile tensorboardX -torch>=1.7 +torch>=1.9.0 tqdm numba==0.53 umap-learn==0.5.1 From 234a4aacb35d159461f446c709ffea6d95f5cdd6 Mon Sep 17 00:00:00 2001 From: Edresson Date: Thu, 12 Aug 2021 21:52:12 -0300 Subject: [PATCH 002/220] Select randomly a speaker from the speaker manager for the test setences --- TTS/tts/models/vits.py | 18 ++++++++---------- TTS/tts/utils/speakers.py | 8 ++++++++ 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index d1755b47..ae607c47 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -402,6 +402,7 @@ class Vits(BaseTTS): # speaker embedding if self.num_speakers > 1 and sid is not None: g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] + # posterior encoder z, m_q, logs_q, y_mask = self.posterior_encoder(y, y_lengths, g=g) @@ -638,7 +639,7 @@ class Vits(BaseTTS): return self._log(ap, batch, outputs, "eval") @torch.no_grad() - def test_run(self, ap) -> Tuple[Dict, Dict]: + def test_run(self, ap, eval_loader=None) -> Tuple[Dict, Dict]: """Generic test run for `tts` models used by `Trainer`. You can override this for a different behaviour. 
@@ -650,16 +651,13 @@ class Vits(BaseTTS): test_audios = {} test_figures = {} test_sentences = self.config.test_sentences - aux_inputs = { - "speaker_id": None - if not self.config.use_speaker_embedding - else random.sample(sorted(self.speaker_manager.speaker_ids.values()), 1), - "d_vector": None - if not self.config.use_d_vector_file - else random.samples(sorted(self.speaker_manager.d_vectors.values()), 1), - "style_wav": None, - } + if hasattr(self, "speaker_manager"): + aux_inputs = self.speaker_manager.get_random_speaker_aux_input() + else: + aux_inputs = self.get_aux_input() + for idx, sen in enumerate(test_sentences): + wav, alignment, _, _ = synthesis( self, sen, diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 13696a20..ae001155 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -209,6 +209,14 @@ class SpeakerManager: d_vectors = np.stack(d_vectors[:num_samples]).mean(0) return d_vectors + def get_random_speaker_aux_input(self) -> Dict: + if self.d_vectors: + return {"speaker_id": None, "style_wav": None, "d_vector": self.d_vectors[random.choices(list(self.d_vectors.keys()))[0]]["embedding"]} + elif self.speaker_ids: + return {"speaker_id": self.speaker_ids[random.choices(list(self.speaker_ids.keys()))[0]], "style_wav": None, "d_vector": None} + else: + return {"speaker_id": None, "style_wav": None, "d_vector": None} + def get_speakers(self) -> List: return self.speaker_ids From c9c196004022df7f335fb7fb37d58ead16fa7a96 Mon Sep 17 00:00:00 2001 From: Edresson Date: Thu, 12 Aug 2021 22:48:30 -0300 Subject: [PATCH 003/220] Allow ignore speakers for all multispeaker datasets --- TTS/tts/datasets/__init__.py | 5 ++- TTS/tts/datasets/formatters.py | 79 +++++++++++++++++++++------------- 2 files changed, 53 insertions(+), 31 deletions(-) diff --git a/TTS/tts/datasets/__init__.py b/TTS/tts/datasets/__init__.py index 4fae974f..741f92fd 100644 --- a/TTS/tts/datasets/__init__.py +++ b/TTS/tts/datasets/__init__.py @@ -67,16 +67,17 @@ def load_tts_samples( root_path = dataset["path"] meta_file_train = dataset["meta_file_train"] meta_file_val = dataset["meta_file_val"] + ununsed_speakers = dataset["ununsed_speakers"] # setup the right data processor if formatter is None: formatter = _get_formatter_by_name(name) # load train set - meta_data_train = formatter(root_path, meta_file_train) + meta_data_train = formatter(root_path, meta_file_train, ununsed_speakers=ununsed_speakers) print(f" | > Found {len(meta_data_train)} files in {Path(root_path).resolve()}") # load evaluation split if set if eval_split: if meta_file_val: - meta_data_eval = formatter(root_path, meta_file_val) + meta_data_eval = formatter(root_path, meta_file_val, ununsed_speakers=ununsed_speakers) else: meta_data_eval, meta_data_train = split_dataset(meta_data_train) meta_data_eval_all += meta_data_eval diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index 425eb0cd..bcbbb369 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -12,7 +12,7 @@ from tqdm import tqdm ######################## -def tweb(root_path, meta_file): +def tweb(root_path, meta_file, **kwargs): """Normalize TWEB dataset. 
https://www.kaggle.com/bryanpark/the-world-english-bible-speech-dataset """ @@ -28,7 +28,7 @@ def tweb(root_path, meta_file): return items -def mozilla(root_path, meta_file): +def mozilla(root_path, meta_file, **kwargs): """Normalizes Mozilla meta data files to TTS format""" txt_file = os.path.join(root_path, meta_file) items = [] @@ -43,7 +43,7 @@ def mozilla(root_path, meta_file): return items -def mozilla_de(root_path, meta_file): +def mozilla_de(root_path, meta_file, **kwargs): """Normalizes Mozilla meta data files to TTS format""" txt_file = os.path.join(root_path, meta_file) items = [] @@ -82,6 +82,10 @@ def mailabs(root_path, meta_files=None): if speaker_name_match is None: continue speaker_name = speaker_name_match.group("speaker_name") + # ignore speakers + if isinstance(ununsed_speakers, list): + if speaker_name in ununsed_speakers: + continue print(" | > {}".format(csv_file)) with open(txt_file, "r", encoding="utf-8") as ttf: for line in ttf: @@ -98,7 +102,7 @@ def mailabs(root_path, meta_files=None): return items -def ljspeech(root_path, meta_file): +def ljspeech(root_path, meta_file, **kwargs): """Normalizes the LJSpeech meta data file to TTS format https://keithito.com/LJ-Speech-Dataset/""" txt_file = os.path.join(root_path, meta_file) @@ -113,7 +117,7 @@ def ljspeech(root_path, meta_file): return items -def ljspeech_test(root_path, meta_file): +def ljspeech_test(root_path, meta_file, **kwargs): """Normalizes the LJSpeech meta data file for TTS testing https://keithito.com/LJ-Speech-Dataset/""" txt_file = os.path.join(root_path, meta_file) @@ -127,7 +131,7 @@ def ljspeech_test(root_path, meta_file): return items -def sam_accenture(root_path, meta_file): +def sam_accenture(root_path, meta_file, **kwargs): """Normalizes the sam-accenture meta data file to TTS format https://github.com/Sam-Accenture-Non-Binary-Voice/non-binary-voice-files""" xml_file = os.path.join(root_path, "voice_over_recordings", meta_file) @@ -144,12 +148,12 @@ def sam_accenture(root_path, meta_file): return items -def ruslan(root_path, meta_file): +def ruslan(root_path, meta_file, **kwargs): """Normalizes the RUSLAN meta data file to TTS format https://ruslan-corpus.github.io/""" txt_file = os.path.join(root_path, meta_file) items = [] - speaker_name = "ljspeech" + speaker_name = "ruslan" with open(txt_file, "r", encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") @@ -159,11 +163,11 @@ def ruslan(root_path, meta_file): return items -def css10(root_path, meta_file): +def css10(root_path, meta_file, **kwargs): """Normalizes the CSS10 dataset file to TTS format""" txt_file = os.path.join(root_path, meta_file) items = [] - speaker_name = "ljspeech" + speaker_name = "css10" with open(txt_file, "r", encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") @@ -173,7 +177,7 @@ def css10(root_path, meta_file): return items -def nancy(root_path, meta_file): +def nancy(root_path, meta_file, **kwargs): """Normalizes the Nancy meta data file to TTS format""" txt_file = os.path.join(root_path, meta_file) items = [] @@ -187,7 +191,7 @@ def nancy(root_path, meta_file): return items -def common_voice(root_path, meta_file): +def common_voice(root_path, meta_file, ununsed_speakers=None): """Normalize the common voice meta data file to TTS format.""" txt_file = os.path.join(root_path, meta_file) items = [] @@ -198,12 +202,16 @@ def common_voice(root_path, meta_file): cols = line.split("\t") text = cols[2] speaker_name = cols[0] + # ignore speakers + if isinstance(ununsed_speakers, list): + if 
speaker_name in ununsed_speakers: + continue wav_file = os.path.join(root_path, "clips", cols[1].replace(".mp3", ".wav")) items.append([text, wav_file, "MCV_" + speaker_name]) return items -def libri_tts(root_path, meta_files=None): +def libri_tts(root_path, meta_files=None, ununsed_speakers=None): """https://ai.google/tools/datasets/libri-tts/""" items = [] if meta_files is None: @@ -222,13 +230,17 @@ def libri_tts(root_path, meta_files=None): _root_path = os.path.join(root_path, f"{speaker_name}/{chapter_id}") wav_file = os.path.join(_root_path, file_name + ".wav") text = cols[2] + # ignore speakers + if isinstance(ununsed_speakers, list): + if speaker_name in ununsed_speakers: + continue items.append([text, wav_file, "LTTS_" + speaker_name]) for item in items: assert os.path.exists(item[1]), f" [!] wav files don't exist - {item[1]}" return items -def custom_turkish(root_path, meta_file): +def custom_turkish(root_path, meta_file, **kwargs): txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "turkish-female" @@ -247,7 +259,7 @@ def custom_turkish(root_path, meta_file): # ToDo: add the dataset link when the dataset is released publicly -def brspeech(root_path, meta_file): +def brspeech(root_path, meta_file, ununsed_speakers=None): """BRSpeech 3.0 beta""" txt_file = os.path.join(root_path, meta_file) items = [] @@ -258,21 +270,25 @@ def brspeech(root_path, meta_file): cols = line.split("|") wav_file = os.path.join(root_path, cols[0]) text = cols[2] - speaker_name = cols[3] - items.append([text, wav_file, speaker_name]) + speaker_id = cols[3] + # ignore speakers + if isinstance(ununsed_speakers, list): + if speaker_id in ununsed_speakers: + continue + items.append([text, wav_file, speaker_id]) return items -def vctk(root_path, meta_files=None, wavs_path="wav48"): +def vctk(root_path, meta_files=None, wavs_path="wav48", ununsed_speakers=None): """homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz""" - test_speakers = meta_files items = [] meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) for meta_file in meta_files: _, speaker_id, txt_file = os.path.relpath(meta_file, root_path).split(os.sep) file_id = txt_file.split(".")[0] - if isinstance(test_speakers, list): # if is list ignore this speakers ids - if speaker_id in test_speakers: + # ignore speakers + if isinstance(ununsed_speakers, list): + if speaker_id in ununsed_speakers: continue with open(meta_file, "r", encoding="utf-8") as file_text: text = file_text.readlines()[0] @@ -282,15 +298,16 @@ def vctk(root_path, meta_files=None, wavs_path="wav48"): return items -def vctk_slim(root_path, meta_files=None, wavs_path="wav48"): +def vctk_slim(root_path, meta_files=None, wavs_path="wav48", ununsed_speakers=None): """homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz""" items = [] txt_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) for text_file in txt_files: _, speaker_id, txt_file = os.path.relpath(text_file, root_path).split(os.sep) file_id = txt_file.split(".")[0] - if isinstance(meta_files, list): # if is list ignore this speakers ids - if speaker_id in meta_files: + # ignore speakers + if isinstance(ununsed_speakers, list): + if speaker_id in ununsed_speakers: continue wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id + ".wav") items.append([None, wav_file, "VCTK_" + speaker_id]) @@ -298,7 +315,7 @@ def vctk_slim(root_path, meta_files=None, wavs_path="wav48"): return items -def mls(root_path, meta_files=None): +def mls(root_path, 
meta_files=None, ununsed_speakers=None): """http://www.openslr.org/94/""" items = [] with open(os.path.join(root_path, meta_files), "r", encoding="utf-8") as meta: @@ -307,19 +324,23 @@ def mls(root_path, meta_files=None): text = text[:-1] speaker, book, *_ = file.split("_") wav_file = os.path.join(root_path, os.path.dirname(meta_files), "audio", speaker, book, file + ".wav") + # ignore speakers + if isinstance(ununsed_speakers, list): + if speaker in ununsed_speakers: + continue items.append([text, wav_file, "MLS_" + speaker]) return items # ======================================== VOX CELEB =========================================== -def voxceleb2(root_path, meta_file=None): +def voxceleb2(root_path, meta_file=None, **kwargs): """ :param meta_file Used only for consistency with load_tts_samples api """ return _voxcel_x(root_path, meta_file, voxcel_idx="2") -def voxceleb1(root_path, meta_file=None): +def voxceleb1(root_path, meta_file=None, **kwargs): """ :param meta_file Used only for consistency with load_tts_samples api """ @@ -361,7 +382,7 @@ def _voxcel_x(root_path, meta_file, voxcel_idx): return [x.strip().split("|") for x in f.readlines()] -def baker(root_path: str, meta_file: str) -> List[List[str]]: +def baker(root_path: str, meta_file: str, **kwargs) -> List[List[str]]: """Normalizes the Baker meta data file to TTS format Args: @@ -381,7 +402,7 @@ def baker(root_path: str, meta_file: str) -> List[List[str]]: return items -def kokoro(root_path, meta_file): +def kokoro(root_path, meta_file, **kwargs): """Japanese single-speaker dataset from https://github.com/kaiidams/Kokoro-Speech-Dataset""" txt_file = os.path.join(root_path, meta_file) items = [] From 1efcccd5c958029381048c8fd5c17fc21bc2c354 Mon Sep 17 00:00:00 2001 From: Edresson Date: Fri, 13 Aug 2021 00:49:38 -0300 Subject: [PATCH 004/220] Implement training support with d_vecs in the VITS model --- TTS/tts/models/vits.py | 9 +++++++-- TTS/tts/utils/speakers.py | 16 +++++++++++----- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index ae607c47..73dfbb2e 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -8,6 +8,7 @@ import torch from coqpit import Coqpit from torch import nn from torch.cuda.amp.autocast_mode import autocast +from torch.nn import functional as F from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor from TTS.tts.layers.vits.discriminator import VitsDiscriminator @@ -138,6 +139,9 @@ class VitsArgs(Coqpit): use_d_vector_file (bool): Enable/Disable the use of d-vectors for multi-speaker training. Defaults to False. + d_vector_file (str): + Path to the file including pre-computed speaker embeddings. Defaults to None. + d_vector_dim (int): Number of d-vector channels. Defaults to 0. 
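# Illustrative sketch (not part of the patch) of how the `d_vector_file` argument documented
# above is meant to be used for d-vector based training. The file path and the exact way a
# VitsConfig carries its VitsArgs are assumptions for illustration only.
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.models.vits import VitsArgs

model_args = VitsArgs(
    use_d_vector_file=True,         # condition the model on pre-computed speaker embeddings
    d_vector_file="speakers.json",  # hypothetical path to the pre-computed d-vector file
    d_vector_dim=512,               # assumed to match the embedding size stored in the file
)
config = VitsConfig(model_args=model_args)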
@@ -179,6 +183,7 @@ class VitsArgs(Coqpit): use_speaker_embedding: bool = False num_speakers: int = 0 speakers_file: str = None + d_vector_file: str = None speaker_embedding_channels: int = 256 use_d_vector_file: bool = False d_vector_file: str = None @@ -360,7 +365,7 @@ class Vits(BaseTTS): if sid.ndim == 0: sid = sid.unsqueeze_(0) if "d_vectors" in aux_input and aux_input["d_vectors"] is not None: - g = aux_input["d_vectors"] + g = F.normalize(aux_input["d_vectors"]).unsqueeze(-1) return sid, g def get_aux_input(self, aux_input: Dict): @@ -400,7 +405,7 @@ class Vits(BaseTTS): x, m_p, logs_p, x_mask = self.text_encoder(x, x_lengths) # speaker embedding - if self.num_speakers > 1 and sid is not None: + if self.num_speakers > 1 and sid is not None and not self.use_d_vector: g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] # posterior encoder diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index ae001155..3d8590cc 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -154,15 +154,21 @@ class SpeakerManager: """ self._save_json(file_path, self.d_vectors) - def set_d_vectors_from_file(self, file_path: str) -> None: + def set_d_vectors_from_file(self, file_path: str, data: List = None) -> None: """Load d_vectors from a json file. Args: file_path (str): Path to the target json file. """ self.d_vectors = self._load_json(file_path) - speakers = sorted({x["name"] for x in self.d_vectors.values()}) - self.speaker_ids = {name: i for i, name in enumerate(speakers)} + + # load speakers from data, because during the training we can just use some speakers from d_vector_file + if data is not None: + self.speaker_ids, _ = self.parse_speakers_from_data(data) + else: + speakers = sorted({x["name"] for x in self.d_vectors.values()}) + self.speaker_ids = {name: i for i, name in enumerate(speakers)} + self.clip_ids = list(set(sorted(clip_name for clip_name in self.d_vectors.keys()))) def get_d_vector_by_clip(self, clip_idx: str) -> List: @@ -357,7 +363,7 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, "You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.d_vector_file" ) speaker_manager.load_d_vectors_file(c.d_vector_file) - speaker_manager.set_d_vectors_from_file(speakers_file) + speaker_manager.set_d_vectors_from_file(speakers_file, data=data) elif not c.use_d_vector_file: # restor speaker manager with speaker ID file. speaker_ids_from_data = speaker_manager.speaker_ids speaker_manager.set_speaker_ids_from_file(speakers_file) @@ -366,7 +372,7 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, ), " [!] You cannot introduce new speakers to a pre-trained model." elif c.use_d_vector_file and c.d_vector_file: # new speaker manager with external speaker embeddings. - speaker_manager.set_d_vectors_from_file(c.d_vector_file) + speaker_manager.set_d_vectors_from_file(c.d_vector_file, data=data) elif c.use_d_vector_file and not c.d_vector_file: raise "use_d_vector_file is True, so you need pass a external speaker embedding file." 
elif c.use_speaker_embedding and "speakers_file" in c and c.speakers_file: From c9f5838bb4c1c1ebed962e4090308d1a9086e1e4 Mon Sep 17 00:00:00 2001 From: Edresson Date: Fri, 13 Aug 2021 03:38:54 -0300 Subject: [PATCH 005/220] Fix pylint issues --- TTS/tts/datasets/formatters.py | 30 +++++++++++++++--------------- TTS/tts/models/vits.py | 2 +- TTS/tts/utils/speakers.py | 7 ++++--- requirements.txt | 2 +- 4 files changed, 21 insertions(+), 20 deletions(-) diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index bcbbb369..51ad892a 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -12,7 +12,7 @@ from tqdm import tqdm ######################## -def tweb(root_path, meta_file, **kwargs): +def tweb(root_path, meta_file, **kwargs): # pylint: disable=unused-argument """Normalize TWEB dataset. https://www.kaggle.com/bryanpark/the-world-english-bible-speech-dataset """ @@ -28,7 +28,7 @@ def tweb(root_path, meta_file, **kwargs): return items -def mozilla(root_path, meta_file, **kwargs): +def mozilla(root_path, meta_file, **kwargs): # pylint: disable=unused-argument """Normalizes Mozilla meta data files to TTS format""" txt_file = os.path.join(root_path, meta_file) items = [] @@ -43,7 +43,7 @@ def mozilla(root_path, meta_file, **kwargs): return items -def mozilla_de(root_path, meta_file, **kwargs): +def mozilla_de(root_path, meta_file, **kwargs): # pylint: disable=unused-argument """Normalizes Mozilla meta data files to TTS format""" txt_file = os.path.join(root_path, meta_file) items = [] @@ -102,7 +102,7 @@ def mailabs(root_path, meta_files=None): return items -def ljspeech(root_path, meta_file, **kwargs): +def ljspeech(root_path, meta_file, **kwargs): # pylint: disable=unused-argument """Normalizes the LJSpeech meta data file to TTS format https://keithito.com/LJ-Speech-Dataset/""" txt_file = os.path.join(root_path, meta_file) @@ -117,7 +117,7 @@ def ljspeech(root_path, meta_file, **kwargs): return items -def ljspeech_test(root_path, meta_file, **kwargs): +def ljspeech_test(root_path, meta_file, **kwargs): # pylint: disable=unused-argument """Normalizes the LJSpeech meta data file for TTS testing https://keithito.com/LJ-Speech-Dataset/""" txt_file = os.path.join(root_path, meta_file) @@ -131,7 +131,7 @@ def ljspeech_test(root_path, meta_file, **kwargs): return items -def sam_accenture(root_path, meta_file, **kwargs): +def sam_accenture(root_path, meta_file, **kwargs): # pylint: disable=unused-argument """Normalizes the sam-accenture meta data file to TTS format https://github.com/Sam-Accenture-Non-Binary-Voice/non-binary-voice-files""" xml_file = os.path.join(root_path, "voice_over_recordings", meta_file) @@ -148,7 +148,7 @@ def sam_accenture(root_path, meta_file, **kwargs): return items -def ruslan(root_path, meta_file, **kwargs): +def ruslan(root_path, meta_file, **kwargs): # pylint: disable=unused-argument """Normalizes the RUSLAN meta data file to TTS format https://ruslan-corpus.github.io/""" txt_file = os.path.join(root_path, meta_file) @@ -163,7 +163,7 @@ def ruslan(root_path, meta_file, **kwargs): return items -def css10(root_path, meta_file, **kwargs): +def css10(root_path, meta_file, **kwargs): # pylint: disable=unused-argument """Normalizes the CSS10 dataset file to TTS format""" txt_file = os.path.join(root_path, meta_file) items = [] @@ -177,7 +177,7 @@ def css10(root_path, meta_file, **kwargs): return items -def nancy(root_path, meta_file, **kwargs): +def nancy(root_path, meta_file, **kwargs): # pylint: disable=unused-argument 
"""Normalizes the Nancy meta data file to TTS format""" txt_file = os.path.join(root_path, meta_file) items = [] @@ -240,7 +240,7 @@ def libri_tts(root_path, meta_files=None, ununsed_speakers=None): return items -def custom_turkish(root_path, meta_file, **kwargs): +def custom_turkish(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "turkish-female" @@ -298,7 +298,7 @@ def vctk(root_path, meta_files=None, wavs_path="wav48", ununsed_speakers=None): return items -def vctk_slim(root_path, meta_files=None, wavs_path="wav48", ununsed_speakers=None): +def vctk_slim(root_path, meta_files=None, wavs_path="wav48", ununsed_speakers=None): # pylint: disable=unused-argument """homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz""" items = [] txt_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) @@ -333,14 +333,14 @@ def mls(root_path, meta_files=None, ununsed_speakers=None): # ======================================== VOX CELEB =========================================== -def voxceleb2(root_path, meta_file=None, **kwargs): +def voxceleb2(root_path, meta_file=None, **kwargs): # pylint: disable=unused-argument """ :param meta_file Used only for consistency with load_tts_samples api """ return _voxcel_x(root_path, meta_file, voxcel_idx="2") -def voxceleb1(root_path, meta_file=None, **kwargs): +def voxceleb1(root_path, meta_file=None, **kwargs): # pylint: disable=unused-argument """ :param meta_file Used only for consistency with load_tts_samples api """ @@ -382,7 +382,7 @@ def _voxcel_x(root_path, meta_file, voxcel_idx): return [x.strip().split("|") for x in f.readlines()] -def baker(root_path: str, meta_file: str, **kwargs) -> List[List[str]]: +def baker(root_path: str, meta_file: str, **kwargs) -> List[List[str]]: # pylint: disable=unused-argument """Normalizes the Baker meta data file to TTS format Args: @@ -402,7 +402,7 @@ def baker(root_path: str, meta_file: str, **kwargs) -> List[List[str]]: return items -def kokoro(root_path, meta_file, **kwargs): +def kokoro(root_path, meta_file, **kwargs): # pylint: disable=unused-argument """Japanese single-speaker dataset from https://github.com/kaiidams/Kokoro-Speech-Dataset""" txt_file = os.path.join(root_path, meta_file) items = [] diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 73dfbb2e..417b6386 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -644,7 +644,7 @@ class Vits(BaseTTS): return self._log(ap, batch, outputs, "eval") @torch.no_grad() - def test_run(self, ap, eval_loader=None) -> Tuple[Dict, Dict]: + def test_run(self, ap) -> Tuple[Dict, Dict]: """Generic test run for `tts` models used by `Trainer`. You can override this for a different behaviour. 
diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 3d8590cc..5d883fd0 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -218,10 +218,11 @@ class SpeakerManager: def get_random_speaker_aux_input(self) -> Dict: if self.d_vectors: return {"speaker_id": None, "style_wav": None, "d_vector": self.d_vectors[random.choices(list(self.d_vectors.keys()))[0]]["embedding"]} - elif self.speaker_ids: + + if self.speaker_ids: return {"speaker_id": self.speaker_ids[random.choices(list(self.speaker_ids.keys()))[0]], "style_wav": None, "d_vector": None} - else: - return {"speaker_id": None, "style_wav": None, "d_vector": None} + + return {"speaker_id": None, "style_wav": None, "d_vector": None} def get_speakers(self) -> List: return self.speaker_ids diff --git a/requirements.txt b/requirements.txt index d21a972f..3ec33ceb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,7 +13,7 @@ pyyaml scipy>=0.19.0 soundfile tensorboardX -torch>=1.9.0 +torch>=1.7 tqdm numba==0.53 umap-learn==0.5.1 From 829ee55b04890f94e2a71d2e62b405da61db228d Mon Sep 17 00:00:00 2001 From: Edresson Date: Fri, 13 Aug 2021 19:58:56 -0300 Subject: [PATCH 006/220] Implement multilingual dataloader support --- TTS/config/shared_configs.py | 3 +++ TTS/trainer.py | 14 ++++++++++++++ TTS/tts/datasets/__init__.py | 6 ++++++ TTS/tts/datasets/dataset.py | 23 +++++++++++++++++++---- TTS/tts/models/base_tts.py | 31 ++++++++++++++++++++++++++----- TTS/tts/utils/text/cleaners.py | 8 ++++++++ 6 files changed, 76 insertions(+), 9 deletions(-) diff --git a/TTS/config/shared_configs.py b/TTS/config/shared_configs.py index d91bf2b6..f1ea2e0f 100644 --- a/TTS/config/shared_configs.py +++ b/TTS/config/shared_configs.py @@ -199,6 +199,7 @@ class BaseDatasetConfig(Coqpit): path: str = "" meta_file_train: str = "" ununsed_speakers: List[str] = None + language: str = "" meta_file_val: str = "" meta_file_attn_mask: str = "" @@ -335,6 +336,8 @@ class BaseTrainingConfig(Coqpit): num_loader_workers: int = 0 num_eval_loader_workers: int = 0 use_noise_augment: bool = False + use_language_weighted_sampler: bool = False + # paths output_path: str = None # distributed diff --git a/TTS/trainer.py b/TTS/trainer.py index 2a2cfc46..2175875c 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -260,6 +260,20 @@ class Trainer: else: self.run_get_model(self.config, get_model) + if hasattr(self.model, "init_multilingual"): + self.model.init_multilingual(self.config, self.data_train + self.data_eval) + config = self.config.model_args if hasattr(self.config, "model_args") else self.config + # save speakers json + if config.use_language_embedding and self.model.language_manager.num_languages > 1: + self.model.language_manager.save_language_ids_to_file(os.path.join(self.output_path, "language_ids.json")) + if hasattr(self.config, "model_args"): + self.config.model_args["num_languages"] = self.model.language_manager.num_languages + else: + self.config.num_languages = self.model.language_manager.num_languages + + # update config file + copy_model_files(self.config, self.output_path, None) + # setup criterion self.criterion = self.get_criterion(self.model) diff --git a/TTS/tts/datasets/__init__.py b/TTS/tts/datasets/__init__.py index 741f92fd..3673e188 100644 --- a/TTS/tts/datasets/__init__.py +++ b/TTS/tts/datasets/__init__.py @@ -68,16 +68,22 @@ def load_tts_samples( meta_file_train = dataset["meta_file_train"] meta_file_val = dataset["meta_file_val"] ununsed_speakers = dataset["ununsed_speakers"] + language = dataset["language"] 
+ # setup the right data processor if formatter is None: formatter = _get_formatter_by_name(name) # load train set meta_data_train = formatter(root_path, meta_file_train, ununsed_speakers=ununsed_speakers) + # TODO: remove the loops and pass language as a parameter to preprocessor for faster load + meta_data_train = [[*item, language] for item in meta_data_train] + print(f" | > Found {len(meta_data_train)} files in {Path(root_path).resolve()}") # load evaluation split if set if eval_split: if meta_file_val: meta_data_eval = formatter(root_path, meta_file_val, ununsed_speakers=ununsed_speakers) + meta_data_eval = [[*item, language] for item in meta_data_eval] else: meta_data_eval, meta_data_train = split_dataset(meta_data_train) meta_data_eval_all += meta_data_eval diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index 04314bab..7ba97eba 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -37,6 +37,7 @@ class TTSDataset(Dataset): enable_eos_bos: bool = False, speaker_id_mapping: Dict = None, d_vector_mapping: Dict = None, + language_id_mapping: Dict = None, use_noise_augment: bool = False, verbose: bool = False, ): @@ -122,6 +123,7 @@ class TTSDataset(Dataset): self.enable_eos_bos = enable_eos_bos self.speaker_id_mapping = speaker_id_mapping self.d_vector_mapping = d_vector_mapping + self.language_id_mapping = language_id_mapping self.use_noise_augment = use_noise_augment self.verbose = verbose self.input_seq_computed = False @@ -197,10 +199,10 @@ class TTSDataset(Dataset): def load_data(self, idx): item = self.items[idx] - if len(item) == 4: - text, wav_file, speaker_name, attn_file = item + if len(item) == 5: + text, wav_file, speaker_name, language_name, attn_file = item else: - text, wav_file, speaker_name = item + text, wav_file, speaker_name, language_name = item attn = None raw_text = text @@ -218,7 +220,7 @@ class TTSDataset(Dataset): self.phoneme_cache_path, self.enable_eos_bos, self.cleaners, - self.phoneme_language, + language_name if language_name else self.phoneme_language, self.custom_symbols, self.characters, self.add_blank, @@ -260,6 +262,7 @@ class TTSDataset(Dataset): "attn": attn, "item_idx": self.items[idx][1], "speaker_name": speaker_name, + "language_name": language_name, "wav_file_name": os.path.basename(wav_file), } return sample @@ -413,6 +416,14 @@ class TTSDataset(Dataset): # convert list of dicts to dict of lists batch = {k: [dic[k] for dic in batch] for k in batch[0]} + speaker_names = [batch[idx]["speaker_name"] for idx in ids_sorted_decreasing] + + # get language ids from language names + if self.language_id_mapping is not None: + language_names = [batch[idx]["language_name"] for idx in ids_sorted_decreasing] + language_ids = [self.language_id_mapping[ln] for ln in language_names] + else: + language_ids = None # get pre-computed d-vectors if self.d_vector_mapping is not None: wav_files_names = [batch["wav_file_name"][idx] for idx in ids_sorted_decreasing] @@ -466,6 +477,9 @@ class TTSDataset(Dataset): if speaker_ids is not None: speaker_ids = torch.LongTensor(speaker_ids) + if language_ids is not None: + language_ids = torch.LongTensor(language_ids) + # compute linear spectrogram if self.compute_linear_spec: linear = [self.ap.spectrogram(w).astype("float32") for w in batch["wav"]] @@ -528,6 +542,7 @@ class TTSDataset(Dataset): "waveform": wav_padded, "raw_text": batch["raw_text"], "pitch": pitch, + "language_ids": language_ids } raise TypeError( diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py 
index 854526de..c55936a8 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -13,6 +13,7 @@ from TTS.model import BaseModel from TTS.tts.configs.shared_configs import CharactersConfig from TTS.tts.datasets.dataset import TTSDataset from TTS.tts.utils.speakers import SpeakerManager, get_speaker_manager +from TTS.tts.utils.languages import LanguageManager, get_language_weighted_sampler from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.text import make_symbols from TTS.tts.utils.visual import plot_alignment, plot_spectrogram @@ -73,9 +74,18 @@ class BaseTTS(BaseModel): def get_speaker_manager(config: Coqpit, restore_path: str, data: List, out_path: str = None) -> SpeakerManager: return get_speaker_manager(config, restore_path, data, out_path) - def init_multispeaker(self, config: Coqpit): - """Init speaker embedding layer if `use_speaker_embedding` is True and set the expected speaker embedding - vector dimension in the network. If model uses d-vectors, then it only sets the expected dimension. + def init_multispeaker(self, config: Coqpit, data: List = None): + """Initialize a speaker embedding layer if needen and define expected embedding channel size for defining + `in_channels` size of the connected layers. + + This implementation yields 3 possible outcomes: + + 1. If `config.use_speaker_embedding` and `config.use_d_vector_file are False, do nothing. + 2. If `config.use_d_vector_file` is True, set expected embedding channel size to `config.d_vector_dim` or 512. + 3. If `config.use_speaker_embedding`, initialize a speaker embedding layer with channel size of + `config.d_vector_dim` or 512. + + You can override this function for new models. Args: config (Coqpit): Model configuration. @@ -122,6 +132,7 @@ class BaseTTS(BaseModel): attn_mask = batch["attns"] waveform = batch["waveform"] pitch = batch["pitch"] + language_ids = batch["language_ids"] max_text_length = torch.max(text_lengths.float()) max_spec_length = torch.max(mel_lengths.float()) @@ -169,6 +180,7 @@ class BaseTTS(BaseModel): "item_idx": item_idx, "waveform": waveform, "pitch": pitch, + "language_ids": language_ids, } def get_data_loader( @@ -199,7 +211,12 @@ class BaseTTS(BaseModel): if hasattr(self, "make_symbols"): custom_symbols = self.make_symbols(self.config) - # init dataset + if hasattr(self, "language_manager"): + language_id_mapping = self.language_manager.language_id_mapping if self.args.use_language_embedding else None + else: + language_id_mapping = None + + # init dataloader dataset = TTSDataset( outputs_per_step=config.r if "r" in config else 1, text_cleaner=config.text_cleaner, @@ -223,6 +240,7 @@ class BaseTTS(BaseModel): verbose=verbose, speaker_id_mapping=speaker_id_mapping, d_vector_mapping=d_vector_mapping if config.use_d_vector_file else None, + language_id_mapping=language_id_mapping, ) # pre-compute phonemes @@ -267,8 +285,11 @@ class BaseTTS(BaseModel): # sampler for DDP sampler = DistributedSampler(dataset) if num_gpus > 1 else None + if sampler is None: + if getattr(config, "use_language_weighted_sampler", False): + sampler = get_language_weighted_sampler(dataset.items) + print(" > Using Language weighted sampler") - # init dataloader loader = DataLoader( dataset, batch_size=config.eval_batch_size if is_eval else config.batch_size, diff --git a/TTS/tts/utils/text/cleaners.py b/TTS/tts/utils/text/cleaners.py index 4b041ed8..71155ebc 100644 --- a/TTS/tts/utils/text/cleaners.py +++ b/TTS/tts/utils/text/cleaners.py @@ -135,3 +135,11 @@ def phoneme_cleaners(text): text = 
remove_aux_symbols(text) text = collapse_whitespace(text) return text + +def multilingual_cleaners(text): + '''Pipeline for multilingual text''' + text = lowercase(text) + text = replace_symbols(text, lang=None) + text = remove_aux_symbols(text) + text = collapse_whitespace(text) + return text \ No newline at end of file From d0e3647db62297f385f9def65e320919e020a4e9 Mon Sep 17 00:00:00 2001 From: Edresson Date: Fri, 13 Aug 2021 21:40:34 -0300 Subject: [PATCH 007/220] Add multilingual training support to the VITS model --- TTS/tts/layers/glow_tts/duration_predictor.py | 11 +- TTS/tts/layers/vits/networks.py | 12 +- .../vits/stochastic_duration_predictor.py | 11 +- TTS/tts/models/base_tts.py | 3 +- TTS/tts/models/vits.py | 87 +++++++++-- TTS/tts/utils/languages.py | 138 ++++++++++++++++++ TTS/tts/utils/speakers.py | 11 +- 7 files changed, 248 insertions(+), 25 deletions(-) create mode 100644 TTS/tts/utils/languages.py diff --git a/TTS/tts/layers/glow_tts/duration_predictor.py b/TTS/tts/layers/glow_tts/duration_predictor.py index 2c0303be..f46c73a9 100644 --- a/TTS/tts/layers/glow_tts/duration_predictor.py +++ b/TTS/tts/layers/glow_tts/duration_predictor.py @@ -18,7 +18,7 @@ class DurationPredictor(nn.Module): dropout_p (float): Dropout rate used after each conv layer. """ - def __init__(self, in_channels, hidden_channels, kernel_size, dropout_p, cond_channels=None): + def __init__(self, in_channels, hidden_channels, kernel_size, dropout_p, cond_channels=None, language_emb_dim=None): super().__init__() # class arguments self.in_channels = in_channels @@ -36,7 +36,10 @@ class DurationPredictor(nn.Module): if cond_channels is not None and cond_channels != 0: self.cond = nn.Conv1d(cond_channels, in_channels, 1) - def forward(self, x, x_mask, g=None): + if language_emb_dim != 0 and language_emb_dim is not None: + self.cond_lang = nn.Conv1d(language_emb_dim, in_channels, 1) + + def forward(self, x, x_mask, g=None, lang_emb=None): """ Shapes: - x: :math:`[B, C, T]` @@ -45,6 +48,10 @@ class DurationPredictor(nn.Module): """ if g is not None: x = x + self.cond(g) + + if lang_emb is not None: + x = x + self.cond_lang(lang_emb) + x = self.conv_1(x * x_mask) x = torch.relu(x) x = self.norm_1(x) diff --git a/TTS/tts/layers/vits/networks.py b/TTS/tts/layers/vits/networks.py index cfc8b6ac..ef426ace 100644 --- a/TTS/tts/layers/vits/networks.py +++ b/TTS/tts/layers/vits/networks.py @@ -37,6 +37,7 @@ class TextEncoder(nn.Module): num_layers: int, kernel_size: int, dropout_p: float, + language_emb_dim: int = None, ): """Text Encoder for VITS model. 
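# Runnable shape sketch for the TextEncoder changes below (all sizes are assumptions): the
# character embeddings are concatenated with the language embedding, broadcast over time,
# before entering the relative-position transformer.
import torch

B, T, hidden_channels, language_emb_dim = 2, 50, 192, 4
x = torch.randn(B, T, hidden_channels)           # char embeddings [B, T, C]
lang_emb = torch.randn(B, language_emb_dim, 1)   # language embedding [B, C_lang, 1]
x = torch.cat((x, lang_emb.transpose(2, 1).expand(x.size(0), x.size(1), -1)), dim=-1)
assert x.shape == (B, T, hidden_channels + language_emb_dim)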
@@ -55,8 +56,12 @@ class TextEncoder(nn.Module): self.hidden_channels = hidden_channels self.emb = nn.Embedding(n_vocab, hidden_channels) + nn.init.normal_(self.emb.weight, 0.0, hidden_channels ** -0.5) + if language_emb_dim: + hidden_channels += language_emb_dim + self.encoder = RelativePositionTransformer( in_channels=hidden_channels, out_channels=hidden_channels, @@ -72,13 +77,18 @@ class TextEncoder(nn.Module): self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) - def forward(self, x, x_lengths): + def forward(self, x, x_lengths, lang_emb=None): """ Shapes: - x: :math:`[B, T]` - x_length: :math:`[B]` """ x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h] + + # concat the lang emb in embedding chars + if lang_emb is not None: + x = torch.cat((x, lang_emb.transpose(2, 1).expand(x.size(0), x.size(1), -1)), dim=-1) + x = torch.transpose(x, 1, -1) # [b, h, t] x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) diff --git a/TTS/tts/layers/vits/stochastic_duration_predictor.py b/TTS/tts/layers/vits/stochastic_duration_predictor.py index 91e53da3..8ec7c866 100644 --- a/TTS/tts/layers/vits/stochastic_duration_predictor.py +++ b/TTS/tts/layers/vits/stochastic_duration_predictor.py @@ -178,7 +178,7 @@ class StochasticDurationPredictor(nn.Module): """ def __init__( - self, in_channels: int, hidden_channels: int, kernel_size: int, dropout_p: float, num_flows=4, cond_channels=0 + self, in_channels: int, hidden_channels: int, kernel_size: int, dropout_p: float, num_flows=4, cond_channels=0, language_emb_dim=None ): super().__init__() @@ -205,7 +205,10 @@ class StochasticDurationPredictor(nn.Module): if cond_channels != 0 and cond_channels is not None: self.cond = nn.Conv1d(cond_channels, hidden_channels, 1) - def forward(self, x, x_mask, dr=None, g=None, reverse=False, noise_scale=1.0): + if language_emb_dim != 0 and language_emb_dim is not None: + self.cond_lang = nn.Conv1d(language_emb_dim, hidden_channels, 1) + + def forward(self, x, x_mask, dr=None, g=None, lang_emb=None, reverse=False, noise_scale=1.0): """ Shapes: - x: :math:`[B, C, T]` @@ -217,6 +220,10 @@ class StochasticDurationPredictor(nn.Module): x = self.pre(x) if g is not None: x = x + self.cond(g) + + if lang_emb is not None: + x = x + self.cond_lang(lang_emb) + x = self.convs(x, x_mask) x = self.proj(x) * x_mask diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index c55936a8..c0d2bd78 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -287,8 +287,9 @@ class BaseTTS(BaseModel): sampler = DistributedSampler(dataset) if num_gpus > 1 else None if sampler is None: if getattr(config, "use_language_weighted_sampler", False): - sampler = get_language_weighted_sampler(dataset.items) print(" > Using Language weighted sampler") + sampler = get_language_weighted_sampler(dataset.items) + loader = DataLoader( dataset, diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 417b6386..3a682ce5 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -17,6 +17,7 @@ from TTS.tts.layers.vits.stochastic_duration_predictor import StochasticDuration from TTS.tts.models.base_tts import BaseTTS from TTS.tts.utils.helpers import generate_path, maximum_path, rand_segments, segment, sequence_mask from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.languages import LanguageManager from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.visual import plot_alignment from TTS.utils.trainer_utils import get_optimizer, 
get_scheduler @@ -189,6 +190,9 @@ class VitsArgs(Coqpit): d_vector_file: str = None d_vector_dim: int = 0 detach_dp_input: bool = True + use_language_embedding: bool = False + embedded_language_dim: int = 4 + num_languages: int = 0 class Vits(BaseTTS): @@ -247,6 +251,7 @@ class Vits(BaseTTS): self.args = args self.init_multispeaker(config) + self.init_multilingual(config) self.length_scale = args.length_scale self.noise_scale = args.noise_scale @@ -265,6 +270,7 @@ class Vits(BaseTTS): args.num_layers_text_encoder, args.kernel_size_text_encoder, args.dropout_p_text_encoder, + language_emb_dim=self.embedded_language_dim ) self.posterior_encoder = PosteriorEncoder( @@ -288,16 +294,22 @@ class Vits(BaseTTS): if args.use_sdp: self.duration_predictor = StochasticDurationPredictor( - args.hidden_channels, + args.hidden_channels + self.embedded_language_dim, 192, 3, args.dropout_p_duration_predictor, 4, cond_channels=self.embedded_speaker_dim, + language_emb_dim=self.embedded_language_dim, ) else: self.duration_predictor = DurationPredictor( - args.hidden_channels, 256, 3, args.dropout_p_duration_predictor, cond_channels=self.embedded_speaker_dim + args.hidden_channels + self.embedded_language_dim, + 256, + 3, + args.dropout_p_duration_predictor, + cond_channels=self.embedded_speaker_dim, + language_emb_dim=self.embedded_language_dim, ) self.waveform_decoder = HifiganGenerator( @@ -356,17 +368,40 @@ class Vits(BaseTTS): self.speaker_manager = SpeakerManager(d_vectors_file_path=config.d_vector_file) self.embedded_speaker_dim = config.d_vector_dim + def init_multilingual(self, config: Coqpit, data: List = None): + """Initialize multilingual modules of a model. + + Args: + config (Coqpit): Model configuration. + data (List, optional): Dataset items to infer number of speakers. Defaults to None. + """ + if hasattr(config, "model_args"): + config = config.model_args + # init language manager + self.language_manager = LanguageManager(config, data=data) + + # init language embedding layer + if config.use_language_embedding: + self.embedded_language_dim = config.embedded_language_dim + self.emb_l = nn.Embedding(self.language_manager.num_languages, self.embedded_language_dim) + torch.nn.init.xavier_uniform_(self.emb_l.weight) + else: + self.embedded_language_dim = 0 + self.emb_l = None + @staticmethod def _set_cond_input(aux_input: Dict): """Set the speaker conditioning input based on the multi-speaker mode.""" - sid, g = None, None + sid, g, lid = None, None, None if "speaker_ids" in aux_input and aux_input["speaker_ids"] is not None: sid = aux_input["speaker_ids"] if sid.ndim == 0: sid = sid.unsqueeze_(0) if "d_vectors" in aux_input and aux_input["d_vectors"] is not None: g = F.normalize(aux_input["d_vectors"]).unsqueeze(-1) - return sid, g + if "language_ids" in aux_input and aux_input["language_ids"] is not None: + lid = aux_input["language_ids"] + return sid, g, lid def get_aux_input(self, aux_input: Dict): sid, g = self._set_cond_input(aux_input) @@ -378,7 +413,7 @@ class Vits(BaseTTS): x_lengths: torch.tensor, y: torch.tensor, y_lengths: torch.tensor, - aux_input={"d_vectors": None, "speaker_ids": None}, + aux_input={"d_vectors": None, "speaker_ids": None, "language_ids": None}, ) -> Dict: """Forward pass of the model. 
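# Runnable sketch of the language embedding lookup used in the forward pass below
# (sizes and ids are assumptions): each utterance in the batch carries a language id,
# which becomes a [B, C, 1] conditioning tensor `lang_emb` for the text encoder and
# duration predictor.
import torch
from torch import nn

num_languages, embedded_language_dim = 3, 4
emb_l = nn.Embedding(num_languages, embedded_language_dim)
language_ids = torch.LongTensor([0, 2])        # one language id per batch item
lang_emb = emb_l(language_ids).unsqueeze(-1)   # [2, 4, 1]
assert lang_emb.shape == (2, embedded_language_dim, 1)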
@@ -401,13 +436,19 @@ class Vits(BaseTTS): - speaker_ids: :math:`[B]` """ outputs = {} - sid, g = self._set_cond_input(aux_input) - x, m_p, logs_p, x_mask = self.text_encoder(x, x_lengths) + sid, g, lid = self._set_cond_input(aux_input) # speaker embedding if self.num_speakers > 1 and sid is not None and not self.use_d_vector: g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] + # language embedding + if self.args.use_language_embedding: + lang_emb = self.emb_l(lid).unsqueeze(-1) + + x, m_p, logs_p, x_mask = self.text_encoder(x, x_lengths, lang_emb=lang_emb) + + # posterior encoder z, m_q, logs_q, y_mask = self.posterior_encoder(y, y_lengths, g=g) @@ -433,6 +474,7 @@ class Vits(BaseTTS): x_mask, attn_durations, g=g.detach() if self.args.detach_dp_input and g is not None else g, + lang_emb=lang_emb.detach() if self.args.detach_dp_input and lang_emb is not None else lang_emb, ) loss_duration = loss_duration / torch.sum(x_mask) else: @@ -441,6 +483,7 @@ class Vits(BaseTTS): x.detach() if self.args.detach_dp_input else x, x_mask, g=g.detach() if self.args.detach_dp_input and g is not None else g, + lang_emb=lang_emb.detach() if self.args.detach_dp_input and lang_emb is not None else lang_emb, ) loss_duration = torch.sum((log_durations - attn_log_durations) ** 2, [1, 2]) / torch.sum(x_mask) outputs["loss_duration"] = loss_duration @@ -467,25 +510,30 @@ class Vits(BaseTTS): ) return outputs - def inference(self, x, aux_input={"d_vectors": None, "speaker_ids": None}): + def inference(self, x, aux_input={"d_vectors": None, "speaker_ids": None, "language_ids": None}): """ Shapes: - x: :math:`[B, T_seq]` - d_vectors: :math:`[B, C, 1]` - speaker_ids: :math:`[B]` """ - sid, g = self._set_cond_input(aux_input) + sid, g, lid = self._set_cond_input(aux_input) x_lengths = torch.tensor(x.shape[1:2]).to(x.device) - x, m_p, logs_p, x_mask = self.text_encoder(x, x_lengths) - - if self.num_speakers > 0 and sid is not None: + # speaker embedding + if self.num_speakers > 0 and sid: g = self.emb_g(sid).unsqueeze(-1) + # language embedding + if self.args.use_language_embedding: + lang_emb = self.emb_l(lid).unsqueeze(-1) + + x, m_p, logs_p, x_mask = self.text_encoder(x, x_lengths, lang_emb=lang_emb) + if self.args.use_sdp: - logw = self.duration_predictor(x, x_mask, g=g, reverse=True, noise_scale=self.inference_noise_scale_dp) + logw = self.duration_predictor(x, x_mask, g=g, reverse=True, noise_scale=self.inference_noise_scale_dp, lang_emb=lang_emb) else: - logw = self.duration_predictor(x, x_mask, g=g) + logw = self.duration_predictor(x, x_mask, g=g, lang_emb=lang_emb) w = torch.exp(logw) * x_mask * self.length_scale w_ceil = torch.ceil(w) @@ -537,6 +585,7 @@ class Vits(BaseTTS): linear_input = batch["linear_input"] d_vectors = batch["d_vectors"] speaker_ids = batch["speaker_ids"] + language_ids = batch["language_ids"] waveform = batch["waveform"] # generator pass @@ -545,7 +594,7 @@ class Vits(BaseTTS): text_lengths, linear_input.transpose(1, 2), mel_lengths, - aux_input={"d_vectors": d_vectors, "speaker_ids": speaker_ids}, + aux_input={"d_vectors": d_vectors, "speaker_ids": speaker_ids, "language_ids": language_ids}, ) # cache tensors for the discriminator @@ -581,6 +630,14 @@ class Vits(BaseTTS): loss_duration=outputs["loss_duration"], ) + # handle the duration loss + if self.args.use_sdp: + loss_dict["nll_duration"] = outputs["nll_duration"] + loss_dict["loss"] += outputs["nll_duration"] + else: + loss_dict["loss_duration"] = outputs["loss_duration"] + loss_dict["loss"] += outputs["loss_duration"] + elif 
optimizer_idx == 1: # discriminator pass outputs = {} diff --git a/TTS/tts/utils/languages.py b/TTS/tts/utils/languages.py new file mode 100644 index 00000000..b87b9936 --- /dev/null +++ b/TTS/tts/utils/languages.py @@ -0,0 +1,138 @@ +import os +import json +import torch +import fsspec +import numpy as np +from typing import Dict, Tuple, List +from coqpit import Coqpit + +from torch.utils.data.sampler import WeightedRandomSampler + +class LanguageManager: + """Manage the languages for multi-lingual 🐸TTS models. Load a datafile and parse the information + in a way that can be queried by language. + + Args: + language_id_file_path (str, optional): Path to the metafile that maps language names to ids used by + TTS models. Defaults to "". + + Examples: + >>> manager = LanguageManager(language_id_file_path=language_id_file_path) + >>> language_id_mapper = manager.language_ids + """ + num_languages: int = 0 + language_id_mapping: Dict = {} + def __init__( + self, + language_id_file_path: str = "", + ): + if language_id_file_path: + self.set_language_ids_from_file(language_id_file_path) + + @staticmethod + def _load_json(json_file_path: str) -> Dict: + with fsspec.open(json_file_path, "r") as f: + return json.load(f) + + @staticmethod + def _save_json(json_file_path: str, data: dict) -> None: + with fsspec.open(json_file_path, "w") as f: + json.dump(data, f, indent=4) + + @property + def num_languages(self) -> int: + return len(list(self.language_id_mapping.keys())) + + @property + def language_names(self) -> List: + return list(self.language_id_mapping.keys()) + + @staticmethod + def parse_languages_from_data(items: list) -> Tuple[Dict, int]: + """Parse language IDs from data samples retured by `load_meta_data()`. + + Args: + items (list): Data sampled returned by `load_meta_data()`. + + Returns: + Tuple[Dict, int]: language IDs and number of languages. + """ + languages = sorted({item[3] for item in items}) + language_ids = {name: i for i, name in enumerate(languages)} + num_languages = len(language_ids) + return language_ids, num_languages + + def set_language_ids_from_data(self, items: List) -> None: + """Set language IDs from data samples. + + Args: + items (List): Data sampled returned by `load_meta_data()`. + """ + self.language_id_mapping, _ = self.parse_languages_from_data(items) + + def set_language_ids_from_file(self, file_path: str) -> None: + """Load language ids from a json file. + + Args: + file_path (str): Path to the target json file. + """ + self.language_id_mapping = self._load_json(file_path) + self.num_languages = len(self.language_id_mapping) + + def save_language_ids_to_file(self, file_path: str) -> None: + """Save language IDs to a json file. + + Args: + file_path (str): Path to the output file. + """ + self._save_json(file_path, self.language_id_mapping) + +def _set_file_path(path): + """Find the language_ids.json under the given path or the above it. + Intended to band aid the different paths returned in restored and continued training.""" + path_restore = os.path.join(os.path.dirname(path), "language_ids.json") + path_continue = os.path.join(path, "language_ids.json") + fs = fsspec.get_mapper(path).fs + if fs.exists(path_restore): + return path_restore + if fs.exists(path_continue): + return path_continue + return None + +def get_language_manager(c: Coqpit, data: List = None, restore_path: str = None, out_path: str = None) -> LanguageManager: + """Initiate a `LanguageManager` instance by the provided config. + + Args: + c (Coqpit): Model configuration. 
+ restore_path (str): Path to a previous training folder. + data (List): Data sampled returned by `load_meta_data()`. Defaults to None. + out_path (str, optional): Save the generated language IDs to a output path. Defaults to None. + + Returns: + SpeakerManager: initialized and ready to use instance. + """ + language_manager = LanguageManager() + if c.use_language_embedding: + if data is not None: + language_manager.set_language_ids_from_data(data) + if restore_path: + language_file = _set_file_path(restore_path) + # restoring language manager from a previous run. + if language_file: + language_manager.set_language_ids_from_file(language_file) + if language_manager.num_languages > 0: + print( + " > Language manager is loaded with {} languages: {}".format( + language_manager.num_languages, ", ".join(language_manager.language_names) + ) + ) + return language_manager + +def get_language_weighted_sampler(items: list): + language_names = np.array([item[3] for item in items]) + unique_language_names = np.unique(language_names).tolist() + language_ids = [unique_language_names.index(l) for l in language_names] + language_count = np.array([len(np.where(language_names == l)[0]) for l in unique_language_names]) + weight_language = 1. / language_count + dataset_samples_weight = torch.from_numpy(np.array([weight_language[l] for l in language_ids])).double() + return WeightedRandomSampler(dataset_samples_weight, len(dataset_samples_weight)) \ No newline at end of file diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 5d883fd0..b7dd5251 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -379,11 +379,14 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, elif c.use_speaker_embedding and "speakers_file" in c and c.speakers_file: # new speaker manager with speaker IDs file. speaker_manager.set_speaker_ids_from_file(c.speakers_file) - print( - " > Speaker manager is loaded with {} speakers: {}".format( - speaker_manager.num_speakers, ", ".join(speaker_manager.speaker_ids) + + if speaker_manager.num_speakers > 0: + print( + " > Speaker manager is loaded with {} speakers: {}".format( + speaker_manager.num_speakers, ", ".join(speaker_manager.speaker_ids) + ) ) - ) + # save file if path is defined if out_path: out_file_path = os.path.join(out_path, "speakers.json") From 8e83a212fa91de6816909a3fb174e7978b1f2655 Mon Sep 17 00:00:00 2001 From: Edresson Date: Sat, 14 Aug 2021 17:52:00 -0300 Subject: [PATCH 008/220] Add multilingual inference support --- TTS/tts/configs/vits_config.py | 12 +++---- TTS/tts/models/base_tts.py | 45 +++++++++++++++++++++++++++ TTS/tts/models/vits.py | 57 +++++++++++++++++++--------------- TTS/tts/utils/speakers.py | 41 +++++++++++++++++++++--- TTS/tts/utils/synthesis.py | 22 ++++++++----- 5 files changed, 133 insertions(+), 44 deletions(-) diff --git a/TTS/tts/configs/vits_config.py b/TTS/tts/configs/vits_config.py index d490e6e6..3e031f02 100644 --- a/TTS/tts/configs/vits_config.py +++ b/TTS/tts/configs/vits_config.py @@ -130,13 +130,13 @@ class VitsConfig(BaseTTSConfig): add_blank: bool = True # testing - test_sentences: List[str] = field( + test_sentences: List[List] = field( default_factory=lambda: [ - "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", - "Be a voice, not an echo.", - "I'm sorry Dave. I'm afraid I can't do that.", - "This cake is great. 
It's so delicious and moist.", - "Prior to November 22, 1963.", + ["It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."], + ["Be a voice, not an echo."], + ["I'm sorry Dave. I'm afraid I can't do that."], + ["This cake is great. It's so delicious and moist."], + ["Prior to November 22, 1963."], ] ) diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index c0d2bd78..bfa6df14 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -107,6 +107,51 @@ class BaseTTS(BaseModel): self.speaker_embedding = nn.Embedding(self.num_speakers, self.embedded_speaker_dim) self.speaker_embedding.weight.data.normal_(0, 0.3) + def get_aux_input(self, **kwargs) -> Dict: + """Prepare and return `aux_input` used by `forward()`""" + return {"speaker_id": None, "style_wav": None, "d_vector": None, "language_id": None} + + def get_aux_input_from_test_setences(self, sentence_info): + if hasattr(self.config, "model_args"): + config = self.config.model_args + else: + config = self.config + + # extract speaker and language info + text, speaker_name, style_wav, language_name = None, None, None, None + + if isinstance(sentence_info, list): + if len(sentence_info) == 1: + text = sentence_info[0] + elif len(sentence_info) == 2: + text, speaker_name = sentence_info + elif len(sentence_info) == 3: + text, speaker_name, style_wav = sentence_info + elif len(sentence_info) == 4: + text, speaker_name, style_wav, language_name = sentence_info + else: + text = sentence_info + + # get speaker id/d_vector + speaker_id, d_vector, language_id = None, None, None + if hasattr(self, "speaker_manager") and config.use_speaker_embedding: + if config.use_d_vector_file: + if speaker_name is None: + d_vector = self.speaker_manager.get_random_d_vector() + else: + d_vector = self.speaker_manager.get_d_vector_by_speaker(speaker_name) + else: + if speaker_name is None: + speaker_id = self.speaker_manager.get_random_speaker_id() + else: + speaker_id = self.speaker_manager.speaker_ids[speaker_name] + + # get language id + if hasattr(self, "language_manager") and config.use_language_embedding and language_name is not None: + language_id = self.language_manager.language_id_mapping[language_name] + + return {"text": text, "speaker_id": speaker_id, "style_wav": style_wav, "d_vector": d_vector, "language_id": language_id} + def format_batch(self, batch: Dict) -> Dict: """Generic batch formatting for `TTSDataset`. 
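# Illustrative sketch (speaker and language names below are made up) of the new List[List]
# `test_sentences` format parsed by get_aux_input_from_test_setences() above: each entry may
# optionally add a speaker name, a style wav path and a language name.
test_sentences = [
    ["Be a voice, not an echo."],                           # model defaults / random speaker
    ["Be a voice, not an echo.", "VCTK_p225"],              # named speaker
    ["Be a voice, not an echo.", "VCTK_p225", None, "en"],  # named speaker + language
]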
diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 3a682ce5..11f1fab0 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -399,8 +399,14 @@ class Vits(BaseTTS): sid = sid.unsqueeze_(0) if "d_vectors" in aux_input and aux_input["d_vectors"] is not None: g = F.normalize(aux_input["d_vectors"]).unsqueeze(-1) + if g.ndim == 2: + g = g.unsqueeze_(0) + if "language_ids" in aux_input and aux_input["language_ids"] is not None: lid = aux_input["language_ids"] + if lid.ndim == 0: + lid = lid.unsqueeze_(0) + return sid, g, lid def get_aux_input(self, aux_input: Dict): @@ -437,9 +443,8 @@ class Vits(BaseTTS): """ outputs = {} sid, g, lid = self._set_cond_input(aux_input) - # speaker embedding - if self.num_speakers > 1 and sid is not None and not self.use_d_vector: + if self.args.use_speaker_embedding and sid is not None and not self.use_d_vector: g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] # language embedding @@ -521,11 +526,11 @@ class Vits(BaseTTS): x_lengths = torch.tensor(x.shape[1:2]).to(x.device) # speaker embedding - if self.num_speakers > 0 and sid: + if self.args.use_speaker_embedding and sid is not None and not self.use_d_vector: g = self.emb_g(sid).unsqueeze(-1) # language embedding - if self.args.use_language_embedding: + if self.args.use_language_embedding and lid is not None: lang_emb = self.emb_l(lid).unsqueeze(-1) x, m_p, logs_p, x_mask = self.text_encoder(x, x_lengths, lang_emb=lang_emb) @@ -713,29 +718,29 @@ class Vits(BaseTTS): test_audios = {} test_figures = {} test_sentences = self.config.test_sentences - if hasattr(self, "speaker_manager"): - aux_inputs = self.speaker_manager.get_random_speaker_aux_input() - else: - aux_inputs = self.get_aux_input() - for idx, sen in enumerate(test_sentences): + for idx, s_info in enumerate(test_sentences): + try: + aux_inputs = self.get_aux_input_from_test_setences(s_info) + wav, alignment, _, _ = synthesis( + self, + aux_inputs["text"], + self.config, + "cuda" in str(next(self.parameters()).device), + ap, + speaker_id=aux_inputs["speaker_id"], + d_vector=aux_inputs["d_vector"], + style_wav=aux_inputs["style_wav"], + language_id=aux_inputs["language_id"], + enable_eos_bos_chars=self.config.enable_eos_bos_chars, + use_griffin_lim=True, + do_trim_silence=False, + ).values() - wav, alignment, _, _ = synthesis( - self, - sen, - self.config, - "cuda" in str(next(self.parameters()).device), - ap, - speaker_id=aux_inputs["speaker_id"], - d_vector=aux_inputs["d_vector"], - style_wav=aux_inputs["style_wav"], - enable_eos_bos_chars=self.config.enable_eos_bos_chars, - use_griffin_lim=True, - do_trim_silence=False, - ).values() - - test_audios["{}-audio".format(idx)] = wav - test_figures["{}-alignment".format(idx)] = plot_alignment(alignment.T, output_fig=False) + test_audios["{}-audio".format(idx)] = wav + test_figures["{}-alignment".format(idx)] = plot_alignment(alignment.T, output_fig=False) + except: # pylint: disable=bare-except + print(" !! 
Error creating Test Sentence -", idx) return test_figures, test_audios def get_optimizer(self) -> List: @@ -832,3 +837,5 @@ class Vits(BaseTTS): if eval: self.eval() assert not self.training + + diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index b7dd5251..1497ca74 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -193,6 +193,20 @@ class SpeakerManager: """ return [x["embedding"] for x in self.d_vectors.values() if x["name"] == speaker_idx] + def get_d_vector_by_speaker(self, speaker_idx: str) -> np.ndarray: + """Get a d_vector of a speaker. + + Args: + speaker_idx (str): Target speaker ID. + + Returns: + np.ndarray: d_vector. + """ + for x in self.d_vectors.values(): + if x["name"] == speaker_idx: + return x["embedding"] + return None + def get_mean_d_vector(self, speaker_idx: str, num_samples: int = None, randomize: bool = False) -> np.ndarray: """Get mean d_vector of a speaker ID. @@ -215,14 +229,31 @@ class SpeakerManager: d_vectors = np.stack(d_vectors[:num_samples]).mean(0) return d_vectors - def get_random_speaker_aux_input(self) -> Dict: - if self.d_vectors: - return {"speaker_id": None, "style_wav": None, "d_vector": self.d_vectors[random.choices(list(self.d_vectors.keys()))[0]]["embedding"]} + def get_random_speaker_id(self) -> Any: + """Get a random d_vector. + Args: + + Returns: + np.ndarray: d_vector. + """ if self.speaker_ids: - return {"speaker_id": self.speaker_ids[random.choices(list(self.speaker_ids.keys()))[0]], "style_wav": None, "d_vector": None} + return self.speaker_ids[random.choices(list(self.speaker_ids.keys()))[0]] - return {"speaker_id": None, "style_wav": None, "d_vector": None} + return None + + def get_random_d_vector(self) -> Any: + """Get a random D ID. + + Args: + + Returns: + np.ndarray: d_vector. + """ + if self.d_vectors: + return self.d_vectors[random.choices(list(self.d_vectors.keys()))[0]]["embedding"] + + return None def get_speakers(self) -> List: return self.speaker_ids diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 578c26c0..63fe92c3 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -71,6 +71,7 @@ def run_model_torch( speaker_id: int = None, style_mel: torch.Tensor = None, d_vector: torch.Tensor = None, + language_id: torch.Tensor = None, ) -> Dict: """Run a torch model for inference. It does not support batch inference. 
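# Illustrative sketch (assumed usage, not taken verbatim from the repository):
# multilingual inference through the extended synthesis() signature. The config,
# checkpoint and language-ids paths as well as the "en" language name are
# placeholders for whatever a trained multilingual VITS run produced.
import torch

from TTS.config import load_config
from TTS.tts.models import setup_model
from TTS.tts.utils.synthesis import synthesis
from TTS.utils.audio import AudioProcessor

C = load_config("config.json")
ap = AudioProcessor(**C.audio)
model = setup_model(C)
model.language_manager.set_language_ids_from_file("language_ids.json")
model.load_state_dict(torch.load("best_model.pth.tar", map_location="cpu")["model"])
model.eval()

language_id = model.language_manager.language_id_mapping["en"]
wav, alignment, _, _ = synthesis(
    model,
    "Be a voice, not an echo.",
    C,
    "cuda" in str(next(model.parameters()).device),
    ap,
    speaker_id=None,
    d_vector=None,
    style_wav=None,
    language_id=language_id,
    use_griffin_lim=True,
    do_trim_silence=False,
).values()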
@@ -96,6 +97,7 @@ def run_model_torch( "speaker_ids": speaker_id, "d_vectors": d_vector, "style_mel": style_mel, + "language_ids": language_id, }, ) return outputs @@ -160,13 +162,13 @@ def inv_spectrogram(postnet_output, ap, CONFIG): return wav -def speaker_id_to_torch(speaker_id, cuda=False): - if speaker_id is not None: - speaker_id = np.asarray(speaker_id) - speaker_id = torch.from_numpy(speaker_id) +def id_to_torch(aux_id, cuda=False): + if aux_id is not None: + aux_id = np.asarray(aux_id) + aux_id = torch.from_numpy(aux_id) if cuda: - return speaker_id.cuda() - return speaker_id + return aux_id.cuda() + return aux_id def embedding_to_torch(d_vector, cuda=False): @@ -208,6 +210,7 @@ def synthesis( use_griffin_lim=False, do_trim_silence=False, d_vector=None, + language_id=None, backend="torch", ): """Synthesize voice for the given text using Griffin-Lim vocoder or just compute output features to be passed to @@ -262,11 +265,14 @@ def synthesis( # pass tensors to backend if backend == "torch": if speaker_id is not None: - speaker_id = speaker_id_to_torch(speaker_id, cuda=use_cuda) + speaker_id = id_to_torch(speaker_id, cuda=use_cuda) if d_vector is not None: d_vector = embedding_to_torch(d_vector, cuda=use_cuda) + if language_id is not None: + language_id = id_to_torch(language_id, cuda=use_cuda) + if not isinstance(style_mel, dict): style_mel = numpy_to_torch(style_mel, torch.float, cuda=use_cuda) text_inputs = numpy_to_torch(text_inputs, torch.long, cuda=use_cuda) @@ -278,7 +284,7 @@ def synthesis( text_inputs = tf.expand_dims(text_inputs, 0) # synthesize voice if backend == "torch": - outputs = run_model_torch(model, text_inputs, speaker_id, style_mel, d_vector=d_vector) + outputs = run_model_torch(model, text_inputs, speaker_id, style_mel, d_vector=d_vector, language_id=language_id) model_outputs = outputs["model_outputs"] model_outputs = model_outputs[0].data.cpu().numpy() alignments = outputs["alignments"] From 32ece5d5adff48437e42f8c52a75b6c69c42cccf Mon Sep 17 00:00:00 2001 From: Edresson Date: Sun, 15 Aug 2021 13:44:58 -0300 Subject: [PATCH 009/220] Fix pylint issues --- TTS/tts/configs/vits_config.py | 4 ++-- TTS/tts/models/vits.py | 4 +--- TTS/tts/utils/languages.py | 6 ++---- TTS/tts/utils/text/cleaners.py | 2 +- tests/data_tests/test_loader.py | 5 +++++ 5 files changed, 11 insertions(+), 10 deletions(-) diff --git a/TTS/tts/configs/vits_config.py b/TTS/tts/configs/vits_config.py index 3e031f02..cc3e4940 100644 --- a/TTS/tts/configs/vits_config.py +++ b/TTS/tts/configs/vits_config.py @@ -82,8 +82,8 @@ class VitsConfig(BaseTTSConfig): add_blank (bool): If true, a blank token is added in between every character. Defaults to `True`. - test_sentences (List[str]): - List of sentences to be used for testing. + test_sentences (List[List]): + List of sentences with speaker and language information to be used for testing. Note: Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the inherited parameters. diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 11f1fab0..6fe60fa0 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -740,7 +740,7 @@ class Vits(BaseTTS): test_audios["{}-audio".format(idx)] = wav test_figures["{}-alignment".format(idx)] = plot_alignment(alignment.T, output_fig=False) except: # pylint: disable=bare-except - print(" !! Error creating Test Sentence -", idx) + print(" !! 
Error creating Test Sentence -", idx) return test_figures, test_audios def get_optimizer(self) -> List: @@ -837,5 +837,3 @@ class Vits(BaseTTS): if eval: self.eval() assert not self.training - - diff --git a/TTS/tts/utils/languages.py b/TTS/tts/utils/languages.py index b87b9936..94be914c 100644 --- a/TTS/tts/utils/languages.py +++ b/TTS/tts/utils/languages.py @@ -20,7 +20,6 @@ class LanguageManager: >>> manager = LanguageManager(language_id_file_path=language_id_file_path) >>> language_id_mapper = manager.language_ids """ - num_languages: int = 0 language_id_mapping: Dict = {} def __init__( self, @@ -77,7 +76,6 @@ class LanguageManager: file_path (str): Path to the target json file. """ self.language_id_mapping = self._load_json(file_path) - self.num_languages = len(self.language_id_mapping) def save_language_ids_to_file(self, file_path: str) -> None: """Save language IDs to a json file. @@ -99,7 +97,7 @@ def _set_file_path(path): return path_continue return None -def get_language_manager(c: Coqpit, data: List = None, restore_path: str = None, out_path: str = None) -> LanguageManager: +def get_language_manager(c: Coqpit, data: List = None, restore_path: str = None) -> LanguageManager: """Initiate a `LanguageManager` instance by the provided config. Args: @@ -135,4 +133,4 @@ def get_language_weighted_sampler(items: list): language_count = np.array([len(np.where(language_names == l)[0]) for l in unique_language_names]) weight_language = 1. / language_count dataset_samples_weight = torch.from_numpy(np.array([weight_language[l] for l in language_ids])).double() - return WeightedRandomSampler(dataset_samples_weight, len(dataset_samples_weight)) \ No newline at end of file + return WeightedRandomSampler(dataset_samples_weight, len(dataset_samples_weight)) diff --git a/TTS/tts/utils/text/cleaners.py b/TTS/tts/utils/text/cleaners.py index 71155ebc..826919c2 100644 --- a/TTS/tts/utils/text/cleaners.py +++ b/TTS/tts/utils/text/cleaners.py @@ -142,4 +142,4 @@ def multilingual_cleaners(text): text = replace_symbols(text, lang=None) text = remove_aux_symbols(text) text = collapse_whitespace(text) - return text \ No newline at end of file + return text diff --git a/tests/data_tests/test_loader.py b/tests/data_tests/test_loader.py index 8a20c261..19c2e8f7 100644 --- a/tests/data_tests/test_loader.py +++ b/tests/data_tests/test_loader.py @@ -38,6 +38,11 @@ class TestTTSDataset(unittest.TestCase): def _create_dataloader(self, batch_size, r, bgs): items = ljspeech(c.data_path, "metadata.csv") + + # add a default language because now the TTSDataset expect a language + language = "" + items = [[*item, language] for item in items] + dataset = TTSDataset( r, c.text_cleaner, From 240356cd53d22c6ee3cdd7493459c6bd981bee2b Mon Sep 17 00:00:00 2001 From: Edresson Date: Tue, 17 Aug 2021 07:41:21 -0300 Subject: [PATCH 010/220] Fix bugs in the non-multilingual VITS inference --- TTS/tts/models/vits.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 6fe60fa0..f6442800 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -448,7 +448,8 @@ class Vits(BaseTTS): g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] # language embedding - if self.args.use_language_embedding: + lang_emb=None + if self.args.use_language_embedding and lid is not None: lang_emb = self.emb_l(lid).unsqueeze(-1) x, m_p, logs_p, x_mask = self.text_encoder(x, x_lengths, lang_emb=lang_emb) @@ -530,6 +531,7 @@ class Vits(BaseTTS): g = self.emb_g(sid).unsqueeze(-1) # 
language embedding + lang_emb=None if self.args.use_language_embedding and lid is not None: lang_emb = self.emb_l(lid).unsqueeze(-1) From 56b548835dc0fb366cd9b3d903cae2c54705fbae Mon Sep 17 00:00:00 2001 From: Edresson Date: Thu, 19 Aug 2021 09:59:41 -0300 Subject: [PATCH 011/220] Fix bug in VITS multilingual inference --- TTS/tts/models/vits.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index f6442800..01eb1874 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -382,8 +382,13 @@ class Vits(BaseTTS): # init language embedding layer if config.use_language_embedding: + if config.num_languages > 0 and self.language_manager.num_languages == 0: + self.num_languages = config.num_languages + else: + self.num_languages = self.language_manager.num_languages + self.embedded_language_dim = config.embedded_language_dim - self.emb_l = nn.Embedding(self.language_manager.num_languages, self.embedded_language_dim) + self.emb_l = nn.Embedding(self.num_languages, self.embedded_language_dim) torch.nn.init.xavier_uniform_(self.emb_l.weight) else: self.embedded_language_dim = 0 From d653227e59317c4675d74609b5ac1503551323d5 Mon Sep 17 00:00:00 2001 From: Edresson Date: Thu, 19 Aug 2021 14:05:42 -0300 Subject: [PATCH 012/220] Add voice conversion support for the model VITS trained with external speaker embedding --- TTS/tts/models/vits.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 01eb1874..e7305fb8 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -564,12 +564,21 @@ class Vits(BaseTTS): outputs = {"model_outputs": o, "alignments": attn.squeeze(1), "z": z, "z_p": z_p, "m_p": m_p, "logs_p": logs_p} return outputs - def voice_conversion(self, y, y_lengths, sid_src, sid_tgt): + def voice_conversion(self, y, y_lengths, speaker_cond_src, speaker_cond_tgt): """TODO: create an end-point for voice conversion""" assert self.num_speakers > 0, "num_speakers have to be larger than 0." - g_src = self.emb_g(sid_src).unsqueeze(-1) - g_tgt = self.emb_g(sid_tgt).unsqueeze(-1) - z, _, _, y_mask = self.enc_q(y, y_lengths, g=g_src) + + # speaker embedding + if self.args.use_speaker_embedding and not self.use_d_vector: + g_src = self.emb_g(speaker_cond_src).unsqueeze(-1) + g_tgt = self.emb_g(speaker_cond_tgt).unsqueeze(-1) + elif self.args.use_speaker_embedding and self.use_d_vector: + g_src = F.normalize(speaker_cond_src).unsqueeze(-1) + g_tgt = F.normalize(speaker_cond_tgt).unsqueeze(-1) + else: + raise RuntimeError(" [!] 
Voice conversion is only supported on multi-speaker models.") + + z, _, _, y_mask = self.posterior_encoder(y, y_lengths, g=g_src) z_p = self.flow(z, y_mask, g=g_src) z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True) o_hat = self.waveform_decoder(z_hat * y_mask, g=g_tgt) From 3df5d9a619d3c860452944acbd3edb524923da98 Mon Sep 17 00:00:00 2001 From: Edresson Date: Mon, 23 Aug 2021 16:12:31 -0300 Subject: [PATCH 013/220] Fix the bug in M-AILABS formatter --- TTS/tts/datasets/dataset.py | 1 + TTS/tts/datasets/formatters.py | 16 +++++++++++----- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index 7ba97eba..78c6c33d 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -125,6 +125,7 @@ class TTSDataset(Dataset): self.d_vector_mapping = d_vector_mapping self.language_id_mapping = language_id_mapping self.use_noise_augment = use_noise_augment + self.verbose = verbose self.input_seq_computed = False self.rescue_item_idx = 1 diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index 51ad892a..651b3197 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -68,14 +68,19 @@ def mailabs(root_path, meta_files=None): recursively. Defaults to None """ speaker_regex = re.compile("by_book/(male|female)/(?P[^/]+)/") - if meta_files is None: + if not meta_files: csv_files = glob(root_path + "/**/metadata.csv", recursive=True) else: csv_files = meta_files + # meta_files = [f.strip() for f in meta_files.split(",")] items = [] for csv_file in csv_files: - txt_file = os.path.join(root_path, csv_file) + if os.path.isfile(csv_file): + txt_file = csv_file + else: + txt_file = os.path.join(root_path, csv_file) + folder = os.path.dirname(txt_file) # determine speaker based on folder structure... speaker_name_match = speaker_regex.search(txt_file) @@ -90,7 +95,7 @@ def mailabs(root_path, meta_files=None): with open(txt_file, "r", encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") - if meta_files is None: + if not meta_files: wav_file = os.path.join(folder, "wavs", cols[0] + ".wav") else: wav_file = os.path.join(root_path, folder.replace("metadata.csv", ""), "wavs", cols[0] + ".wav") @@ -98,7 +103,8 @@ def mailabs(root_path, meta_files=None): text = cols[1].strip() items.append([text, wav_file, speaker_name]) else: - raise RuntimeError("> File %s does not exist!" % (wav_file)) + # M-AI-Labs have some missing samples, so just print the warning + print("> File %s does not exist!" 
% (wav_file)) return items @@ -214,7 +220,7 @@ def common_voice(root_path, meta_file, ununsed_speakers=None): def libri_tts(root_path, meta_files=None, ununsed_speakers=None): """https://ai.google/tools/datasets/libri-tts/""" items = [] - if meta_files is None: + if not meta_files: meta_files = glob(f"{root_path}/**/*trans.tsv", recursive=True) else: if isinstance(meta_files, str): From 9071bf326f1bf670e97457d2ef469d90e06021d4 Mon Sep 17 00:00:00 2001 From: Edresson Date: Wed, 25 Aug 2021 16:52:02 -0300 Subject: [PATCH 014/220] Implement vocoder Fine Tuning like SC-GlowTTS paper --- TTS/tts/layers/losses.py | 9 ++- TTS/tts/models/vits.py | 140 ++++++++++++++++++++++++++++++++++----- 2 files changed, 133 insertions(+), 16 deletions(-) diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index 0ea342e8..145cd1a0 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -598,6 +598,7 @@ class VitsGeneratorLoss(nn.Module): feats_disc_fake, feats_disc_real, loss_duration, + fine_tuning_mode=False, ): """ Shapes: @@ -619,9 +620,15 @@ class VitsGeneratorLoss(nn.Module): mel = self.stft(waveform) mel_hat = self.stft(waveform_hat) # compute losses + + # ignore tts model loss if fine tunning mode is on + if fine_tuning_mode: + loss_kl = 0.0 + else: + loss_kl = self.kl_loss(z_p, logs_q, m_p, logs_p, z_mask.unsqueeze(1)) * self.kl_loss_alpha + loss_feat = self.feature_loss(feats_disc_fake, feats_disc_real) * self.feat_loss_alpha loss_gen = self.generator_loss(scores_disc_fake)[0] * self.gen_loss_alpha - loss_kl = self.kl_loss(z_p, logs_q, m_p, logs_p, z_mask.unsqueeze(1)) * self.kl_loss_alpha loss_mel = torch.nn.functional.l1_loss(mel, mel_hat) * self.mel_loss_alpha loss_duration = torch.sum(loss_duration.float()) * self.dur_loss_alpha loss = loss_kl + loss_feat + loss_mel + loss_gen + loss_duration diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index e7305fb8..ce75d6dd 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -193,6 +193,7 @@ class VitsArgs(Coqpit): use_language_embedding: bool = False embedded_language_dim: int = 4 num_languages: int = 0 + fine_tuning_mode: bool = False class Vits(BaseTTS): @@ -330,6 +331,7 @@ class Vits(BaseTTS): if args.init_discriminator: self.disc = VitsDiscriminator(use_spectral_norm=args.use_spectral_norm_disriminator) + print("FINE TUNING:", self.args.fine_tuning_mode) def init_multispeaker(self, config: Coqpit): """Initialize multi-speaker modules of a model. A model can be trained either with a speaker embedding layer @@ -521,6 +523,90 @@ class Vits(BaseTTS): ) return outputs + def forward_fine_tuning( + self, + x: torch.tensor, + x_lengths: torch.tensor, + y: torch.tensor, + y_lengths: torch.tensor, + aux_input={"d_vectors": None, "speaker_ids": None, "language_ids": None}, + ) -> Dict: + """Forward pass of the model. + + Args: + x (torch.tensor): Batch of input character sequence IDs. + x_lengths (torch.tensor): Batch of input character sequence lengths. + y (torch.tensor): Batch of input spectrograms. + y_lengths (torch.tensor): Batch of input spectrogram lengths. + aux_input (dict, optional): Auxiliary inputs for multi-speaker training. Defaults to {"d_vectors": None, "speaker_ids": None}. + + Returns: + Dict: model outputs keyed by the output name. 
+ + Shapes: + - x: :math:`[B, T_seq]` + - x_lengths: :math:`[B]` + - y: :math:`[B, C, T_spec]` + - y_lengths: :math:`[B]` + - d_vectors: :math:`[B, C, 1]` + - speaker_ids: :math:`[B]` + """ + with torch.no_grad(): + outputs = {} + sid, g, lid = self._set_cond_input(aux_input) + # speaker embedding + if self.args.use_speaker_embedding and sid is not None and not self.use_d_vector: + g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] + + # language embedding + lang_emb=None + if self.args.use_language_embedding and lid is not None: + lang_emb = self.emb_l(lid).unsqueeze(-1) + + x, m_p, logs_p, x_mask = self.text_encoder(x, x_lengths, lang_emb=lang_emb) + + # posterior encoder + z, m_q, logs_q, y_mask = self.posterior_encoder(y, y_lengths, g=g) + + # flow layers + z_p = self.flow(z, y_mask, g=g) + + # find the alignment path + attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2) + with torch.no_grad(): + o_scale = torch.exp(-2 * logs_p) + # logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - logs_p, [1]).unsqueeze(-1) # [b, t, 1] + logp2 = torch.einsum("klm, kln -> kmn", [o_scale, -0.5 * (z_p ** 2)]) + logp3 = torch.einsum("klm, kln -> kmn", [m_p * o_scale, z_p]) + # logp4 = torch.sum(-0.5 * (m_p ** 2) * o_scale, [1]).unsqueeze(-1) # [b, t, 1] + logp = logp2 + logp3 + attn = maximum_path(logp, attn_mask.squeeze(1)).unsqueeze(1).detach() + + # expand prior + m_p = torch.einsum("klmn, kjm -> kjn", [attn, m_p]) + logs_p = torch.einsum("klmn, kjm -> kjn", [attn, logs_p]) + + # get the z after inverse decoder + # ToDo: test if using m_p the result is better (In the SC-GlowTTS paper we used mp instead z_p) + z_f_pred = self.flow(z_p, y_mask, g=g, reverse=True) + z_slice, slice_ids = rand_segment(z_f_pred, y_lengths, self.spec_segment_size) + + o = self.waveform_decoder(z_slice, g=g) + outputs.update( + { + "model_outputs": o, + "alignments": attn.squeeze(1), + "slice_ids": slice_ids, + "z": z, + "z_p": z_p, + "m_p": m_p, + "logs_p": logs_p, + "m_q": m_q, + "logs_q": logs_q, + } + ) + return outputs + def inference(self, x, aux_input={"d_vectors": None, "speaker_ids": None, "language_ids": None}): """ Shapes: @@ -599,6 +685,15 @@ class Vits(BaseTTS): if optimizer_idx not in [0, 1]: raise ValueError(" [!] 
Unexpected `optimizer_idx`.") + # generator pass + if self.args.fine_tuning_mode: + # ToDo: find better place fot it + # force eval mode + self.eval() + # restore train mode for the vocoder part + self.waveform_decoder.train() + self.disc.train() + if optimizer_idx == 0: text_input = batch["text_input"] text_lengths = batch["text_lengths"] @@ -610,13 +705,24 @@ class Vits(BaseTTS): waveform = batch["waveform"] # generator pass - outputs = self.forward( - text_input, - text_lengths, - linear_input.transpose(1, 2), - mel_lengths, - aux_input={"d_vectors": d_vectors, "speaker_ids": speaker_ids, "language_ids": language_ids}, - ) + if self.args.fine_tuning_mode: + + # model forward + outputs = self.forward_fine_tuning( + text_input, + text_lengths, + linear_input.transpose(1, 2), + mel_lengths, + aux_input={"d_vectors": d_vectors, "speaker_ids": speaker_ids, "language_ids": language_ids}, + ) + else: + outputs = self.forward( + text_input, + text_lengths, + linear_input.transpose(1, 2), + mel_lengths, + aux_input={"d_vectors": d_vectors, "speaker_ids": speaker_ids, "language_ids": language_ids}, + ) # cache tensors for the discriminator self.y_disc_cache = None @@ -649,15 +755,17 @@ class Vits(BaseTTS): feats_disc_fake=outputs["feats_disc_fake"], feats_disc_real=outputs["feats_disc_real"], loss_duration=outputs["loss_duration"], + fine_tuning_mode=self.args.fine_tuning_mode, ) - - # handle the duration loss - if self.args.use_sdp: - loss_dict["nll_duration"] = outputs["nll_duration"] - loss_dict["loss"] += outputs["nll_duration"] - else: - loss_dict["loss_duration"] = outputs["loss_duration"] - loss_dict["loss"] += outputs["loss_duration"] + # ignore duration loss if fine tuning mode is on + if not self.args.fine_tuning_mode: + # handle the duration loss + if self.args.use_sdp: + loss_dict["nll_duration"] = outputs["nll_duration"] + loss_dict["loss"] += outputs["nll_duration"] + else: + loss_dict["loss_duration"] = outputs["loss_duration"] + loss_dict["loss"] += outputs["loss_duration"] elif optimizer_idx == 1: # discriminator pass @@ -853,3 +961,5 @@ class Vits(BaseTTS): if eval: self.eval() assert not self.training + + From cfa9910f9da97c1d5cde9769627e766c8976885e Mon Sep 17 00:00:00 2001 From: Edresson Date: Thu, 26 Aug 2021 07:55:41 -0300 Subject: [PATCH 015/220] Fix pylint issues --- TTS/tts/models/vits.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index ce75d6dd..72c4c892 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -706,7 +706,6 @@ class Vits(BaseTTS): # generator pass if self.args.fine_tuning_mode: - # model forward outputs = self.forward_fine_tuning( text_input, @@ -961,5 +960,3 @@ class Vits(BaseTTS): if eval: self.eval() assert not self.training - - From 82611cfcd3a39a878bf5aa644e22e611184eec3b Mon Sep 17 00:00:00 2001 From: Edresson Date: Thu, 26 Aug 2021 11:36:11 -0300 Subject: [PATCH 016/220] Fix unit tests --- TTS/tts/models/vits.py | 1 - 1 file changed, 1 deletion(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 72c4c892..bc4bf235 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -331,7 +331,6 @@ class Vits(BaseTTS): if args.init_discriminator: self.disc = VitsDiscriminator(use_spectral_norm=args.use_spectral_norm_disriminator) - print("FINE TUNING:", self.args.fine_tuning_mode) def init_multispeaker(self, config: Coqpit): """Initialize multi-speaker modules of a model. 
A model can be trained either with a speaker embedding layer From 859cf1bfac15892435b08d8d604060c556b027db Mon Sep 17 00:00:00 2001 From: Edresson Date: Thu, 26 Aug 2021 12:18:05 -0300 Subject: [PATCH 017/220] Add VITS multilingual unit test --- .../tts_tests/test_vits_multilingual_train.py | 66 +++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 tests/tts_tests/test_vits_multilingual_train.py diff --git a/tests/tts_tests/test_vits_multilingual_train.py b/tests/tts_tests/test_vits_multilingual_train.py new file mode 100644 index 00000000..5fc4787d --- /dev/null +++ b/tests/tts_tests/test_vits_multilingual_train.py @@ -0,0 +1,66 @@ +import glob +import os +import shutil + +from tests import get_device_id, get_tests_output_path, run_cli +from TTS.tts.configs import BaseDatasetConfig, VitsConfig + +config_path = os.path.join(get_tests_output_path(), "test_model_config.json") +output_path = os.path.join(get_tests_output_path(), "train_outputs") + + +dataset_config1 = BaseDatasetConfig( + name="ljspeech", meta_file_train="metadata.csv", meta_file_val="metadata.csv", path="tests/data/ljspeech", language="en" +) + +dataset_config2 = BaseDatasetConfig( + name="ljspeech", meta_file_train="metadata.csv", meta_file_val="metadata.csv", path="tests/data/ljspeech", language="en2" +) + +config = VitsConfig( + batch_size=2, + eval_batch_size=2, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + use_espeak_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + ["Be a voice, not an echo.", "ljspeech", None, "en"], + ["Be a voice, not an echo.", "ljspeech", None, "en2"], + ], + datasets=[dataset_config1, dataset_config2], +) +# set audio config +config.audio.do_trim_silence = True +config.audio.trim_db = 60 + +# active multilingual mode +config.model_args.use_language_embedding = True +# active language sampler +config.use_language_weighted_sampler = True + +config.save_json(config_path) + +# train the model for one epoch +command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.test_delay_epochs 0" +) +run_cli(command_train) + +# Find latest folder +continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + +# restore the model and continue training for one more epoch +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " +run_cli(command_train) +shutil.rmtree(continue_path) From 08da902af33e6f7f26e95679857f3fd00c04c617 Mon Sep 17 00:00:00 2001 From: Edresson Date: Thu, 26 Aug 2021 12:19:01 -0300 Subject: [PATCH 018/220] Add VITS d-vector unit test --- tests/tts_tests/test_vits_d-vectors_train.py | 63 ++++++++++++++++++++ tests/tts_tests/test_vits_train.py | 2 +- 2 files changed, 64 insertions(+), 1 deletion(-) create mode 100644 tests/tts_tests/test_vits_d-vectors_train.py diff --git a/tests/tts_tests/test_vits_d-vectors_train.py b/tests/tts_tests/test_vits_d-vectors_train.py new file mode 100644 index 00000000..af0e0eba --- /dev/null +++ b/tests/tts_tests/test_vits_d-vectors_train.py @@ -0,0 +1,63 @@ +import glob +import os +import shutil + +from tests import get_device_id, get_tests_output_path, run_cli +from TTS.tts.configs import VitsConfig + +config_path = 
os.path.join(get_tests_output_path(), "test_model_config.json") +output_path = os.path.join(get_tests_output_path(), "train_outputs") + + +config = VitsConfig( + batch_size=2, + eval_batch_size=2, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + use_espeak_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + ["Be a voice, not an echo.", "ljspeech-0"], + ], +) +# set audio config +config.audio.do_trim_silence = True +config.audio.trim_db = 60 + +# active multispeaker d-vec mode +config.model_args.use_speaker_embedding = True +config.model_args.use_d_vector_file = True +config.model_args.d_vector_file = "tests/data/ljspeech/speakers.json" +config.model_args.d_vector_dim = 256 + + +config.save_json(config_path) + +# train the model for one epoch +command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.name ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" +) +run_cli(command_train) + +# Find latest folder +continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + +# restore the model and continue training for one more epoch +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " +run_cli(command_train) +shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_vits_train.py b/tests/tts_tests/test_vits_train.py index 6398955e..607f7b29 100644 --- a/tests/tts_tests/test_vits_train.py +++ b/tests/tts_tests/test_vits_train.py @@ -25,7 +25,7 @@ config = VitsConfig( print_step=1, print_eval=True, test_sentences=[ - "Be a voice, not an echo.", + ["Be a voice, not an echo."], ], ) config.audio.do_trim_silence = True From d7042ecfd8bb3bfbf2ba3225f4ee3898874157ec Mon Sep 17 00:00:00 2001 From: Edresson Date: Thu, 26 Aug 2021 14:47:53 -0300 Subject: [PATCH 019/220] Fix d-vector multispeaker training bug --- TTS/tts/models/base_tts.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index bfa6df14..c03a7df5 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -134,13 +134,13 @@ class BaseTTS(BaseModel): # get speaker id/d_vector speaker_id, d_vector, language_id = None, None, None - if hasattr(self, "speaker_manager") and config.use_speaker_embedding: + if hasattr(self, "speaker_manager"): if config.use_d_vector_file: if speaker_name is None: d_vector = self.speaker_manager.get_random_d_vector() else: d_vector = self.speaker_manager.get_d_vector_by_speaker(speaker_name) - else: + elif config.use_speaker_embedding: if speaker_name is None: speaker_id = self.speaker_manager.get_random_speaker_id() else: @@ -284,7 +284,7 @@ class BaseTTS(BaseModel): use_noise_augment=False if is_eval else config.use_noise_augment, verbose=verbose, speaker_id_mapping=speaker_id_mapping, - d_vector_mapping=d_vector_mapping if config.use_d_vector_file else None, + d_vector_mapping=d_vector_mapping, language_id_mapping=language_id_mapping, ) From 
f4abb19515a1ec14e8f7c7be11066b6511ccd783 Mon Sep 17 00:00:00 2001 From: Edresson Date: Thu, 26 Aug 2021 16:01:07 -0300 Subject: [PATCH 020/220] Fix bug after merge --- TTS/tts/models/vits.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index bc4bf235..600a9551 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -5,6 +5,7 @@ from itertools import chain from typing import Dict, List, Tuple import torch +import math from coqpit import Coqpit from torch import nn from torch.cuda.amp.autocast_mode import autocast @@ -574,11 +575,11 @@ class Vits(BaseTTS): attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2) with torch.no_grad(): o_scale = torch.exp(-2 * logs_p) - # logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - logs_p, [1]).unsqueeze(-1) # [b, t, 1] + logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - logs_p, [1]).unsqueeze(-1) # [b, t, 1] logp2 = torch.einsum("klm, kln -> kmn", [o_scale, -0.5 * (z_p ** 2)]) logp3 = torch.einsum("klm, kln -> kmn", [m_p * o_scale, z_p]) - # logp4 = torch.sum(-0.5 * (m_p ** 2) * o_scale, [1]).unsqueeze(-1) # [b, t, 1] - logp = logp2 + logp3 + logp4 = torch.sum(-0.5 * (m_p ** 2) * o_scale, [1]).unsqueeze(-1) # [b, t, 1] + logp = logp2 + logp3 + logp1 + logp4 attn = maximum_path(logp, attn_mask.squeeze(1)).unsqueeze(1).detach() # expand prior From 256197b6aaf7f3754363aa252162f6f663c5cfaf Mon Sep 17 00:00:00 2001 From: Edresson Date: Sat, 28 Aug 2021 09:57:52 -0300 Subject: [PATCH 021/220] Fix the optimizer parameters bug in multilingual and multispeaker training --- TTS/tts/models/vits.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 600a9551..d355d5c1 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -882,8 +882,12 @@ class Vits(BaseTTS): self.waveform_decoder.parameters(), ) # add the speaker embedding layer - if hasattr(self, "emb_g"): + if hasattr(self, "emb_g") and self.args.use_speaker_embedding and not self.args.use_d_vector_file: gen_parameters = chain(gen_parameters, self.emb_g.parameters()) + # add the language embedding layer + if hasattr(self, "emb_l") and self.args.use_language_embedding: + gen_parameters = chain(gen_parameters, self.emb_l.parameters()) + optimizer0 = get_optimizer( self.config.optimizer, self.config.optimizer_params, self.config.lr_gen, parameters=gen_parameters ) From 2bba769e67a318c66fd13f5148284bfd9bc2a3d2 Mon Sep 17 00:00:00 2001 From: Edresson Date: Sat, 28 Aug 2021 09:59:09 -0300 Subject: [PATCH 022/220] Active the multispeaker mode in multilingual training --- tests/tts_tests/test_vits_multilingual_train.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/tts_tests/test_vits_multilingual_train.py b/tests/tts_tests/test_vits_multilingual_train.py index 5fc4787d..10e66b81 100644 --- a/tests/tts_tests/test_vits_multilingual_train.py +++ b/tests/tts_tests/test_vits_multilingual_train.py @@ -44,6 +44,9 @@ config.audio.trim_db = 60 # active multilingual mode config.model_args.use_language_embedding = True +# active multispeaker mode +config.model_args.use_speaker_embedding = True +config.model_args.use_d_vector_file = False # active language sampler config.use_language_weighted_sampler = True From ecf327a118bc907d2f1a36743a88d4f157af4109 Mon Sep 17 00:00:00 2001 From: Edresson Date: Sat, 28 Aug 2021 09:59:48 -0300 Subject: [PATCH 023/220] Add VITS multispeaker train unit test --- 
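Notes: this test covers the learnable speaker-embedding path, the counterpart of
the d-vector recipe added in test_vits_d-vectors_train.py. Both modes are
selected purely from config; a rough sketch of the switches involved (values
taken from the two test recipes, everything else left at its defaults):

    # learnable nn.Embedding speaker conditioning (this test)
    config.model_args.use_speaker_embedding = True
    config.model_args.use_d_vector_file = False

    # externally computed d-vectors loaded from a JSON file
    config.model_args.use_speaker_embedding = True
    config.model_args.use_d_vector_file = True
    config.model_args.d_vector_file = "tests/data/ljspeech/speakers.json"
    config.model_args.d_vector_dim = 256
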
.../tts_tests/test_vits_speaker_emb_train.py | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 tests/tts_tests/test_vits_speaker_emb_train.py diff --git a/tests/tts_tests/test_vits_speaker_emb_train.py b/tests/tts_tests/test_vits_speaker_emb_train.py new file mode 100644 index 00000000..7028a983 --- /dev/null +++ b/tests/tts_tests/test_vits_speaker_emb_train.py @@ -0,0 +1,63 @@ +import glob +import os +import shutil + +from tests import get_device_id, get_tests_output_path, run_cli +from TTS.tts.configs import VitsConfig + +config_path = os.path.join(get_tests_output_path(), "test_model_config.json") +output_path = os.path.join(get_tests_output_path(), "train_outputs") + + +config = VitsConfig( + batch_size=2, + eval_batch_size=2, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + use_espeak_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + ["Be a voice, not an echo.", "ljspeech"], + ], +) +# set audio config +config.audio.do_trim_silence = True +config.audio.trim_db = 60 + +# active multispeaker d-vec mode +config.model_args.use_speaker_embedding = True +config.model_args.use_d_vector_file = False +config.model_args.d_vector_file = None +config.model_args.d_vector_dim = 256 + + +config.save_json(config_path) + +# train the model for one epoch +command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.name ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" +) +run_cli(command_train) + +# Find latest folder +continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + +# restore the model and continue training for one more epoch +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " +run_cli(command_train) +shutil.rmtree(continue_path) From fee01daa094a1c6978c9b6edcf4723472480fa8f Mon Sep 17 00:00:00 2001 From: Edresson Date: Sat, 28 Aug 2021 10:11:33 -0300 Subject: [PATCH 024/220] Add the ValueError in the restore checkpoint exception to avoid problems with the optimizer restauration when new keys are addition --- TTS/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/trainer.py b/TTS/trainer.py index 2175875c..e8911ba3 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -453,7 +453,7 @@ class Trainer: if "scaler" in checkpoint and self.use_amp_scaler and checkpoint["scaler"]: print(" > Restoring Scaler...") scaler = _restore_list_objs(checkpoint["scaler"], scaler) - except (KeyError, RuntimeError): + except (KeyError, RuntimeError, ValueError): print(" > Partial model initialization...") model_dict = model.state_dict() model_dict = set_init_dict(model_dict, checkpoint["model"], config) From a3901032f42b29f983b0bfb67d3f688628e7f668 Mon Sep 17 00:00:00 2001 From: Edresson Date: Wed, 1 Sep 2021 09:23:45 -0300 Subject: [PATCH 025/220] Add H/ASP original checkpoint support --- TTS/speaker_encoder/models/resnet.py | 39 ++++++++++++++++++++-- 
TTS/speaker_encoder/utils/generic_utils.py | 6 +++- TTS/tts/utils/speakers.py | 14 +++++--- 3 files changed, 51 insertions(+), 8 deletions(-) diff --git a/TTS/speaker_encoder/models/resnet.py b/TTS/speaker_encoder/models/resnet.py index fcc850d7..beeb5ae1 100644 --- a/TTS/speaker_encoder/models/resnet.py +++ b/TTS/speaker_encoder/models/resnet.py @@ -1,9 +1,23 @@ import numpy as np import torch -from torch import nn +import torchaudio +import torch.nn as nn from TTS.utils.io import load_fsspec +class PreEmphasis(torch.nn.Module): + def __init__(self, coefficient=0.97): + super().__init__() + self.coefficient = coefficient + self.register_buffer( + 'filter', torch.FloatTensor([-self.coefficient, 1.]).unsqueeze(0).unsqueeze(0) + ) + + def forward(self, x): + assert len(x.size()) == 2 + + x = torch.nn.functional.pad(x.unsqueeze(1), (1, 0), 'reflect') + return torch.nn.functional.conv1d(x, self.filter).squeeze(1) class SELayer(nn.Module): def __init__(self, channel, reduction=8): @@ -70,12 +84,17 @@ class ResNetSpeakerEncoder(nn.Module): num_filters=[32, 64, 128, 256], encoder_type="ASP", log_input=False, + use_torch_spec=False, + audio_config=None, ): super(ResNetSpeakerEncoder, self).__init__() self.encoder_type = encoder_type self.input_dim = input_dim self.log_input = log_input + self.use_torch_spec = use_torch_spec + self.audio_config = audio_config + self.conv1 = nn.Conv2d(1, num_filters[0], kernel_size=3, stride=1, padding=1) self.relu = nn.ReLU(inplace=True) self.bn1 = nn.BatchNorm2d(num_filters[0]) @@ -88,6 +107,14 @@ class ResNetSpeakerEncoder(nn.Module): self.instancenorm = nn.InstanceNorm1d(input_dim) + if self.use_torch_spec: + self.torch_spec = torch.nn.Sequential( + PreEmphasis(audio_config["preemphasis"]), + torchaudio.transforms.MelSpectrogram(sample_rate=audio_config["sample_rate"], n_fft=audio_config["fft_size"], win_length=audio_config["win_length"], hop_length=audio_config["hop_length"], window_fn=torch.hamming_window, n_mels=audio_config["num_mels"]) + ) + else: + self.torch_spec = None + outmap_size = int(self.input_dim / 8) self.attention = nn.Sequential( @@ -140,9 +167,13 @@ class ResNetSpeakerEncoder(nn.Module): return out def forward(self, x, l2_norm=False): - x = x.transpose(1, 2) with torch.no_grad(): with torch.cuda.amp.autocast(enabled=False): + if self.use_torch_spec: + x = self.torch_spec(x) + else: + x = x.transpose(1, 2) + if self.log_input: x = (x + 1e-6).log() x = self.instancenorm(x).unsqueeze(1) @@ -180,6 +211,10 @@ class ResNetSpeakerEncoder(nn.Module): Generate embeddings for a batch of utterances x: 1xTxD """ + # map to the waveform size + if self.use_torch_spec: + num_frames = num_frames * self.audio_config['hop_length'] + max_len = x.shape[1] if max_len < num_frames: diff --git a/TTS/speaker_encoder/utils/generic_utils.py b/TTS/speaker_encoder/utils/generic_utils.py index 1981fbe9..3714e3c4 100644 --- a/TTS/speaker_encoder/utils/generic_utils.py +++ b/TTS/speaker_encoder/utils/generic_utils.py @@ -179,7 +179,11 @@ def setup_model(c): c.model_params["num_lstm_layers"], ) elif c.model_params["model_name"].lower() == "resnet": - model = ResNetSpeakerEncoder(input_dim=c.model_params["input_dim"], proj_dim=c.model_params["proj_dim"]) + model = ResNetSpeakerEncoder(input_dim=c.model_params["input_dim"], proj_dim=c.model_params["proj_dim"], + log_input=c.model_params.get("log_input", False), + use_torch_spec=c.model_params.get("use_torch_spec", False), + audio_config=c.audio + ) return model diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py 
index 1497ca74..282875af 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -288,12 +288,16 @@ class SpeakerManager: def _compute(wav_file: str): waveform = self.speaker_encoder_ap.load_wav(wav_file, sr=self.speaker_encoder_ap.sample_rate) - spec = self.speaker_encoder_ap.melspectrogram(waveform) - spec = torch.from_numpy(spec.T) + if not self.speaker_encoder_config.model_params.get("use_torch_spec", False): + m_input = self.speaker_encoder_ap.melspectrogram(waveform) + m_input = torch.from_numpy(m_input.T) + else: + m_input = torch.from_numpy(waveform) + if self.use_cuda: - spec = spec.cuda() - spec = spec.unsqueeze(0) - d_vector = self.speaker_encoder.compute_embedding(spec) + m_input = m_input.cuda() + m_input = m_input.unsqueeze(0) + d_vector = self.speaker_encoder.compute_embedding(m_input) return d_vector if isinstance(wav_file, list): From 3cd889a9d436c2640aeb4e7f1fc628501e7effec Mon Sep 17 00:00:00 2001 From: Edresson Date: Fri, 3 Sep 2021 07:37:43 -0300 Subject: [PATCH 026/220] Add support to use the speaker encoder as loss function in VITS model --- TTS/tts/configs/vits_config.py | 1 + TTS/tts/layers/losses.py | 10 ++++++ TTS/tts/models/vits.py | 57 +++++++++++++++++++++++++++------- 3 files changed, 56 insertions(+), 12 deletions(-) diff --git a/TTS/tts/configs/vits_config.py b/TTS/tts/configs/vits_config.py index cc3e4940..ece414a6 100644 --- a/TTS/tts/configs/vits_config.py +++ b/TTS/tts/configs/vits_config.py @@ -117,6 +117,7 @@ class VitsConfig(BaseTTSConfig): feat_loss_alpha: float = 1.0 mel_loss_alpha: float = 45.0 dur_loss_alpha: float = 1.0 + speaker_encoder_loss_alpha: float = 1.0 # data loader params return_wav: bool = True diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index 145cd1a0..fdee9c10 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -532,6 +532,7 @@ class VitsGeneratorLoss(nn.Module): self.feat_loss_alpha = c.feat_loss_alpha self.dur_loss_alpha = c.dur_loss_alpha self.mel_loss_alpha = c.mel_loss_alpha + self.spk_encoder_loss_alpha = c.speaker_encoder_loss_alpha self.stft = TorchSTFT( c.audio.fft_size, c.audio.hop_length, @@ -599,6 +600,9 @@ class VitsGeneratorLoss(nn.Module): feats_disc_real, loss_duration, fine_tuning_mode=False, + use_speaker_encoder_as_loss=False, + gt_spk_emb=None, + syn_spk_emb=None ): """ Shapes: @@ -632,6 +636,12 @@ class VitsGeneratorLoss(nn.Module): loss_mel = torch.nn.functional.l1_loss(mel, mel_hat) * self.mel_loss_alpha loss_duration = torch.sum(loss_duration.float()) * self.dur_loss_alpha loss = loss_kl + loss_feat + loss_mel + loss_gen + loss_duration + + if use_speaker_encoder_as_loss: + loss_se = - torch.nn.functional.cosine_similarity(gt_spk_emb, syn_spk_emb).mean() * self.spk_encoder_loss_alpha + loss += loss_se + return_dict["loss_spk_encoder"] = loss_se + # pass losses to the dict return_dict["loss_gen"] = loss_gen return_dict["loss_kl"] = loss_kl diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index d355d5c1..71cc4634 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -195,6 +195,10 @@ class VitsArgs(Coqpit): embedded_language_dim: int = 4 num_languages: int = 0 fine_tuning_mode: bool = False + use_speaker_encoder_as_loss: bool = False + speaker_encoder_config_path: str = "" + speaker_encoder_model_path: str = "" + class Vits(BaseTTS): @@ -370,6 +374,18 @@ class Vits(BaseTTS): self.speaker_manager = SpeakerManager(d_vectors_file_path=config.d_vector_file) self.embedded_speaker_dim = config.d_vector_dim + if 
config.use_speaker_encoder_as_loss: + if not config.speaker_encoder_model_path or not config.speaker_encoder_config_path: + raise RuntimeError(" [!] To use the speaker encoder loss you need to specify speaker_encoder_model_path and speaker_encoder_config_path !!") + self.speaker_manager.init_speaker_encoder(config.speaker_encoder_model_path, config.speaker_encoder_config_path) + self.speaker_encoder = self.speaker_manager.speaker_encoder.train() + for param in self.speaker_encoder.parameters(): + param.requires_grad = False + + print(" > External Speaker Encoder Loaded !!") + else: + self.speaker_encoder = None + def init_multilingual(self, config: Coqpit, data: List = None): """Initialize multilingual modules of a model. @@ -427,6 +443,7 @@ class Vits(BaseTTS): y: torch.tensor, y_lengths: torch.tensor, aux_input={"d_vectors": None, "speaker_ids": None, "language_ids": None}, + waveform=None, ) -> Dict: """Forward pass of the model. @@ -461,7 +478,6 @@ class Vits(BaseTTS): x, m_p, logs_p, x_mask = self.text_encoder(x, x_lengths, lang_emb=lang_emb) - # posterior encoder z, m_q, logs_q, y_mask = self.posterior_encoder(y, y_lengths, g=g) @@ -508,17 +524,36 @@ class Vits(BaseTTS): # select a random feature segment for the waveform decoder z_slice, slice_ids = rand_segments(z, y_lengths, self.spec_segment_size) o = self.waveform_decoder(z_slice, g=g) + + wav_seg = segment( + waveform.transpose(1, 2), + slice_ids * self.config.audio.hop_length, + self.args.spec_segment_size * self.config.audio.hop_length, + ) + + if self.args.use_speaker_encoder_as_loss: + # concate generated and GT waveforms + wavs_batch = torch.cat((wav_seg, o), dim=0).squeeze(1) + pred_embs = self.speaker_encoder.forward(wavs_batch, l2_norm=True) + + # split generated and GT speaker embeddings + gt_spk_emb, syn_spk_emb = torch.chunk(pred_embs, 2, dim=0) + else: + gt_spk_emb, syn_spk_emb = None, None + outputs.update( { "model_outputs": o, "alignments": attn.squeeze(1), - "slice_ids": slice_ids, "z": z, "z_p": z_p, "m_p": m_p, "logs_p": logs_p, "m_q": m_q, "logs_q": logs_q, + "waveform_seg": wav_seg, + "gt_spk_emb": gt_spk_emb, + "syn_spk_emb": syn_spk_emb } ) return outputs @@ -596,7 +631,6 @@ class Vits(BaseTTS): { "model_outputs": o, "alignments": attn.squeeze(1), - "slice_ids": slice_ids, "z": z, "z_p": z_p, "m_p": m_p, @@ -713,6 +747,7 @@ class Vits(BaseTTS): linear_input.transpose(1, 2), mel_lengths, aux_input={"d_vectors": d_vectors, "speaker_ids": speaker_ids, "language_ids": language_ids}, + waveform=waveform, ) else: outputs = self.forward( @@ -721,30 +756,25 @@ class Vits(BaseTTS): linear_input.transpose(1, 2), mel_lengths, aux_input={"d_vectors": d_vectors, "speaker_ids": speaker_ids, "language_ids": language_ids}, + waveform=waveform, ) # cache tensors for the discriminator self.y_disc_cache = None self.wav_seg_disc_cache = None self.y_disc_cache = outputs["model_outputs"] - wav_seg = segment( - waveform.transpose(1, 2), - outputs["slice_ids"] * self.config.audio.hop_length, - self.args.spec_segment_size * self.config.audio.hop_length, - ) - self.wav_seg_disc_cache = wav_seg - outputs["waveform_seg"] = wav_seg + self.wav_seg_disc_cache = outputs["waveform_seg"] # compute discriminator scores and features outputs["scores_disc_fake"], outputs["feats_disc_fake"], _, outputs["feats_disc_real"] = self.disc( - outputs["model_outputs"], wav_seg + outputs["model_outputs"], outputs["waveform_seg"] ) # compute losses with autocast(enabled=False): # use float32 for the criterion loss_dict = criterion[optimizer_idx]( 
waveform_hat=outputs["model_outputs"].float(), - waveform=wav_seg.float(), + waveform= outputs["waveform_seg"].float(), z_p=outputs["z_p"].float(), logs_q=outputs["logs_q"].float(), m_p=outputs["m_p"].float(), @@ -755,6 +785,9 @@ class Vits(BaseTTS): feats_disc_real=outputs["feats_disc_real"], loss_duration=outputs["loss_duration"], fine_tuning_mode=self.args.fine_tuning_mode, + use_speaker_encoder_as_loss=self.args.use_speaker_encoder_as_loss, + gt_spk_emb=outputs["gt_spk_emb"], + syn_spk_emb=outputs["syn_spk_emb"] ) # ignore duration loss if fine tuning mode is on if not self.args.fine_tuning_mode: From 2be38aad3f34ee758c49a94d786a2f46c320fa4c Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Wed, 21 Jul 2021 16:49:12 +0200 Subject: [PATCH 027/220] Added a notbook for d-vector multilingual VITS --- .../VITS_d-vector_multilingual_exemple.ipynb | 223 ++++++++++++++++++ 1 file changed, 223 insertions(+) create mode 100644 notebooks/VITS_d-vector_multilingual_exemple.ipynb diff --git a/notebooks/VITS_d-vector_multilingual_exemple.ipynb b/notebooks/VITS_d-vector_multilingual_exemple.ipynb new file mode 100644 index 00000000..41713295 --- /dev/null +++ b/notebooks/VITS_d-vector_multilingual_exemple.ipynb @@ -0,0 +1,223 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "source": [ + "import IPython\n", + "import torch\n", + "\n", + "from IPython.display import Audio\n", + "\n", + "from TTS.config import load_config\n", + "from TTS.tts.models import setup_model\n", + "from TTS.tts.utils.synthesis import synthesis\n", + "from TTS.utils.audio import AudioProcessor" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 2, + "source": [ + "GENERAL_PATH = '/home/julian/workspace/train/VITS-pt-en-fr-lr/vits-August-29-2021_01+20PM-c68d7fa25/'\n", + "MODEL_PATH = GENERAL_PATH + 'best_model.pth.tar'\n", + "CONFIG_PATH = GENERAL_PATH + 'config.json'\n", + "TTS_LANGUAGES = GENERAL_PATH + \"language_ids.json\"\n", + "TTS_SPEAKERS = GENERAL_PATH + \"speakers.json\"\n", + "USE_CUDA = torch.cuda.is_available()" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 3, + "source": [ + "# load the config\n", + "C = load_config(CONFIG_PATH)\n", + "\n", + "# load the audio processor\n", + "ap = AudioProcessor(**C.audio)\n", + "\n", + "speaker_embedding = None\n", + "\n", + "C.model_args['d_vector_file'] = TTS_SPEAKERS\n", + "\n", + "model = setup_model(C)\n", + "model.language_manager.set_language_ids_from_file(TTS_LANGUAGES)\n", + "cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))\n", + "model.load_state_dict(cp['model'])\n", + "\n", + "\n", + "model.eval()\n", + "\n", + "if USE_CUDA:\n", + " model = model.cuda()\n", + "\n", + "use_griffin_lim = True" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " > Setting up Audio Processor...\n", + " | > sample_rate:16000\n", + " | > resample:False\n", + " | > num_mels:80\n", + " | > min_level_db:-100\n", + " | > frame_shift_ms:None\n", + " | > frame_length_ms:None\n", + " | > ref_level_db:20\n", + " | > fft_size:1024\n", + " | > power:1.5\n", + " | > preemphasis:0.0\n", + " | > griffin_lim_iters:60\n", + " | > signal_norm:False\n", + " | > symmetric_norm:True\n", + " | > mel_fmin:0\n", + " | > mel_fmax:None\n", + " | > spec_gain:1.0\n", + " | > stft_pad_mode:reflect\n", + " | > max_norm:4.0\n", + " | > clip_norm:True\n", + " | > do_trim_silence:True\n", + " | > trim_db:45\n", + " | > do_sound_norm:False\n", + " | > 
do_amp_to_db_linear:False\n", + " | > do_amp_to_db_mel:True\n", + " | > stats_path:None\n", + " | > base:2.718281828459045\n", + " | > hop_length:256\n", + " | > win_length:1024\n", + " > Using model: vits\n", + " > Speaker manager is loaded with 421 speakers: ED, MLS_10032, MLS_10058, MLS_10065, MLS_10082, MLS_10087, MLS_10177, MLS_103, MLS_10620, MLS_10827, MLS_10957, MLS_112, MLS_11247, MLS_1127, MLS_115, MLS_11743, MLS_11772, MLS_11795, MLS_11822, MLS_11875, MLS_11954, MLS_12205, MLS_123, MLS_1243, MLS_125, MLS_12501, MLS_12512, MLS_12541, MLS_12709, MLS_12713, MLS_12823, MLS_12899, MLS_12968, MLS_12981, MLS_13142, MLS_13177, MLS_1329, MLS_13611, MLS_13634, MLS_13655, MLS_13658, MLS_14, MLS_1474, MLS_1579, MLS_1590, MLS_1591, MLS_1624, MLS_1649, MLS_1664, MLS_1745, MLS_177, MLS_1770, MLS_1798, MLS_1805, MLS_1817, MLS_1840, MLS_1844, MLS_1869, MLS_1887, MLS_1977, MLS_1989, MLS_2033, MLS_204, MLS_2155, MLS_2284, MLS_2297, MLS_2316, MLS_2506, MLS_2544, MLS_2587, MLS_2596, MLS_2607, MLS_27, MLS_2771, MLS_2776, MLS_28, MLS_2825, MLS_2904, MLS_2926, MLS_2946, MLS_30, MLS_3060, MLS_3182, MLS_3190, MLS_3204, MLS_3267, MLS_3270, MLS_3319, MLS_3344, MLS_3370, MLS_3464, MLS_3503, MLS_3595, MLS_3698, MLS_4018, MLS_4174, MLS_4193, MLS_4336, MLS_4396, MLS_4512, MLS_4609, MLS_4650, MLS_4699, MLS_4724, MLS_4744, MLS_4937, MLS_5021, MLS_5077, MLS_52, MLS_5232, MLS_5295, MLS_5525, MLS_5526, MLS_5553, MLS_5595, MLS_5612, MLS_5764, MLS_577, MLS_579, MLS_5830, MLS_5840, MLS_5968, MLS_6070, MLS_6128, MLS_62, MLS_6249, MLS_6318, MLS_6348, MLS_6362, MLS_6381, MLS_66, MLS_6856, MLS_694, MLS_7032, MLS_707, MLS_7142, MLS_7150, MLS_7193, MLS_7200, MLS_7239, MLS_7377, MLS_7423, MLS_7438, MLS_7439, MLS_753, MLS_7591, MLS_7601, MLS_7614, MLS_7679, MLS_78, MLS_7848, MLS_8102, MLS_8128, MLS_8582, MLS_8778, MLS_9121, MLS_9242, MLS_928, MLS_94, MLS_9804, MLS_9854, VCTK_p225, VCTK_p226, VCTK_p227, VCTK_p228, VCTK_p229, VCTK_p230, VCTK_p231, VCTK_p232, VCTK_p233, VCTK_p234, VCTK_p236, VCTK_p237, VCTK_p238, VCTK_p239, VCTK_p240, VCTK_p241, VCTK_p243, VCTK_p244, VCTK_p245, VCTK_p246, VCTK_p247, VCTK_p248, VCTK_p249, VCTK_p250, VCTK_p251, VCTK_p252, VCTK_p253, VCTK_p254, VCTK_p255, VCTK_p256, VCTK_p257, VCTK_p258, VCTK_p259, VCTK_p260, VCTK_p261, VCTK_p262, VCTK_p263, VCTK_p264, VCTK_p265, VCTK_p266, VCTK_p267, VCTK_p268, VCTK_p269, VCTK_p270, VCTK_p271, VCTK_p272, VCTK_p273, VCTK_p274, VCTK_p275, VCTK_p276, VCTK_p277, VCTK_p278, VCTK_p279, VCTK_p280, VCTK_p281, VCTK_p282, VCTK_p283, VCTK_p284, VCTK_p285, VCTK_p286, VCTK_p287, VCTK_p288, VCTK_p292, VCTK_p293, VCTK_p294, VCTK_p295, VCTK_p297, VCTK_p298, VCTK_p299, VCTK_p300, VCTK_p301, VCTK_p302, VCTK_p303, VCTK_p304, VCTK_p305, VCTK_p306, VCTK_p307, VCTK_p308, VCTK_p310, VCTK_p311, VCTK_p312, VCTK_p313, VCTK_p314, VCTK_p316, VCTK_p317, VCTK_p318, VCTK_p323, VCTK_p326, VCTK_p329, VCTK_p330, VCTK_p333, VCTK_p334, VCTK_p335, VCTK_p336, VCTK_p339, VCTK_p340, VCTK_p341, VCTK_p343, VCTK_p345, VCTK_p347, VCTK_p351, VCTK_p360, VCTK_p361, VCTK_p362, VCTK_p363, VCTK_p364, VCTK_p374, VCTK_p376, bernard, elodie, ezwa, gilles_g_le_blanc, nadine_eckert_boulet, openSLR_afr0184, openSLR_afr1919, openSLR_afr2418, openSLR_afr6590, openSLR_afr7130, openSLR_afr7214, openSLR_afr8148, openSLR_afr8924, openSLR_afr8963, openSLR_jvf00264, openSLR_jvf00658, openSLR_jvf01392, openSLR_jvf02059, openSLR_jvf02884, openSLR_jvf03187, openSLR_jvf04679, openSLR_jvf04715, openSLR_jvf04982, openSLR_jvf05540, openSLR_jvf06207, openSLR_jvf06510, openSLR_jvf06941, openSLR_jvf07335, openSLR_jvf07638, 
openSLR_jvf08002, openSLR_jvf08305, openSLR_jvf08736, openSLR_jvf09039, openSLR_jvm00027, openSLR_jvm01519, openSLR_jvm01932, openSLR_jvm02326, openSLR_jvm03314, openSLR_jvm03424, openSLR_jvm03727, openSLR_jvm04175, openSLR_jvm04285, openSLR_jvm04588, openSLR_jvm05219, openSLR_jvm05522, openSLR_jvm05667, openSLR_jvm05970, openSLR_jvm06080, openSLR_jvm06383, openSLR_jvm07765, openSLR_jvm07875, openSLR_jvm08178, openSLR_jvm09724, openSLR_sso0145, openSLR_sso0493, openSLR_sso0806, openSLR_sso1266, openSLR_sso1367, openSLR_sso1801, openSLR_sso2388, openSLR_sso2910, openSLR_sso4592, openSLR_sso5945, openSLR_sso6499, openSLR_sso7801, openSLR_sso7821, openSLR_sso7876, openSLR_sso7912, openSLR_sso7934, openSLR_sso8596, openSLR_sso8777, openSLR_sso9892, openSLR_suf00297, openSLR_suf00600, openSLR_suf00691, openSLR_suf00994, openSLR_suf01056, openSLR_suf01359, openSLR_suf02092, openSLR_suf02395, openSLR_suf02953, openSLR_suf03712, openSLR_suf03887, openSLR_suf04190, openSLR_suf04646, openSLR_suf04748, openSLR_suf05051, openSLR_suf05507, openSLR_suf06543, openSLR_suf07302, openSLR_suf08338, openSLR_suf08703, openSLR_sum00060, openSLR_sum00454, openSLR_sum01038, openSLR_sum01552, openSLR_sum01596, openSLR_sum01855, openSLR_sum01899, openSLR_sum02716, openSLR_sum03391, openSLR_sum03650, openSLR_sum03694, openSLR_sum04208, openSLR_sum04511, openSLR_sum05186, openSLR_sum06003, openSLR_sum06047, openSLR_sum07842, openSLR_sum08659, openSLR_sum09243, openSLR_sum09637, openSLR_sum09757, openSLR_tsn0045, openSLR_tsn0378, openSLR_tsn0441, openSLR_tsn1483, openSLR_tsn1498, openSLR_tsn1932, openSLR_tsn2839, openSLR_tsn3342, openSLR_tsn3629, openSLR_tsn4506, openSLR_tsn4850, openSLR_tsn5628, openSLR_tsn6116, openSLR_tsn6206, openSLR_tsn6234, openSLR_tsn6459, openSLR_tsn7674, openSLR_tsn7693, openSLR_tsn7866, openSLR_tsn7896, openSLR_tsn8333, openSLR_tsn8512, openSLR_tsn8532, openSLR_tsn8914, openSLR_tsn9061, openSLR_tsn9365, openSLR_xho0050, openSLR_xho0120, openSLR_xho1547, openSLR_xho3616, openSLR_xho4280, openSLR_xho4291, openSLR_xho5378, openSLR_xho5680, openSLR_xho6975, openSLR_xho7590, openSLR_xho7599, openSLR_xho9446, zeckou\n" + ] + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 4, + "source": [ + "#set speaker\n", + "d_vector = model.speaker_manager.get_mean_d_vector('VCTK_p260')" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 5, + "source": [ + "model.language_manager.language_id_mapping" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "{'af': 0,\n", + " 'en': 1,\n", + " 'fr-fr': 2,\n", + " 'jv': 3,\n", + " 'pt-br': 4,\n", + " 'st': 5,\n", + " 'su': 6,\n", + " 'tn': 7,\n", + " 'xh': 8}" + ] + }, + "metadata": {}, + "execution_count": 5 + } + ], + "metadata": { + "scrolled": true + } + }, + { + "cell_type": "code", + "execution_count": 6, + "source": [ + "# set scales \n", + "model.noise_scale = 0.0 # defines the noise variance applied to the random z vector at inference.\n", + "model.length_scale = 1.0 # scaler for the duration predictor. The larger it is, the slower the speech.\n", + "model.noise_scale_w = 0.0 # defines the noise variance applied to the duration predictor z vector at inference.\n", + "model.inference_noise_scale = 0.5 # defines the noise variance applied to the random z vector at inference.\n", + "model.inference_noise_scale_dp = 0.6 # defines the noise variance applied to the duration predictor z vector at inference." 
+ ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 7, + "source": [ + "text = \"Il m'a fallu beaucoup de temps pour développer une voix, et maintenant que je l'ai, je ne vais pas me taire.\"\n", + "language_id = 2\n", + "wav, alignment, _, _ = synthesis(\n", + " model,\n", + " text,\n", + " C,\n", + " \"cuda\" in str(next(model.parameters()).device),\n", + " ap,\n", + " speaker_id=None,\n", + " d_vector=d_vector,\n", + " style_wav=None,\n", + " language_id=language_id,\n", + " enable_eos_bos_chars=C.enable_eos_bos_chars,\n", + " use_griffin_lim=True,\n", + " do_trim_silence=False,\n", + " ).values()\n", + "IPython.display.display(Audio(wav, rate=ap.sample_rate))" + ], + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {} + } + ], + "metadata": {} + } + ], + "metadata": { + "interpreter": { + "hash": "b925b73899c1545aa2d9bbcf4e8e1df4138a367d2daefc2707570579325ca4c0" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3.8.10 64-bit ('TTS': conda)" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file From cd7639ca70bdd561068699b8b5dff36d86f66286 Mon Sep 17 00:00:00 2001 From: Edresson Date: Tue, 14 Sep 2021 17:27:00 -0300 Subject: [PATCH 028/220] Add voice conversion fine tuning mode --- TTS/bin/find_unique_phonemes.py | 63 +++++++++++++++++++++++++++++++++ TTS/tts/layers/losses.py | 2 +- TTS/tts/models/vits.py | 59 +++++++++++++++++++++++++++--- 3 files changed, 119 insertions(+), 5 deletions(-) create mode 100644 TTS/bin/find_unique_phonemes.py diff --git a/TTS/bin/find_unique_phonemes.py b/TTS/bin/find_unique_phonemes.py new file mode 100644 index 00000000..7ed79b36 --- /dev/null +++ b/TTS/bin/find_unique_phonemes.py @@ -0,0 +1,63 @@ +"""Find all the unique characters in a dataset""" +import argparse +from argparse import RawTextHelpFormatter + +from TTS.config import load_config +from TTS.tts.datasets import load_meta_data + +import numpy +import multiprocessing +from TTS.tts.utils.text import text2phone +from tqdm.contrib.concurrent import process_map + +def compute_phonemes(item): + try: + text = item[0] + language = item[-1] + ph = text2phone(text, language, use_espeak_phonemes=c.use_espeak_phonemes).split("|") + except: + return [] + return list(set(ph)) + +def main(): + global c + # pylint: disable=bad-option-value + parser = argparse.ArgumentParser( + description="""Find all the unique characters or phonemes in a dataset.\n\n""" + """ + Example runs: + + python TTS/bin/find_unique_chars.py --config_path config.json + """, + formatter_class=RawTextHelpFormatter, + ) + parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True) + args = parser.parse_args() + + c = load_config(args.config_path) + + # load all datasets + train_items, eval_items = load_meta_data(c.datasets, eval_split=True) + items = train_items + eval_items + print("Num items:", len(items)) + # items = items[:1000] + + phonemes = process_map(compute_phonemes, items, max_workers=multiprocessing.cpu_count(), chunksize=15) + phones = [] + for ph in phonemes: + phones.extend(ph) + phones = set(phones) + lower_phones = filter(lambda c: c.islower(), 
phones) + phones_force_lower = [c.lower() for c in phones] + phones_force_lower = set(phones_force_lower) + + + + print(f" > Number of unique phonemes: {len(phones)}") + print(f" > Unique phonemes: {''.join(sorted(phones))}") + print(f" > Unique lower phonemes: {''.join(sorted(lower_phones))}") + print(f" > Unique all forced to lower phonemes: {''.join(sorted(phones_force_lower))}") + + +if __name__ == "__main__": + main() diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index fdee9c10..cd2903b0 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -599,7 +599,7 @@ class VitsGeneratorLoss(nn.Module): feats_disc_fake, feats_disc_real, loss_duration, - fine_tuning_mode=False, + fine_tuning_mode=0, use_speaker_encoder_as_loss=False, gt_spk_emb=None, syn_spk_emb=None diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 71cc4634..a9078b26 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -149,6 +149,28 @@ class VitsArgs(Coqpit): detach_dp_input (bool): Detach duration predictor's input from the network for stopping the gradients. Defaults to True. + + use_language_embedding (bool): + Enable/Disable language embedding for multilingual models. Defaults to False. + + embedded_language_dim (int): + Number of language embedding channels. Defaults to 4. + + num_languages (int): + Number of languages for the language embedding layer. Defaults to 0. + + use_speaker_encoder_as_loss (bool): + + + use_speaker_encoder_as_loss: bool = False + speaker_encoder_config_path: str = "" + speaker_encoder_model_path: str = "" + + fine_tuning_mode (int): + Fine tuning only the vocoder part of the model, while the rest will be frozen. Defaults to 0. + Mode 0: disabled; + Mode 1: uses the distribution predicted by the encoder and It's recommended for TTS; + Mode 2: uses the distribution predicted by the encoder and It's recommended for voice conversion. """ num_chars: int = 100 @@ -194,10 +216,10 @@ class VitsArgs(Coqpit): use_language_embedding: bool = False embedded_language_dim: int = 4 num_languages: int = 0 - fine_tuning_mode: bool = False use_speaker_encoder_as_loss: bool = False speaker_encoder_config_path: str = "" speaker_encoder_model_path: str = "" + fine_tuning_mode: int = 0 @@ -565,6 +587,7 @@ class Vits(BaseTTS): y: torch.tensor, y_lengths: torch.tensor, aux_input={"d_vectors": None, "speaker_ids": None, "language_ids": None}, + waveform=None, ) -> Dict: """Forward pass of the model. @@ -621,22 +644,50 @@ class Vits(BaseTTS): m_p = torch.einsum("klmn, kjm -> kjn", [attn, m_p]) logs_p = torch.einsum("klmn, kjm -> kjn", [attn, logs_p]) - # get the z after inverse decoder - # ToDo: test if using m_p the result is better (In the SC-GlowTTS paper we used mp instead z_p) - z_f_pred = self.flow(z_p, y_mask, g=g, reverse=True) + # mode 1: like SC-GlowTTS paper; mode 2: recommended for voice conversion + if self.args.fine_tuning_mode == 1: + z_ft = m_p + elif self.args.fine_tuning_mode == 2: + z_ft = z_p + else: + raise RuntimeError(" [!] 
Invalid Fine Tunning Mode !") + + # inverse decoder and get the output + z_f_pred = self.flow(z_ft, y_mask, g=g, reverse=True) z_slice, slice_ids = rand_segment(z_f_pred, y_lengths, self.spec_segment_size) o = self.waveform_decoder(z_slice, g=g) + + wav_seg = segment( + waveform.transpose(1, 2), + slice_ids * self.config.audio.hop_length, + self.args.spec_segment_size * self.config.audio.hop_length, + ) + + if self.args.use_speaker_encoder_as_loss: + # concate generated and GT waveforms + wavs_batch = torch.cat((wav_seg, o), dim=0).squeeze(1) + pred_embs = self.speaker_encoder.forward(wavs_batch, l2_norm=True) + + # split generated and GT speaker embeddings + gt_spk_emb, syn_spk_emb = torch.chunk(pred_embs, 2, dim=0) + else: + gt_spk_emb, syn_spk_emb = None, None + outputs.update( { "model_outputs": o, "alignments": attn.squeeze(1), + "loss_duration": 0.0, "z": z, "z_p": z_p, "m_p": m_p, "logs_p": logs_p, "m_q": m_q, "logs_q": logs_q, + "waveform_seg": wav_seg, + "gt_spk_emb": gt_spk_emb, + "syn_spk_emb": syn_spk_emb } ) return outputs From 56480360cf0aa0527e2010f7cb087998cf664cc5 Mon Sep 17 00:00:00 2001 From: Edresson Date: Sun, 19 Sep 2021 13:29:09 -0300 Subject: [PATCH 029/220] Update the VITS model docs --- TTS/tts/models/vits.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index a9078b26..334e4526 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -159,16 +159,18 @@ class VitsArgs(Coqpit): num_languages (int): Number of languages for the language embedding layer. Defaults to 0. - use_speaker_encoder_as_loss (bool): - + use_speaker_encoder_as_loss (bool): + Enable/Disable Speaker Consistency Loss (SCL). Defaults to False. - use_speaker_encoder_as_loss: bool = False - speaker_encoder_config_path: str = "" - speaker_encoder_model_path: str = "" + speaker_encoder_config_path (str): + Path to the file speaker encoder config file, to use for SCL. Defaults to "". + + speaker_encoder_model_path (str): + Path to the file speaker encoder checkpoint file, to use for SCL. Defaults to "". fine_tuning_mode (int): Fine tuning only the vocoder part of the model, while the rest will be frozen. Defaults to 0. - Mode 0: disabled; + Mode 0: Disabled; Mode 1: uses the distribution predicted by the encoder and It's recommended for TTS; Mode 2: uses the distribution predicted by the encoder and It's recommended for voice conversion. 
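# A minimal configuration sketch for the fields documented above, assuming voice
# conversion fine tuning with the Speaker Consistency Loss. The VitsArgs field names
# come from this changeset; the import paths, speaker-encoder checkpoint/config paths
# and the chosen values are assumptions for illustration only.
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.models.vits import VitsArgs

model_args = VitsArgs(
    fine_tuning_mode=2,  # 2: decode from the posterior z_p, suggested for voice conversion
    use_speaker_encoder_as_loss=True,  # enable the Speaker Consistency Loss (SCL)
    speaker_encoder_model_path="speaker_encoder/best_model.pth.tar",  # assumed path
    speaker_encoder_config_path="speaker_encoder/config.json",  # assumed path
)
config = VitsConfig(model_args=model_args)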
""" From 9d2c445e3db311c0d787af807b30178408a953b9 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Sun, 19 Sep 2021 23:34:38 +0200 Subject: [PATCH 030/220] get_speaker_weighted_sampler --- TTS/tts/models/base_tts.py | 5 ++++- TTS/tts/utils/speakers.py | 9 +++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index c03a7df5..9d722222 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -12,7 +12,7 @@ from torch.utils.data.distributed import DistributedSampler from TTS.model import BaseModel from TTS.tts.configs.shared_configs import CharactersConfig from TTS.tts.datasets.dataset import TTSDataset -from TTS.tts.utils.speakers import SpeakerManager, get_speaker_manager +from TTS.tts.utils.speakers import SpeakerManager, get_speaker_manager, get_speaker_weighted_sampler from TTS.tts.utils.languages import LanguageManager, get_language_weighted_sampler from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.text import make_symbols @@ -334,6 +334,9 @@ class BaseTTS(BaseModel): if getattr(config, "use_language_weighted_sampler", False): print(" > Using Language weighted sampler") sampler = get_language_weighted_sampler(dataset.items) + elif getattr(config, "use_speaker_weighted_sampler", False): + print(" > Using Language weighted sampler") + sampler = get_speaker_weighted_sampler(dataset.items) loader = DataLoader( diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 282875af..8ccbdafc 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -431,3 +431,12 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, else: speaker_manager.save_speaker_ids_to_file(out_file_path) return speaker_manager + +def get_speaker_weighted_sampler(items: list): + speaker_names = np.array([item[2] for item in items]) + unique_speaker_names = np.unique(speaker_names).tolist() + speaker_ids = [unique_speaker_names.index(l) for l in speaker_names] + speaker_count = np.array([len(np.where(speaker_names == l)[0]) for l in unique_speaker_names]) + weight_speaker = 1. 
/ speaker_count + dataset_samples_weight = torch.from_numpy(np.array([weight_speaker[l] for l in speaker_ids])).double() + return WeightedRandomSampler(dataset_samples_weight, len(dataset_samples_weight)) \ No newline at end of file From de41165af46f8a6e4b617e3958afe4b7cdba44d8 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Sun, 19 Sep 2021 23:35:31 +0200 Subject: [PATCH 031/220] freeze vits parts --- TTS/tts/models/vits.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 334e4526..c24fec68 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -222,6 +222,9 @@ class VitsArgs(Coqpit): speaker_encoder_config_path: str = "" speaker_encoder_model_path: str = "" fine_tuning_mode: int = 0 + freeze_encoder: bool = False + freeze_DP: bool = False + freeze_PE: bool = False @@ -781,6 +784,20 @@ class Vits(BaseTTS): self.waveform_decoder.train() self.disc.train() + if self.args.freeze_encoder: + for param in self.text_encoder.parameters(): + param.requires_grad = False + for param in self.emb_l.parameters(): + param.requires_grad = False + + if self.args.freeze_PE: + for param in self.posterior_encoder.parameters(): + param.requires_grad = False + + if self.args.freeze_DP: + for param in self.duration_predictor.parameters(): + param.requires_grad = False + if optimizer_idx == 0: text_input = batch["text_input"] text_lengths = batch["text_lengths"] From 39aff6685efb124fc7de81535f4033ca3fbbf37e Mon Sep 17 00:00:00 2001 From: Edresson Date: Sun, 19 Sep 2021 21:06:58 -0300 Subject: [PATCH 032/220] Add freeze vocoder generator and flow-based decoder option --- TTS/tts/models/vits.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index c24fec68..212e7779 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -225,6 +225,8 @@ class VitsArgs(Coqpit): freeze_encoder: bool = False freeze_DP: bool = False freeze_PE: bool = False + freeze_flow_decoder: bool = False + freeze_waveform_decoder: bool = False @@ -787,9 +789,11 @@ class Vits(BaseTTS): if self.args.freeze_encoder: for param in self.text_encoder.parameters(): param.requires_grad = False - for param in self.emb_l.parameters(): - param.requires_grad = False - + + if hasattr(self, 'emb_l'): + for param in self.emb_l.parameters(): + param.requires_grad = False + if self.args.freeze_PE: for param in self.posterior_encoder.parameters(): param.requires_grad = False @@ -798,6 +802,14 @@ class Vits(BaseTTS): for param in self.duration_predictor.parameters(): param.requires_grad = False + if self.args.freeze_flow_decoder: + for param in self.flow.parameters(): + param.requires_grad = False + + if self.args.freeze_waveform_decoder: + for param in self.waveform_decoder.parameters(): + param.requires_grad = False + if optimizer_idx == 0: text_input = batch["text_input"] text_lengths = batch["text_lengths"] From 3ac428340d661585e18013c54114a1b87ce1e009 Mon Sep 17 00:00:00 2001 From: Edresson Date: Tue, 19 Oct 2021 08:07:48 -0300 Subject: [PATCH 033/220] Add audio resample in the speaker consistency loss --- TTS/tts/models/vits.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 212e7779..f72918a5 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -5,7 +5,7 @@ from itertools import chain from typing import Dict, List, Tuple import torch -import math +import 
torchaudio from coqpit import Coqpit from torch import nn from torch.cuda.amp.autocast_mode import autocast @@ -159,12 +159,12 @@ class VitsArgs(Coqpit): num_languages (int): Number of languages for the language embedding layer. Defaults to 0. - use_speaker_encoder_as_loss (bool): + use_speaker_encoder_as_loss (bool): Enable/Disable Speaker Consistency Loss (SCL). Defaults to False. speaker_encoder_config_path (str): Path to the file speaker encoder config file, to use for SCL. Defaults to "". - + speaker_encoder_model_path (str): Path to the file speaker encoder checkpoint file, to use for SCL. Defaults to "". @@ -267,6 +267,7 @@ class Vits(BaseTTS): self.END2END = True self.speaker_manager = speaker_manager + self.audio_config = config["audio"] if config.__class__.__name__ == "VitsConfig": # loading from VitsConfig if "num_chars" not in config: @@ -412,7 +413,13 @@ class Vits(BaseTTS): param.requires_grad = False print(" > External Speaker Encoder Loaded !!") + + if hasattr(self.speaker_encoder, "audio_config") and self.audio_config["sample_rate"] != self.speaker_encoder.audio_config["sample_rate"]: + self.audio_transform = torchaudio.transforms.Resample(orig_freq=self.audio_config["sample_rate"], new_freq=self.speaker_encoder.audio_config["sample_rate"]) + else: + self.audio_transform = None else: + self.audio_transform = None self.speaker_encoder = None def init_multilingual(self, config: Coqpit, data: List = None): @@ -560,9 +567,14 @@ class Vits(BaseTTS): self.args.spec_segment_size * self.config.audio.hop_length, ) - if self.args.use_speaker_encoder_as_loss: + if self.args.use_speaker_encoder_as_loss and self.speaker_encoder is not None: # concate generated and GT waveforms wavs_batch = torch.cat((wav_seg, o), dim=0).squeeze(1) + + # resample audio to speaker encoder sample_rate + if self.audio_transform is not None: + wavs_batch = self.audio_transform(wavs_batch) + pred_embs = self.speaker_encoder.forward(wavs_batch, l2_norm=True) # split generated and GT speaker embeddings @@ -671,9 +683,14 @@ class Vits(BaseTTS): self.args.spec_segment_size * self.config.audio.hop_length, ) - if self.args.use_speaker_encoder_as_loss: + if self.args.use_speaker_encoder_as_loss and self.speaker_encoder is not None: # concate generated and GT waveforms wavs_batch = torch.cat((wav_seg, o), dim=0).squeeze(1) + + # resample audio to speaker encoder sample_rate + if self.audio_transform is not None: + wavs_batch = self.audio_transform(wavs_batch) + pred_embs = self.speaker_encoder.forward(wavs_batch, l2_norm=True) # split generated and GT speaker embeddings From c80cf67d3da6a4faee5d2a72de838d83f77ca2f5 Mon Sep 17 00:00:00 2001 From: Edresson Date: Tue, 26 Oct 2021 11:35:18 -0300 Subject: [PATCH 034/220] Add remove silence VAD script --- TTS/bin/remove_silence_using_vad.py | 213 ++++++++++++++++++++++++++++ requirements.txt | 1 + 2 files changed, 214 insertions(+) create mode 100755 TTS/bin/remove_silence_using_vad.py diff --git a/TTS/bin/remove_silence_using_vad.py b/TTS/bin/remove_silence_using_vad.py new file mode 100755 index 00000000..c7541cc8 --- /dev/null +++ b/TTS/bin/remove_silence_using_vad.py @@ -0,0 +1,213 @@ +# This code is adpated from: https://github.com/wiseman/py-webrtcvad/blob/master/example.py +import os +import tqdm +import glob +import argparse +import pathlib + +import collections +import contextlib +import sys +import wave +import numpy as np +import webrtcvad +from tqdm.contrib.concurrent import process_map +import multiprocessing +from itertools import chain + +def 
read_wave(path): + """Reads a .wav file. + + Takes the path, and returns (PCM audio data, sample rate). + """ + with contextlib.closing(wave.open(path, 'rb')) as wf: + num_channels = wf.getnchannels() + assert num_channels == 1 + sample_width = wf.getsampwidth() + assert sample_width == 2 + sample_rate = wf.getframerate() + assert sample_rate in (8000, 16000, 32000, 48000) + pcm_data = wf.readframes(wf.getnframes()) + return pcm_data, sample_rate + + +def write_wave(path, audio, sample_rate): + """Writes a .wav file. + + Takes path, PCM audio data, and sample rate. + """ + with contextlib.closing(wave.open(path, 'wb')) as wf: + wf.setnchannels(1) + wf.setsampwidth(2) + wf.setframerate(sample_rate) + wf.writeframes(audio) + + +class Frame(object): + """Represents a "frame" of audio data.""" + def __init__(self, bytes, timestamp, duration): + self.bytes = bytes + self.timestamp = timestamp + self.duration = duration + + +def frame_generator(frame_duration_ms, audio, sample_rate): + """Generates audio frames from PCM audio data. + + Takes the desired frame duration in milliseconds, the PCM data, and + the sample rate. + + Yields Frames of the requested duration. + """ + n = int(sample_rate * (frame_duration_ms / 1000.0) * 2) + offset = 0 + timestamp = 0.0 + duration = (float(n) / sample_rate) / 2.0 + while offset + n < len(audio): + yield Frame(audio[offset:offset + n], timestamp, duration) + timestamp += duration + offset += n + + +def vad_collector(sample_rate, frame_duration_ms, + padding_duration_ms, vad, frames): + """Filters out non-voiced audio frames. + + Given a webrtcvad.Vad and a source of audio frames, yields only + the voiced audio. + + Uses a padded, sliding window algorithm over the audio frames. + When more than 90% of the frames in the window are voiced (as + reported by the VAD), the collector triggers and begins yielding + audio frames. Then the collector waits until 90% of the frames in + the window are unvoiced to detrigger. + + The window is padded at the front and back to provide a small + amount of silence or the beginnings/endings of speech around the + voiced frames. + + Arguments: + + sample_rate - The audio sample rate, in Hz. + frame_duration_ms - The frame duration in milliseconds. + padding_duration_ms - The amount to pad the window, in milliseconds. + vad - An instance of webrtcvad.Vad. + frames - a source of audio frames (sequence or generator). + + Returns: A generator that yields PCM audio data. + """ + num_padding_frames = int(padding_duration_ms / frame_duration_ms) + # We use a deque for our sliding window/ring buffer. + ring_buffer = collections.deque(maxlen=num_padding_frames) + # We have two states: TRIGGERED and NOTTRIGGERED. We start in the + # NOTTRIGGERED state. + triggered = False + + voiced_frames = [] + for frame in frames: + is_speech = vad.is_speech(frame.bytes, sample_rate) + + # sys.stdout.write('1' if is_speech else '0') + if not triggered: + ring_buffer.append((frame, is_speech)) + num_voiced = len([f for f, speech in ring_buffer if speech]) + # If we're NOTTRIGGERED and more than 90% of the frames in + # the ring buffer are voiced frames, then enter the + # TRIGGERED state. + if num_voiced > 0.9 * ring_buffer.maxlen: + triggered = True + # sys.stdout.write('+(%s)' % (ring_buffer[0][0].timestamp,)) + # We want to yield all the audio we see from now until + # we are NOTTRIGGERED, but we have to start with the + # audio that's already in the ring buffer. 
+ for f, s in ring_buffer: + voiced_frames.append(f) + ring_buffer.clear() + else: + # We're in the TRIGGERED state, so collect the audio data + # and add it to the ring buffer. + voiced_frames.append(frame) + ring_buffer.append((frame, is_speech)) + num_unvoiced = len([f for f, speech in ring_buffer if not speech]) + # If more than 90% of the frames in the ring buffer are + # unvoiced, then enter NOTTRIGGERED and yield whatever + # audio we've collected. + if num_unvoiced > 0.9 * ring_buffer.maxlen: + #sys.stdout.write('-(%s)' % (frame.timestamp + frame.duration)) + triggered = False + yield b''.join([f.bytes for f in voiced_frames]) + ring_buffer.clear() + voiced_frames = [] + # If we have any leftover voiced audio when we run out of input, + # yield it. + if voiced_frames: + yield b''.join([f.bytes for f in voiced_frames]) + +def remove_silence(filepath): + filename = os.path.basename(filepath) + output_path = filepath.replace(os.path.join(args.input_dir, ''),os.path.join(args.output_dir, '')) + # ignore if the file exists + if os.path.exists(output_path) and not args.force: + return False + # create all directory structure + pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True) + padding_duration_ms = 300 # default 300 + audio, sample_rate = read_wave(filepath) + vad = webrtcvad.Vad(int(args.aggressiveness)) + frames = frame_generator(30, audio, sample_rate) + frames = list(frames) + segments = vad_collector(sample_rate, 30, padding_duration_ms, vad, frames) + flag = False + segments = list(segments) + num_segments = len(segments) + + if num_segments != 0: + for i, segment in reversed(list(enumerate(segments))): + if i >= 1: + if flag == False: + concat_segment = segment + flag = True + else: + concat_segment = segment + concat_segment + else: + if flag: + segment = segment + concat_segment + write_wave(output_path, segment, sample_rate) + print(output_path) + return True + else: + print("> Just Copying the file to:", output_path) + # if fail to remove silence just write the file + write_wave(output_path, audio, sample_rate) + +def preprocess_audios(): + files = sorted(glob.glob(os.path.join(args.input_dir, args.glob), recursive=True)) + print("> Number of files: ", len(files)) + if not args.force: + print("> Ignoring files that already exist in the output directory.") + + if files: + # create threads + num_threads = multiprocessing.cpu_count() + process_map(remove_silence, files, max_workers=num_threads, chunksize=15) + else: + print("> No files Found !") + +if __name__ == "__main__": + """ + usage + python remove_silence.py -i=VCTK-Corpus-bk/ -o=../VCTK-Corpus-removed-silence -g=wav48/*/*.wav -a=2 + """ + parser = argparse.ArgumentParser() + parser.add_argument('-i', '--input_dir', type=str, default='../VCTK-Corpus', + help='Dataset root dir') + parser.add_argument('-o', '--output_dir', type=str, default='../VCTK-Corpus-removed-silence', + help='Output Dataset dir') + parser.add_argument('-f', '--force', type=bool, default=True, + help='Force the replace of exists files') + parser.add_argument('-g', '--glob', type=str, default='**/*.wav', + help='path in glob format for acess wavs from input_dir. ex: wav48/*/*.wav') + parser.add_argument('-a', '--aggressiveness', type=int, default=2, + help='set its aggressiveness mode, which is an integer between 0 and 3. 
0 is the least aggressive about filtering out non-speech, 3 is the most aggressive.') + args = parser.parse_args() + preprocess_audios() diff --git a/requirements.txt b/requirements.txt index 3ec33ceb..140cf743 100644 --- a/requirements.txt +++ b/requirements.txt @@ -26,3 +26,4 @@ unidic-lite==1.0.8 gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=2.0.0 fsspec>=2021.04.0 pyworld +webrtcvad \ No newline at end of file From 5c8980396865e22f7a98320b53a1dc96de47f183 Mon Sep 17 00:00:00 2001 From: Julian WEBER Date: Wed, 27 Oct 2021 11:54:05 +0200 Subject: [PATCH 035/220] Merge dataset --- TTS/tts/datasets/dataset.py | 123 ------------------------------------ 1 file changed, 123 deletions(-) diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index 78c6c33d..ccfa70f1 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -56,10 +56,6 @@ class TTSDataset(Dataset): meta_data (list): List of dataset instances. - compute_f0 (bool): compute f0 if True. Defaults to False. - - f0_cache_path (str): Path to store f0 cache. Defaults to None. - characters (dict): `dict` of custom text characters used for converting texts to sequences. custom_symbols (list): List of custom symbols used for converting texts to sequences. Models using its own @@ -109,8 +105,6 @@ class TTSDataset(Dataset): self.cleaners = text_cleaner self.compute_linear_spec = compute_linear_spec self.return_wav = return_wav - self.compute_f0 = compute_f0 - self.f0_cache_path = f0_cache_path self.min_seq_len = min_seq_len self.max_seq_len = max_seq_len self.ap = ap @@ -339,7 +333,6 @@ class TTSDataset(Dataset): else: lengths = np.array([len(ins[0]) for ins in self.items]) - # sort items based on the sequence length in ascending order idxs = np.argsort(lengths) new_items = [] ignored = [] @@ -349,10 +342,7 @@ class TTSDataset(Dataset): ignored.append(idx) else: new_items.append(self.items[idx]) - # shuffle batch groups - # create batches with similar length items - # the larger the `batch_group_size`, the higher the length variety in a batch. if self.batch_group_size > 0: for i in range(len(new_items) // self.batch_group_size): offset = i * self.batch_group_size @@ -360,14 +350,8 @@ class TTSDataset(Dataset): temp_items = new_items[offset:end_offset] random.shuffle(temp_items) new_items[offset:end_offset] = temp_items - - if len(new_items) == 0: - raise RuntimeError(" [!] No items left after filtering.") - - # update items to the new sorted items self.items = new_items - # logging if self.verbose: print(" | > Max length sequence: {}".format(np.max(lengths))) print(" | > Min length sequence: {}".format(np.min(lengths))) @@ -554,110 +538,3 @@ class TTSDataset(Dataset): ) ) ) - - -class PitchExtractor: - """Pitch Extractor for computing F0 from wav files. - - Args: - items (List[List]): Dataset samples. - verbose (bool): Whether to print the progress. 
- """ - - def __init__( - self, - items: List[List], - verbose=False, - ): - self.items = items - self.verbose = verbose - self.mean = None - self.std = None - - @staticmethod - def create_pitch_file_path(wav_file, cache_path): - file_name = os.path.splitext(os.path.basename(wav_file))[0] - pitch_file = os.path.join(cache_path, file_name + "_pitch.npy") - return pitch_file - - @staticmethod - def _compute_and_save_pitch(ap, wav_file, pitch_file=None): - wav = ap.load_wav(wav_file) - pitch = ap.compute_f0(wav) - if pitch_file: - np.save(pitch_file, pitch) - return pitch - - @staticmethod - def compute_pitch_stats(pitch_vecs): - nonzeros = np.concatenate([v[np.where(v != 0.0)[0]] for v in pitch_vecs]) - mean, std = np.mean(nonzeros), np.std(nonzeros) - return mean, std - - def normalize_pitch(self, pitch): - zero_idxs = np.where(pitch == 0.0)[0] - pitch = pitch - self.mean - pitch = pitch / self.std - pitch[zero_idxs] = 0.0 - return pitch - - def denormalize_pitch(self, pitch): - zero_idxs = np.where(pitch == 0.0)[0] - pitch *= self.std - pitch += self.mean - pitch[zero_idxs] = 0.0 - return pitch - - @staticmethod - def load_or_compute_pitch(ap, wav_file, cache_path): - """ - compute pitch and return a numpy array of pitch values - """ - pitch_file = PitchExtractor.create_pitch_file_path(wav_file, cache_path) - if not os.path.exists(pitch_file): - pitch = PitchExtractor._compute_and_save_pitch(ap, wav_file, pitch_file) - else: - pitch = np.load(pitch_file) - return pitch.astype(np.float32) - - @staticmethod - def _pitch_worker(args): - item = args[0] - ap = args[1] - cache_path = args[2] - _, wav_file, *_ = item - pitch_file = PitchExtractor.create_pitch_file_path(wav_file, cache_path) - if not os.path.exists(pitch_file): - pitch = PitchExtractor._compute_and_save_pitch(ap, wav_file, pitch_file) - return pitch - return None - - def compute_pitch(self, ap, cache_path, num_workers=0): - """Compute the input sequences with multi-processing. 
- Call it before passing dataset to the data loader to cache the input sequences for faster data loading.""" - if not os.path.exists(cache_path): - os.makedirs(cache_path, exist_ok=True) - - if self.verbose: - print(" | > Computing pitch features ...") - if num_workers == 0: - pitch_vecs = [] - for _, item in enumerate(tqdm.tqdm(self.items)): - pitch_vecs += [self._pitch_worker([item, ap, cache_path])] - else: - with Pool(num_workers) as p: - pitch_vecs = list( - tqdm.tqdm( - p.imap(PitchExtractor._pitch_worker, [[item, ap, cache_path] for item in self.items]), - total=len(self.items), - ) - ) - pitch_mean, pitch_std = self.compute_pitch_stats(pitch_vecs) - pitch_stats = {"mean": pitch_mean, "std": pitch_std} - np.save(os.path.join(cache_path, "pitch_stats"), pitch_stats, allow_pickle=True) - - def load_pitch_stats(self, cache_path): - stats_path = os.path.join(cache_path, "pitch_stats.npy") - stats = np.load(stats_path, allow_pickle=True).item() - self.mean = stats["mean"].astype(np.float32) - self.std = stats["std"].astype(np.float32) From 3440c54bbeb2f431c0f95da026b463f96967911b Mon Sep 17 00:00:00 2001 From: Julian WEBER Date: Wed, 27 Oct 2021 12:02:02 +0200 Subject: [PATCH 036/220] get_aux_input --- TTS/tts/models/vits.py | 64 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 62 insertions(+), 2 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index f72918a5..078d4973 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -387,6 +387,25 @@ class Vits(BaseTTS): if config.use_d_vector_file: self._init_d_vector(config) + # TODO: make this a function + if config.use_speaker_encoder_as_loss: + if not config.speaker_encoder_model_path or not config.speaker_encoder_config_path: + raise RuntimeError(" [!] To use the speaker encoder loss you need to specify speaker_encoder_model_path and speaker_encoder_config_path !!") + self.speaker_manager.init_speaker_encoder(config.speaker_encoder_model_path, config.speaker_encoder_config_path) + self.speaker_encoder = self.speaker_manager.speaker_encoder.train() + for param in self.speaker_encoder.parameters(): + param.requires_grad = False + + print(" > External Speaker Encoder Loaded !!") + + if hasattr(self.speaker_encoder, "audio_config") and self.audio_config["sample_rate"] != self.speaker_encoder.audio_config["sample_rate"]: + self.audio_transform = torchaudio.transforms.Resample(orig_freq=self.audio_config["sample_rate"], new_freq=self.speaker_encoder.audio_config["sample_rate"]) + else: + self.audio_transform = None + else: + self.audio_transform = None + self.speaker_encoder = None + def _init_speaker_embedding(self, config): # pylint: disable=attribute-defined-outside-init if config.speakers_file is not None: @@ -469,8 +488,49 @@ class Vits(BaseTTS): return sid, g, lid def get_aux_input(self, aux_input: Dict): - sid, g = self._set_cond_input(aux_input) - return {"speaker_id": sid, "style_wav": None, "d_vector": g} + sid, g, lid = self._set_cond_input(aux_input) + return {"speaker_id": sid, "style_wav": None, "d_vector": g, "language_id": lid} + + def get_aux_input_from_test_setences(self, sentence_info): + if hasattr(self.config, "model_args"): + config = self.config.model_args + else: + config = self.config + + # extract speaker and language info + text, speaker_name, style_wav, language_name = None, None, None, None + + if isinstance(sentence_info, list): + if len(sentence_info) == 1: + text = sentence_info[0] + elif len(sentence_info) == 2: + text, speaker_name = sentence_info + elif 
len(sentence_info) == 3: + text, speaker_name, style_wav = sentence_info + elif len(sentence_info) == 4: + text, speaker_name, style_wav, language_name = sentence_info + else: + text = sentence_info + + # get speaker id/d_vector + speaker_id, d_vector, language_id = None, None, None + if hasattr(self, "speaker_manager"): + if config.use_d_vector_file: + if speaker_name is None: + d_vector = self.speaker_manager.get_random_d_vector() + else: + d_vector = self.speaker_manager.get_d_vector_by_speaker(speaker_name) + elif config.use_speaker_embedding: + if speaker_name is None: + speaker_id = self.speaker_manager.get_random_speaker_id() + else: + speaker_id = self.speaker_manager.speaker_ids[speaker_name] + + # get language id + if hasattr(self, "language_manager") and config.use_language_embedding and language_name is not None: + language_id = self.language_manager.language_id_mapping[language_name] + + return {"text": text, "speaker_id": speaker_id, "style_wav": style_wav, "d_vector": d_vector, "language_id": language_id} def forward( self, From ec83ffbd7ad03b965824bcc86f8171e574831006 Mon Sep 17 00:00:00 2001 From: Julian WEBER Date: Wed, 27 Oct 2021 13:40:11 +0200 Subject: [PATCH 037/220] PitchExtractor --- TTS/tts/datasets/dataset.py | 106 ++++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index ccfa70f1..635ffb38 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -105,6 +105,7 @@ class TTSDataset(Dataset): self.cleaners = text_cleaner self.compute_linear_spec = compute_linear_spec self.return_wav = return_wav + self.compute_f0 = compute_f0 self.min_seq_len = min_seq_len self.max_seq_len = max_seq_len self.ap = ap @@ -538,3 +539,108 @@ class TTSDataset(Dataset): ) ) ) + +class PitchExtractor: + """Pitch Extractor for computing F0 from wav files. + Args: + items (List[List]): Dataset samples. + verbose (bool): Whether to print the progress. 
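# A minimal usage sketch for the PitchExtractor class added in this patch. Method names
# and signatures follow the code as written here; the config path and cache directory
# are assumptions for illustration.
from TTS.config import load_config
from TTS.tts.datasets import load_meta_data
from TTS.tts.datasets.dataset import PitchExtractor
from TTS.utils.audio import AudioProcessor

c = load_config("config.json")  # assumed path to a dataset/training config
train_samples, _ = load_meta_data(c.datasets, eval_split=True)
ap = AudioProcessor(**c.audio)

pitch_extractor = PitchExtractor(train_samples, verbose=True)
pitch_extractor.compute_pitch(ap, cache_path="f0_cache", num_workers=4)  # writes <wav>_pitch.npy files and pitch_stats.npy
pitch_extractor.load_pitch_stats("f0_cache")  # sets the mean/std used by normalize_pitch()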
+ """ + + def __init__( + self, + items: List[List], + verbose=False, + ): + self.items = items + self.verbose = verbose + self.mean = None + self.std = None + + @staticmethod + def create_pitch_file_path(wav_file, cache_path): + file_name = os.path.splitext(os.path.basename(wav_file))[0] + pitch_file = os.path.join(cache_path, file_name + "_pitch.npy") + return pitch_file + + @staticmethod + def _compute_and_save_pitch(ap, wav_file, pitch_file=None): + wav = ap.load_wav(wav_file) + pitch = ap.compute_f0(wav) + if pitch_file: + np.save(pitch_file, pitch) + return pitch + + @staticmethod + def compute_pitch_stats(pitch_vecs): + nonzeros = np.concatenate([v[np.where(v != 0.0)[0]] for v in pitch_vecs]) + mean, std = np.mean(nonzeros), np.std(nonzeros) + return mean, std + + def normalize_pitch(self, pitch): + zero_idxs = np.where(pitch == 0.0)[0] + pitch = pitch - self.mean + pitch = pitch / self.std + pitch[zero_idxs] = 0.0 + return pitch + + def denormalize_pitch(self, pitch): + zero_idxs = np.where(pitch == 0.0)[0] + pitch *= self.std + pitch += self.mean + pitch[zero_idxs] = 0.0 + return pitch + + @staticmethod + def load_or_compute_pitch(ap, wav_file, cache_path): + """ + compute pitch and return a numpy array of pitch values + """ + pitch_file = PitchExtractor.create_pitch_file_path(wav_file, cache_path) + if not os.path.exists(pitch_file): + pitch = PitchExtractor._compute_and_save_pitch(ap, wav_file, pitch_file) + else: + pitch = np.load(pitch_file) + return pitch.astype(np.float32) + + @staticmethod + def _pitch_worker(args): + item = args[0] + ap = args[1] + cache_path = args[2] + _, wav_file, *_ = item + pitch_file = PitchExtractor.create_pitch_file_path(wav_file, cache_path) + if not os.path.exists(pitch_file): + pitch = PitchExtractor._compute_and_save_pitch(ap, wav_file, pitch_file) + return pitch + return None + + def compute_pitch(self, ap, cache_path, num_workers=0): + """Compute the input sequences with multi-processing. 
+ Call it before passing dataset to the data loader to cache the input sequences for faster data loading.""" + if not os.path.exists(cache_path): + os.makedirs(cache_path, exist_ok=True) + + if self.verbose: + print(" | > Computing pitch features ...") + if num_workers == 0: + pitch_vecs = [] + for _, item in enumerate(tqdm.tqdm(self.items)): + pitch_vecs += [self._pitch_worker([item, ap, cache_path])] + else: + with Pool(num_workers) as p: + pitch_vecs = list( + tqdm.tqdm( + p.imap(PitchExtractor._pitch_worker, [[item, ap, cache_path] for item in self.items]), + total=len(self.items), + ) + ) + pitch_mean, pitch_std = self.compute_pitch_stats(pitch_vecs) + pitch_stats = {"mean": pitch_mean, "std": pitch_std} + np.save(os.path.join(cache_path, "pitch_stats"), pitch_stats, allow_pickle=True) + + def load_pitch_stats(self, cache_path): + stats_path = os.path.join(cache_path, "pitch_stats.npy") + stats = np.load(stats_path, allow_pickle=True).item() + self.mean = stats["mean"].astype(np.float32) + self.std = stats["std"].astype(np.float32) \ No newline at end of file From 21b49c3acdd26b4042c52cc498aa7b7c7a7bf3c8 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Wed, 27 Oct 2021 13:45:49 +0200 Subject: [PATCH 038/220] fix collate_fn --- TTS/tts/datasets/dataset.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index 635ffb38..38af1469 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -402,8 +402,6 @@ class TTSDataset(Dataset): # convert list of dicts to dict of lists batch = {k: [dic[k] for dic in batch] for k in batch[0]} - speaker_names = [batch[idx]["speaker_name"] for idx in ids_sorted_decreasing] - # get language ids from language names if self.language_id_mapping is not None: language_names = [batch[idx]["language_name"] for idx in ids_sorted_decreasing] From 6ed55ba57e961917b5511dff7b5359c8a95ee74b Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Fri, 29 Oct 2021 17:09:10 +0200 Subject: [PATCH 039/220] fix test vits --- TTS/trainer.py | 2 +- TTS/tts/configs/vits_config.py | 22 +++---------------- TTS/tts/datasets/dataset.py | 3 +-- TTS/tts/models/base_tts.py | 9 ++++++-- TTS/tts/models/vits.py | 5 +---- tests/tts_tests/test_vits_d-vectors_train.py | 3 +-- .../tts_tests/test_vits_multilingual_train.py | 3 ++- .../tts_tests/test_vits_speaker_emb_train.py | 2 +- 8 files changed, 17 insertions(+), 32 deletions(-) diff --git a/TTS/trainer.py b/TTS/trainer.py index e8911ba3..665f2589 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -261,7 +261,7 @@ class Trainer: self.run_get_model(self.config, get_model) if hasattr(self.model, "init_multilingual"): - self.model.init_multilingual(self.config, self.data_train + self.data_eval) + self.model.init_multilingual(self.config, self.train_samples + self.eval_samples) config = self.config.model_args if hasattr(self.config, "model_args") else self.config # save speakers json if config.use_language_embedding and self.model.language_manager.num_languages > 1: diff --git a/TTS/tts/configs/vits_config.py b/TTS/tts/configs/vits_config.py index ece414a6..a6f2210d 100644 --- a/TTS/tts/configs/vits_config.py +++ b/TTS/tts/configs/vits_config.py @@ -154,22 +154,6 @@ class VitsConfig(BaseTTSConfig): d_vector_dim: int = None def __post_init__(self): - # Pass multi-speaker parameters to the model args as `model.init_multispeaker()` looks for it there. 
- if self.num_speakers > 0: - self.model_args.num_speakers = self.num_speakers - - # speaker embedding settings - if self.use_speaker_embedding: - self.model_args.use_speaker_embedding = True - if self.speakers_file: - self.model_args.speakers_file = self.speakers_file - if self.speaker_embedding_channels: - self.model_args.speaker_embedding_channels = self.speaker_embedding_channels - - # d-vector settings - if self.use_d_vector_file: - self.model_args.use_d_vector_file = True - if self.d_vector_dim is not None and self.d_vector_dim > 0: - self.model_args.d_vector_dim = self.d_vector_dim - if self.d_vector_file: - self.model_args.d_vector_file = self.d_vector_file + for key in self.model_args.keys(): + if hasattr(self, key): + self[key] = self.model_args[key] diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index 38af1469..c2818897 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -404,8 +404,7 @@ class TTSDataset(Dataset): # get language ids from language names if self.language_id_mapping is not None: - language_names = [batch[idx]["language_name"] for idx in ids_sorted_decreasing] - language_ids = [self.language_id_mapping[ln] for ln in language_names] + language_ids = [self.language_id_mapping[ln] for ln in batch["language_name"]] else: language_ids = None # get pre-computed d-vectors diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index 9d722222..df6c52f3 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -245,8 +245,13 @@ class BaseTTS(BaseModel): # setup multi-speaker attributes if hasattr(self, "speaker_manager") and self.speaker_manager is not None: - speaker_id_mapping = self.speaker_manager.speaker_ids if config.use_speaker_embedding else None - d_vector_mapping = self.speaker_manager.d_vectors if config.use_d_vector_file else None + if hasattr(config, "model_args"): + speaker_id_mapping = self.speaker_manager.speaker_ids if config.model_args.use_speaker_embedding else None + d_vector_mapping = self.speaker_manager.d_vectors if config.model_args.use_d_vector_file else None + config.use_d_vector_file = config.model_args.use_d_vector_file + else: + speaker_id_mapping = self.speaker_manager.speaker_ids if config.use_speaker_embedding else None + d_vector_mapping = self.speaker_manager.d_vectors if config.use_d_vector_file else None else: speaker_id_mapping = None d_vector_mapping = None diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 078d4973..bc503cb5 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -376,8 +376,7 @@ class Vits(BaseTTS): data (List, optional): Dataset items to infer number of speakers. Defaults to None. 
""" self.embedded_speaker_dim = 0 - if hasattr(config, "model_args"): - config = config.model_args + config = config.model_args self.num_speakers = config.num_speakers @@ -1033,7 +1032,6 @@ class Vits(BaseTTS): test_audios = {} test_figures = {} test_sentences = self.config.test_sentences - for idx, s_info in enumerate(test_sentences): try: aux_inputs = self.get_aux_input_from_test_setences(s_info) @@ -1051,7 +1049,6 @@ class Vits(BaseTTS): use_griffin_lim=True, do_trim_silence=False, ).values() - test_audios["{}-audio".format(idx)] = wav test_figures["{}-alignment".format(idx)] = plot_alignment(alignment.T, output_fig=False) except: # pylint: disable=bare-except diff --git a/tests/tts_tests/test_vits_d-vectors_train.py b/tests/tts_tests/test_vits_d-vectors_train.py index af0e0eba..213669f5 100644 --- a/tests/tts_tests/test_vits_d-vectors_train.py +++ b/tests/tts_tests/test_vits_d-vectors_train.py @@ -3,7 +3,7 @@ import os import shutil from tests import get_device_id, get_tests_output_path, run_cli -from TTS.tts.configs import VitsConfig +from TTS.tts.configs.vits_config import VitsConfig config_path = os.path.join(get_tests_output_path(), "test_model_config.json") output_path = os.path.join(get_tests_output_path(), "train_outputs") @@ -33,7 +33,6 @@ config.audio.do_trim_silence = True config.audio.trim_db = 60 # active multispeaker d-vec mode -config.model_args.use_speaker_embedding = True config.model_args.use_d_vector_file = True config.model_args.d_vector_file = "tests/data/ljspeech/speakers.json" config.model_args.d_vector_dim = 256 diff --git a/tests/tts_tests/test_vits_multilingual_train.py b/tests/tts_tests/test_vits_multilingual_train.py index 10e66b81..664de57e 100644 --- a/tests/tts_tests/test_vits_multilingual_train.py +++ b/tests/tts_tests/test_vits_multilingual_train.py @@ -3,7 +3,8 @@ import os import shutil from tests import get_device_id, get_tests_output_path, run_cli -from TTS.tts.configs import BaseDatasetConfig, VitsConfig +from TTS.tts.configs.vits_config import VitsConfig +from TTS.config.shared_configs import BaseDatasetConfig config_path = os.path.join(get_tests_output_path(), "test_model_config.json") output_path = os.path.join(get_tests_output_path(), "train_outputs") diff --git a/tests/tts_tests/test_vits_speaker_emb_train.py b/tests/tts_tests/test_vits_speaker_emb_train.py index 7028a983..6cc1dabd 100644 --- a/tests/tts_tests/test_vits_speaker_emb_train.py +++ b/tests/tts_tests/test_vits_speaker_emb_train.py @@ -3,7 +3,7 @@ import os import shutil from tests import get_device_id, get_tests_output_path, run_cli -from TTS.tts.configs import VitsConfig +from TTS.tts.configs.vits_config import VitsConfig config_path = os.path.join(get_tests_output_path(), "test_model_config.json") output_path = os.path.join(get_tests_output_path(), "train_outputs") From 20ac31dc71ad02392a421c7eb323b4d7e52eebbc Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Fri, 29 Oct 2021 19:05:26 +0200 Subject: [PATCH 040/220] fix f0_cache_path in dataset --- TTS/tts/datasets/dataset.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index c2818897..fc51c766 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -56,6 +56,10 @@ class TTSDataset(Dataset): meta_data (list): List of dataset instances. + compute_f0 (bool): compute f0 if True. Defaults to False. + + f0_cache_path (str): Path to store f0 cache. Defaults to None. 
+ characters (dict): `dict` of custom text characters used for converting texts to sequences. custom_symbols (list): List of custom symbols used for converting texts to sequences. Models using its own @@ -106,6 +110,7 @@ class TTSDataset(Dataset): self.compute_linear_spec = compute_linear_spec self.return_wav = return_wav self.compute_f0 = compute_f0 + self.f0_cache_path = f0_cache_path self.min_seq_len = min_seq_len self.max_seq_len = max_seq_len self.ap = ap From 88d6399e12e52a0a547957d53bbcd29ef20b8ebc Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Tue, 2 Nov 2021 16:33:40 +0100 Subject: [PATCH 041/220] fix test sentence synthesis --- TTS/tts/utils/synthesis.py | 1 + 1 file changed, 1 insertion(+) diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 63fe92c3..6d998492 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -175,6 +175,7 @@ def embedding_to_torch(d_vector, cuda=False): if d_vector is not None: d_vector = np.asarray(d_vector) d_vector = torch.from_numpy(d_vector).type(torch.FloatTensor) + d_vector = d_vector.squeeze().unsqueeze(0) if cuda: return d_vector.cuda() return d_vector From e1bdeacd2ef46934a09d54cac901245d3db52977 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Tue, 2 Nov 2021 17:30:20 +0100 Subject: [PATCH 042/220] Add torchaudio in requirements.txt --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 140cf743..cf4798b2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -26,4 +26,5 @@ unidic-lite==1.0.8 gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=2.0.0 fsspec>=2021.04.0 pyworld -webrtcvad \ No newline at end of file +webrtcvad +torchaudio>=0.7 From 3e9ca4b95da85a7eaaefeeb953995665fbd1f2b6 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Tue, 2 Nov 2021 17:31:14 +0100 Subject: [PATCH 043/220] make style --- TTS/bin/find_unique_phonemes.py | 14 ++-- TTS/bin/remove_silence_using_vad.py | 71 +++++++++++-------- TTS/speaker_encoder/models/resnet.py | 21 ++++-- TTS/speaker_encoder/utils/generic_utils.py | 6 +- TTS/trainer.py | 4 +- TTS/tts/datasets/dataset.py | 3 +- TTS/tts/datasets/formatters.py | 2 +- TTS/tts/layers/losses.py | 6 +- .../vits/stochastic_duration_predictor.py | 9 ++- TTS/tts/models/base_tts.py | 8 ++- TTS/tts/models/vits.py | 61 ++++++++++------ TTS/tts/utils/languages.py | 18 +++-- TTS/tts/utils/speakers.py | 5 +- TTS/tts/utils/text/cleaners.py | 3 +- .../tts_tests/test_vits_multilingual_train.py | 14 +++- 15 files changed, 158 insertions(+), 87 deletions(-) diff --git a/TTS/bin/find_unique_phonemes.py b/TTS/bin/find_unique_phonemes.py index 7ed79b36..bbc88fb6 100644 --- a/TTS/bin/find_unique_phonemes.py +++ b/TTS/bin/find_unique_phonemes.py @@ -1,14 +1,15 @@ """Find all the unique characters in a dataset""" import argparse +import multiprocessing from argparse import RawTextHelpFormatter +import numpy +from tqdm.contrib.concurrent import process_map + from TTS.config import load_config from TTS.tts.datasets import load_meta_data - -import numpy -import multiprocessing from TTS.tts.utils.text import text2phone -from tqdm.contrib.concurrent import process_map + def compute_phonemes(item): try: @@ -18,7 +19,8 @@ def compute_phonemes(item): except: return [] return list(set(ph)) - + + def main(): global c # pylint: disable=bad-option-value @@ -51,8 +53,6 @@ def main(): phones_force_lower = [c.lower() for c in phones] phones_force_lower = set(phones_force_lower) - - print(f" > Number of unique phonemes: {len(phones)}") print(f" > Unique 
phonemes: {''.join(sorted(phones))}") print(f" > Unique lower phonemes: {''.join(sorted(lower_phones))}") diff --git a/TTS/bin/remove_silence_using_vad.py b/TTS/bin/remove_silence_using_vad.py index c7541cc8..25ae26ef 100755 --- a/TTS/bin/remove_silence_using_vad.py +++ b/TTS/bin/remove_silence_using_vad.py @@ -1,26 +1,27 @@ # This code is adpated from: https://github.com/wiseman/py-webrtcvad/blob/master/example.py -import os -import tqdm -import glob import argparse -import pathlib - import collections import contextlib +import glob +import multiprocessing +import os +import pathlib import sys import wave +from itertools import chain + import numpy as np +import tqdm import webrtcvad from tqdm.contrib.concurrent import process_map -import multiprocessing -from itertools import chain + def read_wave(path): """Reads a .wav file. Takes the path, and returns (PCM audio data, sample rate). """ - with contextlib.closing(wave.open(path, 'rb')) as wf: + with contextlib.closing(wave.open(path, "rb")) as wf: num_channels = wf.getnchannels() assert num_channels == 1 sample_width = wf.getsampwidth() @@ -36,7 +37,7 @@ def write_wave(path, audio, sample_rate): Takes path, PCM audio data, and sample rate. """ - with contextlib.closing(wave.open(path, 'wb')) as wf: + with contextlib.closing(wave.open(path, "wb")) as wf: wf.setnchannels(1) wf.setsampwidth(2) wf.setframerate(sample_rate) @@ -45,6 +46,7 @@ def write_wave(path, audio, sample_rate): class Frame(object): """Represents a "frame" of audio data.""" + def __init__(self, bytes, timestamp, duration): self.bytes = bytes self.timestamp = timestamp @@ -64,13 +66,12 @@ def frame_generator(frame_duration_ms, audio, sample_rate): timestamp = 0.0 duration = (float(n) / sample_rate) / 2.0 while offset + n < len(audio): - yield Frame(audio[offset:offset + n], timestamp, duration) + yield Frame(audio[offset : offset + n], timestamp, duration) timestamp += duration offset += n -def vad_collector(sample_rate, frame_duration_ms, - padding_duration_ms, vad, frames): +def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, frames): """Filters out non-voiced audio frames. Given a webrtcvad.Vad and a source of audio frames, yields only @@ -133,25 +134,26 @@ def vad_collector(sample_rate, frame_duration_ms, # unvoiced, then enter NOTTRIGGERED and yield whatever # audio we've collected. if num_unvoiced > 0.9 * ring_buffer.maxlen: - #sys.stdout.write('-(%s)' % (frame.timestamp + frame.duration)) + # sys.stdout.write('-(%s)' % (frame.timestamp + frame.duration)) triggered = False - yield b''.join([f.bytes for f in voiced_frames]) + yield b"".join([f.bytes for f in voiced_frames]) ring_buffer.clear() voiced_frames = [] # If we have any leftover voiced audio when we run out of input, # yield it. 
if voiced_frames: - yield b''.join([f.bytes for f in voiced_frames]) + yield b"".join([f.bytes for f in voiced_frames]) + def remove_silence(filepath): filename = os.path.basename(filepath) - output_path = filepath.replace(os.path.join(args.input_dir, ''),os.path.join(args.output_dir, '')) - # ignore if the file exists + output_path = filepath.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, "")) + # ignore if the file exists if os.path.exists(output_path) and not args.force: return False # create all directory structure pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True) - padding_duration_ms = 300 # default 300 + padding_duration_ms = 300 # default 300 audio, sample_rate = read_wave(filepath) vad = webrtcvad.Vad(int(args.aggressiveness)) frames = frame_generator(30, audio, sample_rate) @@ -180,6 +182,7 @@ def remove_silence(filepath): # if fail to remove silence just write the file write_wave(output_path, audio, sample_rate) + def preprocess_audios(): files = sorted(glob.glob(os.path.join(args.input_dir, args.glob), recursive=True)) print("> Number of files: ", len(files)) @@ -193,21 +196,31 @@ def preprocess_audios(): else: print("> No files Found !") + if __name__ == "__main__": """ usage - python remove_silence.py -i=VCTK-Corpus-bk/ -o=../VCTK-Corpus-removed-silence -g=wav48/*/*.wav -a=2 + python remove_silence.py -i=VCTK-Corpus-bk/ -o=../VCTK-Corpus-removed-silence -g=wav48/*/*.wav -a=2 """ parser = argparse.ArgumentParser() - parser.add_argument('-i', '--input_dir', type=str, default='../VCTK-Corpus', - help='Dataset root dir') - parser.add_argument('-o', '--output_dir', type=str, default='../VCTK-Corpus-removed-silence', - help='Output Dataset dir') - parser.add_argument('-f', '--force', type=bool, default=True, - help='Force the replace of exists files') - parser.add_argument('-g', '--glob', type=str, default='**/*.wav', - help='path in glob format for acess wavs from input_dir. ex: wav48/*/*.wav') - parser.add_argument('-a', '--aggressiveness', type=int, default=2, - help='set its aggressiveness mode, which is an integer between 0 and 3. 0 is the least aggressive about filtering out non-speech, 3 is the most aggressive.') + parser.add_argument("-i", "--input_dir", type=str, default="../VCTK-Corpus", help="Dataset root dir") + parser.add_argument( + "-o", "--output_dir", type=str, default="../VCTK-Corpus-removed-silence", help="Output Dataset dir" + ) + parser.add_argument("-f", "--force", type=bool, default=True, help="Force the replace of exists files") + parser.add_argument( + "-g", + "--glob", + type=str, + default="**/*.wav", + help="path in glob format for acess wavs from input_dir. ex: wav48/*/*.wav", + ) + parser.add_argument( + "-a", + "--aggressiveness", + type=int, + default=2, + help="set its aggressiveness mode, which is an integer between 0 and 3. 
0 is the least aggressive about filtering out non-speech, 3 is the most aggressive.", + ) args = parser.parse_args() preprocess_audios() diff --git a/TTS/speaker_encoder/models/resnet.py b/TTS/speaker_encoder/models/resnet.py index beeb5ae1..42f041b4 100644 --- a/TTS/speaker_encoder/models/resnet.py +++ b/TTS/speaker_encoder/models/resnet.py @@ -5,20 +5,20 @@ import torch.nn as nn from TTS.utils.io import load_fsspec + class PreEmphasis(torch.nn.Module): def __init__(self, coefficient=0.97): super().__init__() self.coefficient = coefficient - self.register_buffer( - 'filter', torch.FloatTensor([-self.coefficient, 1.]).unsqueeze(0).unsqueeze(0) - ) + self.register_buffer("filter", torch.FloatTensor([-self.coefficient, 1.0]).unsqueeze(0).unsqueeze(0)) def forward(self, x): assert len(x.size()) == 2 - x = torch.nn.functional.pad(x.unsqueeze(1), (1, 0), 'reflect') + x = torch.nn.functional.pad(x.unsqueeze(1), (1, 0), "reflect") return torch.nn.functional.conv1d(x, self.filter).squeeze(1) + class SELayer(nn.Module): def __init__(self, channel, reduction=8): super(SELayer, self).__init__() @@ -110,8 +110,15 @@ class ResNetSpeakerEncoder(nn.Module): if self.use_torch_spec: self.torch_spec = torch.nn.Sequential( PreEmphasis(audio_config["preemphasis"]), - torchaudio.transforms.MelSpectrogram(sample_rate=audio_config["sample_rate"], n_fft=audio_config["fft_size"], win_length=audio_config["win_length"], hop_length=audio_config["hop_length"], window_fn=torch.hamming_window, n_mels=audio_config["num_mels"]) - ) + torchaudio.transforms.MelSpectrogram( + sample_rate=audio_config["sample_rate"], + n_fft=audio_config["fft_size"], + win_length=audio_config["win_length"], + hop_length=audio_config["hop_length"], + window_fn=torch.hamming_window, + n_mels=audio_config["num_mels"], + ), + ) else: self.torch_spec = None @@ -213,7 +220,7 @@ class ResNetSpeakerEncoder(nn.Module): """ # map to the waveform size if self.use_torch_spec: - num_frames = num_frames * self.audio_config['hop_length'] + num_frames = num_frames * self.audio_config["hop_length"] max_len = x.shape[1] diff --git a/TTS/speaker_encoder/utils/generic_utils.py b/TTS/speaker_encoder/utils/generic_utils.py index 3714e3c4..c926e215 100644 --- a/TTS/speaker_encoder/utils/generic_utils.py +++ b/TTS/speaker_encoder/utils/generic_utils.py @@ -179,10 +179,12 @@ def setup_model(c): c.model_params["num_lstm_layers"], ) elif c.model_params["model_name"].lower() == "resnet": - model = ResNetSpeakerEncoder(input_dim=c.model_params["input_dim"], proj_dim=c.model_params["proj_dim"], + model = ResNetSpeakerEncoder( + input_dim=c.model_params["input_dim"], + proj_dim=c.model_params["proj_dim"], log_input=c.model_params.get("log_input", False), use_torch_spec=c.model_params.get("use_torch_spec", False), - audio_config=c.audio + audio_config=c.audio, ) return model diff --git a/TTS/trainer.py b/TTS/trainer.py index 665f2589..c151e716 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -265,7 +265,9 @@ class Trainer: config = self.config.model_args if hasattr(self.config, "model_args") else self.config # save speakers json if config.use_language_embedding and self.model.language_manager.num_languages > 1: - self.model.language_manager.save_language_ids_to_file(os.path.join(self.output_path, "language_ids.json")) + self.model.language_manager.save_language_ids_to_file( + os.path.join(self.output_path, "language_ids.json") + ) if hasattr(self.config, "model_args"): self.config.model_args["num_languages"] = self.model.language_manager.num_languages else: diff --git 
a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index fc51c766..6d177743 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -542,6 +542,7 @@ class TTSDataset(Dataset): ) ) + class PitchExtractor: """Pitch Extractor for computing F0 from wav files. Args: @@ -645,4 +646,4 @@ class PitchExtractor: stats_path = os.path.join(cache_path, "pitch_stats.npy") stats = np.load(stats_path, allow_pickle=True).item() self.mean = stats["mean"].astype(np.float32) - self.std = stats["std"].astype(np.float32) \ No newline at end of file + self.std = stats["std"].astype(np.float32) diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index 651b3197..7e65f21a 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -304,7 +304,7 @@ def vctk(root_path, meta_files=None, wavs_path="wav48", ununsed_speakers=None): return items -def vctk_slim(root_path, meta_files=None, wavs_path="wav48", ununsed_speakers=None): # pylint: disable=unused-argument +def vctk_slim(root_path, meta_files=None, wavs_path="wav48", ununsed_speakers=None): # pylint: disable=unused-argument """homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz""" items = [] txt_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index cd2903b0..93a5bad2 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -602,7 +602,7 @@ class VitsGeneratorLoss(nn.Module): fine_tuning_mode=0, use_speaker_encoder_as_loss=False, gt_spk_emb=None, - syn_spk_emb=None + syn_spk_emb=None, ): """ Shapes: @@ -638,7 +638,9 @@ class VitsGeneratorLoss(nn.Module): loss = loss_kl + loss_feat + loss_mel + loss_gen + loss_duration if use_speaker_encoder_as_loss: - loss_se = - torch.nn.functional.cosine_similarity(gt_spk_emb, syn_spk_emb).mean() * self.spk_encoder_loss_alpha + loss_se = ( + -torch.nn.functional.cosine_similarity(gt_spk_emb, syn_spk_emb).mean() * self.spk_encoder_loss_alpha + ) loss += loss_se return_dict["loss_spk_encoder"] = loss_se diff --git a/TTS/tts/layers/vits/stochastic_duration_predictor.py b/TTS/tts/layers/vits/stochastic_duration_predictor.py index 8ec7c866..7c25156a 100644 --- a/TTS/tts/layers/vits/stochastic_duration_predictor.py +++ b/TTS/tts/layers/vits/stochastic_duration_predictor.py @@ -178,7 +178,14 @@ class StochasticDurationPredictor(nn.Module): """ def __init__( - self, in_channels: int, hidden_channels: int, kernel_size: int, dropout_p: float, num_flows=4, cond_channels=0, language_emb_dim=None + self, + in_channels: int, + hidden_channels: int, + kernel_size: int, + dropout_p: float, + num_flows=4, + cond_channels=0, + language_emb_dim=None, ): super().__init__() diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index df6c52f3..de00f6c7 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -246,7 +246,9 @@ class BaseTTS(BaseModel): # setup multi-speaker attributes if hasattr(self, "speaker_manager") and self.speaker_manager is not None: if hasattr(config, "model_args"): - speaker_id_mapping = self.speaker_manager.speaker_ids if config.model_args.use_speaker_embedding else None + speaker_id_mapping = ( + self.speaker_manager.speaker_ids if config.model_args.use_speaker_embedding else None + ) d_vector_mapping = self.speaker_manager.d_vectors if config.model_args.use_d_vector_file else None config.use_d_vector_file = config.model_args.use_d_vector_file else: @@ -262,7 +264,9 @@ class BaseTTS(BaseModel): 
custom_symbols = self.make_symbols(self.config) if hasattr(self, "language_manager"): - language_id_mapping = self.language_manager.language_id_mapping if self.args.use_language_embedding else None + language_id_mapping = ( + self.language_manager.language_id_mapping if self.args.use_language_embedding else None + ) else: language_id_mapping = None diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index bc503cb5..c185150b 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -229,7 +229,6 @@ class VitsArgs(Coqpit): freeze_waveform_decoder: bool = False - class Vits(BaseTTS): """VITS TTS model @@ -306,7 +305,7 @@ class Vits(BaseTTS): args.num_layers_text_encoder, args.kernel_size_text_encoder, args.dropout_p_text_encoder, - language_emb_dim=self.embedded_language_dim + language_emb_dim=self.embedded_language_dim, ) self.posterior_encoder = PosteriorEncoder( @@ -389,16 +388,26 @@ class Vits(BaseTTS): # TODO: make this a function if config.use_speaker_encoder_as_loss: if not config.speaker_encoder_model_path or not config.speaker_encoder_config_path: - raise RuntimeError(" [!] To use the speaker encoder loss you need to specify speaker_encoder_model_path and speaker_encoder_config_path !!") - self.speaker_manager.init_speaker_encoder(config.speaker_encoder_model_path, config.speaker_encoder_config_path) + raise RuntimeError( + " [!] To use the speaker encoder loss you need to specify speaker_encoder_model_path and speaker_encoder_config_path !!" + ) + self.speaker_manager.init_speaker_encoder( + config.speaker_encoder_model_path, config.speaker_encoder_config_path + ) self.speaker_encoder = self.speaker_manager.speaker_encoder.train() for param in self.speaker_encoder.parameters(): param.requires_grad = False print(" > External Speaker Encoder Loaded !!") - if hasattr(self.speaker_encoder, "audio_config") and self.audio_config["sample_rate"] != self.speaker_encoder.audio_config["sample_rate"]: - self.audio_transform = torchaudio.transforms.Resample(orig_freq=self.audio_config["sample_rate"], new_freq=self.speaker_encoder.audio_config["sample_rate"]) + if ( + hasattr(self.speaker_encoder, "audio_config") + and self.audio_config["sample_rate"] != self.speaker_encoder.audio_config["sample_rate"] + ): + self.audio_transform = torchaudio.transforms.Resample( + orig_freq=self.audio_config["sample_rate"], + new_freq=self.speaker_encoder.audio_config["sample_rate"], + ) else: self.audio_transform = None else: @@ -529,7 +538,13 @@ class Vits(BaseTTS): if hasattr(self, "language_manager") and config.use_language_embedding and language_name is not None: language_id = self.language_manager.language_id_mapping[language_name] - return {"text": text, "speaker_id": speaker_id, "style_wav": style_wav, "d_vector": d_vector, "language_id": language_id} + return { + "text": text, + "speaker_id": speaker_id, + "style_wav": style_wav, + "d_vector": d_vector, + "language_id": language_id, + } def forward( self, @@ -567,7 +582,7 @@ class Vits(BaseTTS): g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] # language embedding - lang_emb=None + lang_emb = None if self.args.use_language_embedding and lid is not None: lang_emb = self.emb_l(lid).unsqueeze(-1) @@ -621,9 +636,9 @@ class Vits(BaseTTS): o = self.waveform_decoder(z_slice, g=g) wav_seg = segment( - waveform.transpose(1, 2), - slice_ids * self.config.audio.hop_length, - self.args.spec_segment_size * self.config.audio.hop_length, + waveform.transpose(1, 2), + slice_ids * self.config.audio.hop_length, + self.args.spec_segment_size * 
self.config.audio.hop_length, ) if self.args.use_speaker_encoder_as_loss and self.speaker_encoder is not None: @@ -653,7 +668,7 @@ class Vits(BaseTTS): "logs_q": logs_q, "waveform_seg": wav_seg, "gt_spk_emb": gt_spk_emb, - "syn_spk_emb": syn_spk_emb + "syn_spk_emb": syn_spk_emb, } ) return outputs @@ -695,7 +710,7 @@ class Vits(BaseTTS): g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] # language embedding - lang_emb=None + lang_emb = None if self.args.use_language_embedding and lid is not None: lang_emb = self.emb_l(lid).unsqueeze(-1) @@ -737,9 +752,9 @@ class Vits(BaseTTS): o = self.waveform_decoder(z_slice, g=g) wav_seg = segment( - waveform.transpose(1, 2), - slice_ids * self.config.audio.hop_length, - self.args.spec_segment_size * self.config.audio.hop_length, + waveform.transpose(1, 2), + slice_ids * self.config.audio.hop_length, + self.args.spec_segment_size * self.config.audio.hop_length, ) if self.args.use_speaker_encoder_as_loss and self.speaker_encoder is not None: @@ -770,7 +785,7 @@ class Vits(BaseTTS): "logs_q": logs_q, "waveform_seg": wav_seg, "gt_spk_emb": gt_spk_emb, - "syn_spk_emb": syn_spk_emb + "syn_spk_emb": syn_spk_emb, } ) return outputs @@ -790,14 +805,16 @@ class Vits(BaseTTS): g = self.emb_g(sid).unsqueeze(-1) # language embedding - lang_emb=None + lang_emb = None if self.args.use_language_embedding and lid is not None: lang_emb = self.emb_l(lid).unsqueeze(-1) x, m_p, logs_p, x_mask = self.text_encoder(x, x_lengths, lang_emb=lang_emb) if self.args.use_sdp: - logw = self.duration_predictor(x, x_mask, g=g, reverse=True, noise_scale=self.inference_noise_scale_dp, lang_emb=lang_emb) + logw = self.duration_predictor( + x, x_mask, g=g, reverse=True, noise_scale=self.inference_noise_scale_dp, lang_emb=lang_emb + ) else: logw = self.duration_predictor(x, x_mask, g=g, lang_emb=lang_emb) @@ -866,7 +883,7 @@ class Vits(BaseTTS): for param in self.text_encoder.parameters(): param.requires_grad = False - if hasattr(self, 'emb_l'): + if hasattr(self, "emb_l"): for param in self.emb_l.parameters(): param.requires_grad = False @@ -932,7 +949,7 @@ class Vits(BaseTTS): with autocast(enabled=False): # use float32 for the criterion loss_dict = criterion[optimizer_idx]( waveform_hat=outputs["model_outputs"].float(), - waveform= outputs["waveform_seg"].float(), + waveform=outputs["waveform_seg"].float(), z_p=outputs["z_p"].float(), logs_q=outputs["logs_q"].float(), m_p=outputs["m_p"].float(), @@ -945,7 +962,7 @@ class Vits(BaseTTS): fine_tuning_mode=self.args.fine_tuning_mode, use_speaker_encoder_as_loss=self.args.use_speaker_encoder_as_loss, gt_spk_emb=outputs["gt_spk_emb"], - syn_spk_emb=outputs["syn_spk_emb"] + syn_spk_emb=outputs["syn_spk_emb"], ) # ignore duration loss if fine tuning mode is on if not self.args.fine_tuning_mode: diff --git a/TTS/tts/utils/languages.py b/TTS/tts/utils/languages.py index 94be914c..5bacc259 100644 --- a/TTS/tts/utils/languages.py +++ b/TTS/tts/utils/languages.py @@ -1,13 +1,14 @@ -import os import json -import torch +import os +from typing import Dict, List, Tuple + import fsspec import numpy as np -from typing import Dict, Tuple, List +import torch from coqpit import Coqpit - from torch.utils.data.sampler import WeightedRandomSampler + class LanguageManager: """Manage the languages for multi-lingual 🐸TTS models. Load a datafile and parse the information in a way that can be queried by language. 
@@ -20,7 +21,9 @@ class LanguageManager: >>> manager = LanguageManager(language_id_file_path=language_id_file_path) >>> language_id_mapper = manager.language_ids """ + language_id_mapping: Dict = {} + def __init__( self, language_id_file_path: str = "", @@ -85,6 +88,7 @@ class LanguageManager: """ self._save_json(file_path, self.language_id_mapping) + def _set_file_path(path): """Find the language_ids.json under the given path or the above it. Intended to band aid the different paths returned in restored and continued training.""" @@ -97,6 +101,7 @@ def _set_file_path(path): return path_continue return None + def get_language_manager(c: Coqpit, data: List = None, restore_path: str = None) -> LanguageManager: """Initiate a `LanguageManager` instance by the provided config. @@ -118,7 +123,7 @@ def get_language_manager(c: Coqpit, data: List = None, restore_path: str = None) # restoring language manager from a previous run. if language_file: language_manager.set_language_ids_from_file(language_file) - if language_manager.num_languages > 0: + if language_manager.num_languages > 0: print( " > Language manager is loaded with {} languages: {}".format( language_manager.num_languages, ", ".join(language_manager.language_names) @@ -126,11 +131,12 @@ def get_language_manager(c: Coqpit, data: List = None, restore_path: str = None) ) return language_manager + def get_language_weighted_sampler(items: list): language_names = np.array([item[3] for item in items]) unique_language_names = np.unique(language_names).tolist() language_ids = [unique_language_names.index(l) for l in language_names] language_count = np.array([len(np.where(language_names == l)[0]) for l in unique_language_names]) - weight_language = 1. / language_count + weight_language = 1.0 / language_count dataset_samples_weight = torch.from_numpy(np.array([weight_language[l] for l in language_ids])).double() return WeightedRandomSampler(dataset_samples_weight, len(dataset_samples_weight)) diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 8ccbdafc..d6381a70 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -432,11 +432,12 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, speaker_manager.save_speaker_ids_to_file(out_file_path) return speaker_manager + def get_speaker_weighted_sampler(items: list): speaker_names = np.array([item[2] for item in items]) unique_speaker_names = np.unique(speaker_names).tolist() speaker_ids = [unique_speaker_names.index(l) for l in speaker_names] speaker_count = np.array([len(np.where(speaker_names == l)[0]) for l in unique_speaker_names]) - weight_speaker = 1. 
/ speaker_count + weight_speaker = 1.0 / speaker_count dataset_samples_weight = torch.from_numpy(np.array([weight_speaker[l] for l in speaker_ids])).double() - return WeightedRandomSampler(dataset_samples_weight, len(dataset_samples_weight)) \ No newline at end of file + return WeightedRandomSampler(dataset_samples_weight, len(dataset_samples_weight)) diff --git a/TTS/tts/utils/text/cleaners.py b/TTS/tts/utils/text/cleaners.py index 826919c2..f3ffa478 100644 --- a/TTS/tts/utils/text/cleaners.py +++ b/TTS/tts/utils/text/cleaners.py @@ -136,8 +136,9 @@ def phoneme_cleaners(text): text = collapse_whitespace(text) return text + def multilingual_cleaners(text): - '''Pipeline for multilingual text''' + """Pipeline for multilingual text""" text = lowercase(text) text = replace_symbols(text, lang=None) text = remove_aux_symbols(text) diff --git a/tests/tts_tests/test_vits_multilingual_train.py b/tests/tts_tests/test_vits_multilingual_train.py index 664de57e..04b42e61 100644 --- a/tests/tts_tests/test_vits_multilingual_train.py +++ b/tests/tts_tests/test_vits_multilingual_train.py @@ -3,19 +3,27 @@ import os import shutil from tests import get_device_id, get_tests_output_path, run_cli -from TTS.tts.configs.vits_config import VitsConfig from TTS.config.shared_configs import BaseDatasetConfig +from TTS.tts.configs.vits_config import VitsConfig config_path = os.path.join(get_tests_output_path(), "test_model_config.json") output_path = os.path.join(get_tests_output_path(), "train_outputs") dataset_config1 = BaseDatasetConfig( - name="ljspeech", meta_file_train="metadata.csv", meta_file_val="metadata.csv", path="tests/data/ljspeech", language="en" + name="ljspeech", + meta_file_train="metadata.csv", + meta_file_val="metadata.csv", + path="tests/data/ljspeech", + language="en", ) dataset_config2 = BaseDatasetConfig( - name="ljspeech", meta_file_train="metadata.csv", meta_file_val="metadata.csv", path="tests/data/ljspeech", language="en2" + name="ljspeech", + meta_file_train="metadata.csv", + meta_file_val="metadata.csv", + path="tests/data/ljspeech", + language="en2", ) config = VitsConfig( From 215a74b32ea246cf400b759f6e79b9f8196082f2 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Tue, 2 Nov 2021 19:10:18 +0100 Subject: [PATCH 044/220] fix linter --- TTS/bin/find_unique_phonemes.py | 1 - TTS/bin/remove_silence_using_vad.py | 27 ++++++++++----------------- TTS/tts/datasets/formatters.py | 2 +- TTS/tts/models/vits.py | 3 +-- TTS/tts/utils/speakers.py | 1 + notebooks/dataset_analysis/analyze.py | 2 +- 6 files changed, 14 insertions(+), 22 deletions(-) diff --git a/TTS/bin/find_unique_phonemes.py b/TTS/bin/find_unique_phonemes.py index bbc88fb6..ffad6891 100644 --- a/TTS/bin/find_unique_phonemes.py +++ b/TTS/bin/find_unique_phonemes.py @@ -3,7 +3,6 @@ import argparse import multiprocessing from argparse import RawTextHelpFormatter -import numpy from tqdm.contrib.concurrent import process_map from TTS.config import load_config diff --git a/TTS/bin/remove_silence_using_vad.py b/TTS/bin/remove_silence_using_vad.py index 25ae26ef..8951662b 100755 --- a/TTS/bin/remove_silence_using_vad.py +++ b/TTS/bin/remove_silence_using_vad.py @@ -6,12 +6,7 @@ import glob import multiprocessing import os import pathlib -import sys import wave -from itertools import chain - -import numpy as np -import tqdm import webrtcvad from tqdm.contrib.concurrent import process_map @@ -47,8 +42,8 @@ def write_wave(path, audio, sample_rate): class Frame(object): """Represents a "frame" of audio data.""" - def __init__(self, bytes, 
timestamp, duration): - self.bytes = bytes + def __init__(self, _bytes, timestamp, duration): + self.bytes =_bytes self.timestamp = timestamp self.duration = duration @@ -121,7 +116,7 @@ def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, fram # We want to yield all the audio we see from now until # we are NOTTRIGGERED, but we have to start with the # audio that's already in the ring buffer. - for f, s in ring_buffer: + for f, _ in ring_buffer: voiced_frames.append(f) ring_buffer.clear() else: @@ -146,11 +141,10 @@ def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, fram def remove_silence(filepath): - filename = os.path.basename(filepath) output_path = filepath.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, "")) # ignore if the file exists if os.path.exists(output_path) and not args.force: - return False + return # create all directory structure pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True) padding_duration_ms = 300 # default 300 @@ -166,7 +160,7 @@ def remove_silence(filepath): if num_segments != 0: for i, segment in reversed(list(enumerate(segments))): if i >= 1: - if flag == False: + if not flag: concat_segment = segment flag = True else: @@ -176,11 +170,12 @@ def remove_silence(filepath): segment = segment + concat_segment write_wave(output_path, segment, sample_rate) print(output_path) - return True + return else: print("> Just Copying the file to:", output_path) # if fail to remove silence just write the file write_wave(output_path, audio, sample_rate) + return def preprocess_audios(): @@ -198,11 +193,9 @@ def preprocess_audios(): if __name__ == "__main__": - """ - usage - python remove_silence.py -i=VCTK-Corpus-bk/ -o=../VCTK-Corpus-removed-silence -g=wav48/*/*.wav -a=2 - """ - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser( + description="python remove_silence.py -i=VCTK-Corpus-bk/ -o=../VCTK-Corpus-removed-silence -g=wav48/*/*.wav -a=2" + ) parser.add_argument("-i", "--input_dir", type=str, default="../VCTK-Corpus", help="Dataset root dir") parser.add_argument( "-o", "--output_dir", type=str, default="../VCTK-Corpus-removed-silence", help="Output Dataset dir" diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index 7e65f21a..49a1ced4 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -59,7 +59,7 @@ def mozilla_de(root_path, meta_file, **kwargs): # pylint: disable=unused-argume return items -def mailabs(root_path, meta_files=None): +def mailabs(root_path, meta_files=None, ununsed_speakers=None): """Normalizes M-AI-Labs meta data files to TTS format Args: diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index c185150b..94d5bfc9 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -1,5 +1,4 @@ import math -import random from dataclasses import dataclass, field from itertools import chain from typing import Dict, List, Tuple @@ -747,7 +746,7 @@ class Vits(BaseTTS): # inverse decoder and get the output z_f_pred = self.flow(z_ft, y_mask, g=g, reverse=True) - z_slice, slice_ids = rand_segment(z_f_pred, y_lengths, self.spec_segment_size) + z_slice, slice_ids = rand_segments(z_f_pred, y_lengths, self.spec_segment_size) o = self.waveform_decoder(z_slice, g=g) diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index d6381a70..8c248658 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -7,6 +7,7 @@ import fsspec import numpy as np import torch from 
coqpit import Coqpit +from torch.utils.data.sampler import WeightedRandomSampler from TTS.config import load_config from TTS.speaker_encoder.utils.generic_utils import setup_model diff --git a/notebooks/dataset_analysis/analyze.py b/notebooks/dataset_analysis/analyze.py index 9ba42fb9..4855886e 100644 --- a/notebooks/dataset_analysis/analyze.py +++ b/notebooks/dataset_analysis/analyze.py @@ -180,7 +180,7 @@ def plot_phonemes(train_path, cmu_dict_path, save_path): plt.figure() plt.rcParams["figure.figsize"] = (50, 20) - barplot = sns.barplot(x, y) + barplot = sns.barplot(x=x, y=y) if save_path: fig = barplot.get_figure() fig.savefig(os.path.join(save_path, "phoneme_dist")) From 686c7381e2772ff3163643a4d8ede3626161679b Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Thu, 4 Nov 2021 16:36:11 +0100 Subject: [PATCH 045/220] fix phonemes per language --- TTS/tts/datasets/dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index 6d177743..513f2b12 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -273,6 +273,7 @@ class TTSDataset(Dataset): item = args[0] func_args = args[1] text, wav_file, *_ = item + func_args[3] = item[4] phonemes = TTSDataset._load_or_generate_phoneme_sequence(wav_file, text, *func_args) return phonemes From b1df118b8126cc6f7de3f3a3caf4ae65b26e80cb Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Thu, 4 Nov 2021 16:36:40 +0100 Subject: [PATCH 046/220] fix imports for load_meta_data --- TTS/bin/find_unique_phonemes.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/TTS/bin/find_unique_phonemes.py b/TTS/bin/find_unique_phonemes.py index ffad6891..ff7eac46 100644 --- a/TTS/bin/find_unique_phonemes.py +++ b/TTS/bin/find_unique_phonemes.py @@ -6,7 +6,7 @@ from argparse import RawTextHelpFormatter from tqdm.contrib.concurrent import process_map from TTS.config import load_config -from TTS.tts.datasets import load_meta_data +from TTS.tts.datasets import load_tts_samples from TTS.tts.utils.text import text2phone @@ -38,10 +38,9 @@ def main(): c = load_config(args.config_path) # load all datasets - train_items, eval_items = load_meta_data(c.datasets, eval_split=True) + train_items, eval_items = load_tts_samples(c.datasets, eval_split=True) items = train_items + eval_items print("Num items:", len(items)) - # items = items[:1000] phonemes = process_map(compute_phonemes, items, max_workers=multiprocessing.cpu_count(), chunksize=15) phones = [] From 61251bd86caef026d742e31403aa68b6bfce57cf Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Sat, 6 Nov 2021 00:27:58 +0100 Subject: [PATCH 047/220] Fix phonemes --- TTS/bin/find_unique_phonemes.py | 2 +- TTS/tts/datasets/dataset.py | 2 +- TTS/tts/models/vits.py | 2 ++ TTS/tts/utils/synthesis.py | 7 ++++--- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/TTS/bin/find_unique_phonemes.py b/TTS/bin/find_unique_phonemes.py index ff7eac46..a869df27 100644 --- a/TTS/bin/find_unique_phonemes.py +++ b/TTS/bin/find_unique_phonemes.py @@ -7,7 +7,7 @@ from tqdm.contrib.concurrent import process_map from TTS.config import load_config from TTS.tts.datasets import load_tts_samples -from TTS.tts.utils.text import text2phone +from TTS.tts.utils.text import text2phone, phoneme_to_sequence def compute_phonemes(item): diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index 513f2b12..38db31c3 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -273,7 +273,7 @@ class TTSDataset(Dataset): 
item = args[0] func_args = args[1] text, wav_file, *_ = item - func_args[3] = item[4] + func_args[3] = item[3] phonemes = TTSDataset._load_or_generate_phoneme_sequence(wav_file, text, *func_args) return phonemes diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 94d5bfc9..09537905 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -543,6 +543,7 @@ class Vits(BaseTTS): "style_wav": style_wav, "d_vector": d_vector, "language_id": language_id, + "language_name": language_name, } def forward( @@ -1061,6 +1062,7 @@ class Vits(BaseTTS): d_vector=aux_inputs["d_vector"], style_wav=aux_inputs["style_wav"], language_id=aux_inputs["language_id"], + language_name=aux_inputs["language_name"], enable_eos_bos_chars=self.config.enable_eos_bos_chars, use_griffin_lim=True, do_trim_silence=False, diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 6d998492..102914c5 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -15,7 +15,7 @@ if "tensorflow" in installed or "tensorflow-gpu" in installed: import tensorflow as tf -def text_to_seq(text, CONFIG, custom_symbols=None): +def text_to_seq(text, CONFIG, custom_symbols=None, language=None): text_cleaner = [CONFIG.text_cleaner] # text ot phonemes to sequence vector if CONFIG.use_phonemes: @@ -23,7 +23,7 @@ def text_to_seq(text, CONFIG, custom_symbols=None): phoneme_to_sequence( text, text_cleaner, - CONFIG.phoneme_language, + language if language else CONFIG.phoneme_language, CONFIG.enable_eos_bos_chars, tp=CONFIG.characters, add_blank=CONFIG.add_blank, @@ -212,6 +212,7 @@ def synthesis( do_trim_silence=False, d_vector=None, language_id=None, + language_name=None, backend="torch", ): """Synthesize voice for the given text using Griffin-Lim vocoder or just compute output features to be passed to @@ -262,7 +263,7 @@ def synthesis( if hasattr(model, "make_symbols"): custom_symbols = model.make_symbols(CONFIG) # preprocess the given text - text_inputs = text_to_seq(text, CONFIG, custom_symbols=custom_symbols) + text_inputs = text_to_seq(text, CONFIG, custom_symbols=custom_symbols, language=language_name) # pass tensors to backend if backend == "torch": if speaker_id is not None: From 5f40e960102df2eb170776b99f7583e4e66399d7 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Tue, 9 Nov 2021 12:20:11 +0100 Subject: [PATCH 048/220] Fix continue path --- TTS/bin/train_tts.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index e28e9dec..2d7bd68f 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -9,6 +9,7 @@ from TTS.utils.audio import AudioProcessor def main(): + #os.environ["CUDA_VISIBLE_DEVICES"]="" """Run `tts` model training directly by a `config.json` file.""" # init trainer args train_args = TrainingArgs() @@ -64,7 +65,7 @@ def main(): train_samples=train_samples, eval_samples=eval_samples, training_assets={"audio_processor": ap}, - parse_command_line_args=False, + parse_command_line_args=True, ) trainer.fit() From e04577575e754c0e8c77a5b9ff2585eaed1d5cdb Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Tue, 9 Nov 2021 12:20:43 +0100 Subject: [PATCH 049/220] Fix use_speaker_embedding logic --- TTS/tts/models/base_tts.py | 2 +- TTS/tts/models/vits.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index de00f6c7..707fc9c3 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -102,7 +102,7 @@ class 
BaseTTS(BaseModel): config.d_vector_dim if "d_vector_dim" in config and config.d_vector_dim is not None else 512 ) # init speaker embedding layer - if config.use_speaker_embedding and not config.use_d_vector_file: + if config.use_speaker_embedding: print(" > Init speaker_embedding layer.") self.speaker_embedding = nn.Embedding(self.num_speakers, self.embedded_speaker_dim) self.speaker_embedding.weight.data.normal_(0, 0.3) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 09537905..4d47cde1 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -578,7 +578,7 @@ class Vits(BaseTTS): outputs = {} sid, g, lid = self._set_cond_input(aux_input) # speaker embedding - if self.args.use_speaker_embedding and sid is not None and not self.use_d_vector: + if self.args.use_speaker_embedding and sid is not None: g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] # language embedding @@ -801,7 +801,7 @@ class Vits(BaseTTS): x_lengths = torch.tensor(x.shape[1:2]).to(x.device) # speaker embedding - if self.args.use_speaker_embedding and sid is not None and not self.use_d_vector: + if self.args.use_speaker_embedding and sid is not None: g = self.emb_g(sid).unsqueeze(-1) # language embedding From 868cf6424f2fd9aa5d81a5d9cf5bf260aaaa5028 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Mon, 15 Nov 2021 22:09:59 +0100 Subject: [PATCH 050/220] Fix small issues --- TTS/bin/train_tts.py | 1 - TTS/tts/configs/vits_config.py | 2 +- TTS/tts/models/vits.py | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index 2d7bd68f..1a9faf02 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -9,7 +9,6 @@ from TTS.utils.audio import AudioProcessor def main(): - #os.environ["CUDA_VISIBLE_DEVICES"]="" """Run `tts` model training directly by a `config.json` file.""" # init trainer args train_args = TrainingArgs() diff --git a/TTS/tts/configs/vits_config.py b/TTS/tts/configs/vits_config.py index a6f2210d..eeb74bbe 100644 --- a/TTS/tts/configs/vits_config.py +++ b/TTS/tts/configs/vits_config.py @@ -150,7 +150,7 @@ class VitsConfig(BaseTTSConfig): # use d-vectors use_d_vector_file: bool = False - d_vector_file: str = False + d_vector_file: str = None d_vector_dim: int = None def __post_init__(self): diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 4d47cde1..9f895fc1 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -211,7 +211,6 @@ class VitsArgs(Coqpit): d_vector_file: str = None speaker_embedding_channels: int = 256 use_d_vector_file: bool = False - d_vector_file: str = None d_vector_dim: int = 0 detach_dp_input: bool = True use_language_embedding: bool = False From 390096fe0f0659d4caec15d0a9c042259ad9612c Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Thu, 18 Nov 2021 00:17:42 +0100 Subject: [PATCH 051/220] fix d-vector --- TTS/tts/datasets/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index 38db31c3..000393ea 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -415,7 +415,7 @@ class TTSDataset(Dataset): language_ids = None # get pre-computed d-vectors if self.d_vector_mapping is not None: - wav_files_names = [batch["wav_file_name"][idx] for idx in ids_sorted_decreasing] + wav_files_names = list(batch["wav_file_name"]) d_vectors = [self.d_vector_mapping[w]["embedding"] for w in wav_files_names] else: d_vectors = None From 89019d49a29e3f5f8c72ff474f6a9f6db6424ce8 Mon Sep 17 00:00:00 2001 
From: Edresson Date: Sun, 21 Nov 2021 12:20:35 -0300 Subject: [PATCH 052/220] Create a module for the VAD script --- TTS/bin/remove_silence_using_vad.py | 164 +++------------------------- TTS/utils/vad.py | 142 ++++++++++++++++++++++++ 2 files changed, 160 insertions(+), 146 deletions(-) create mode 100644 TTS/utils/vad.py diff --git a/TTS/bin/remove_silence_using_vad.py b/TTS/bin/remove_silence_using_vad.py index 8951662b..a32f0f45 100755 --- a/TTS/bin/remove_silence_using_vad.py +++ b/TTS/bin/remove_silence_using_vad.py @@ -1,162 +1,31 @@ -# This code is adpated from: https://github.com/wiseman/py-webrtcvad/blob/master/example.py -import argparse -import collections -import contextlib -import glob -import multiprocessing import os +import glob import pathlib -import wave -import webrtcvad +import argparse +import multiprocessing + from tqdm.contrib.concurrent import process_map - -def read_wave(path): - """Reads a .wav file. - - Takes the path, and returns (PCM audio data, sample rate). - """ - with contextlib.closing(wave.open(path, "rb")) as wf: - num_channels = wf.getnchannels() - assert num_channels == 1 - sample_width = wf.getsampwidth() - assert sample_width == 2 - sample_rate = wf.getframerate() - assert sample_rate in (8000, 16000, 32000, 48000) - pcm_data = wf.readframes(wf.getnframes()) - return pcm_data, sample_rate - - -def write_wave(path, audio, sample_rate): - """Writes a .wav file. - - Takes path, PCM audio data, and sample rate. - """ - with contextlib.closing(wave.open(path, "wb")) as wf: - wf.setnchannels(1) - wf.setsampwidth(2) - wf.setframerate(sample_rate) - wf.writeframes(audio) - - -class Frame(object): - """Represents a "frame" of audio data.""" - - def __init__(self, _bytes, timestamp, duration): - self.bytes =_bytes - self.timestamp = timestamp - self.duration = duration - - -def frame_generator(frame_duration_ms, audio, sample_rate): - """Generates audio frames from PCM audio data. - - Takes the desired frame duration in milliseconds, the PCM data, and - the sample rate. - - Yields Frames of the requested duration. - """ - n = int(sample_rate * (frame_duration_ms / 1000.0) * 2) - offset = 0 - timestamp = 0.0 - duration = (float(n) / sample_rate) / 2.0 - while offset + n < len(audio): - yield Frame(audio[offset : offset + n], timestamp, duration) - timestamp += duration - offset += n - - -def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, frames): - """Filters out non-voiced audio frames. - - Given a webrtcvad.Vad and a source of audio frames, yields only - the voiced audio. - - Uses a padded, sliding window algorithm over the audio frames. - When more than 90% of the frames in the window are voiced (as - reported by the VAD), the collector triggers and begins yielding - audio frames. Then the collector waits until 90% of the frames in - the window are unvoiced to detrigger. - - The window is padded at the front and back to provide a small - amount of silence or the beginnings/endings of speech around the - voiced frames. - - Arguments: - - sample_rate - The audio sample rate, in Hz. - frame_duration_ms - The frame duration in milliseconds. - padding_duration_ms - The amount to pad the window, in milliseconds. - vad - An instance of webrtcvad.Vad. - frames - a source of audio frames (sequence or generator). - - Returns: A generator that yields PCM audio data. - """ - num_padding_frames = int(padding_duration_ms / frame_duration_ms) - # We use a deque for our sliding window/ring buffer. 
- ring_buffer = collections.deque(maxlen=num_padding_frames) - # We have two states: TRIGGERED and NOTTRIGGERED. We start in the - # NOTTRIGGERED state. - triggered = False - - voiced_frames = [] - for frame in frames: - is_speech = vad.is_speech(frame.bytes, sample_rate) - - # sys.stdout.write('1' if is_speech else '0') - if not triggered: - ring_buffer.append((frame, is_speech)) - num_voiced = len([f for f, speech in ring_buffer if speech]) - # If we're NOTTRIGGERED and more than 90% of the frames in - # the ring buffer are voiced frames, then enter the - # TRIGGERED state. - if num_voiced > 0.9 * ring_buffer.maxlen: - triggered = True - # sys.stdout.write('+(%s)' % (ring_buffer[0][0].timestamp,)) - # We want to yield all the audio we see from now until - # we are NOTTRIGGERED, but we have to start with the - # audio that's already in the ring buffer. - for f, _ in ring_buffer: - voiced_frames.append(f) - ring_buffer.clear() - else: - # We're in the TRIGGERED state, so collect the audio data - # and add it to the ring buffer. - voiced_frames.append(frame) - ring_buffer.append((frame, is_speech)) - num_unvoiced = len([f for f, speech in ring_buffer if not speech]) - # If more than 90% of the frames in the ring buffer are - # unvoiced, then enter NOTTRIGGERED and yield whatever - # audio we've collected. - if num_unvoiced > 0.9 * ring_buffer.maxlen: - # sys.stdout.write('-(%s)' % (frame.timestamp + frame.duration)) - triggered = False - yield b"".join([f.bytes for f in voiced_frames]) - ring_buffer.clear() - voiced_frames = [] - # If we have any leftover voiced audio when we run out of input, - # yield it. - if voiced_frames: - yield b"".join([f.bytes for f in voiced_frames]) - +from TTS.utils.vad import read_wave, write_wave, get_vad_speech_segments def remove_silence(filepath): output_path = filepath.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, "")) # ignore if the file exists if os.path.exists(output_path) and not args.force: return + # create all directory structure pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True) - padding_duration_ms = 300 # default 300 + # load wave audio, sample_rate = read_wave(filepath) - vad = webrtcvad.Vad(int(args.aggressiveness)) - frames = frame_generator(30, audio, sample_rate) - frames = list(frames) - segments = vad_collector(sample_rate, 30, padding_duration_ms, vad, frames) - flag = False + + # get speech segments + segments = get_vad_speech_segments(audio, sample_rate, aggressiveness=args.aggressiveness) + segments = list(segments) num_segments = len(segments) - + flag = False + # create the output wave if num_segments != 0: for i, segment in reversed(list(enumerate(segments))): if i >= 1: @@ -168,8 +37,8 @@ def remove_silence(filepath): else: if flag: segment = segment + concat_segment + # print("Saving: ", output_path) write_wave(output_path, segment, sample_rate) - print(output_path) return else: print("> Just Copying the file to:", output_path) @@ -200,7 +69,10 @@ if __name__ == "__main__": parser.add_argument( "-o", "--output_dir", type=str, default="../VCTK-Corpus-removed-silence", help="Output Dataset dir" ) - parser.add_argument("-f", "--force", type=bool, default=True, help="Force the replace of exists files") + parser.add_argument("-f", "--force", + default=False, + action='store_true', + help='Force the replace of exists files') parser.add_argument( "-g", "--glob", diff --git a/TTS/utils/vad.py b/TTS/utils/vad.py new file mode 100644 index 00000000..4e61f490 --- /dev/null +++ b/TTS/utils/vad.py 
@@ -0,0 +1,142 @@ +# This code is adpated from: https://github.com/wiseman/py-webrtcvad/blob/master/example.py +import wave +import webrtcvad +import contextlib +import collections + + +def read_wave(path): + """Reads a .wav file. + + Takes the path, and returns (PCM audio data, sample rate). + """ + with contextlib.closing(wave.open(path, "rb")) as wf: + num_channels = wf.getnchannels() + assert num_channels == 1 + sample_width = wf.getsampwidth() + assert sample_width == 2 + sample_rate = wf.getframerate() + assert sample_rate in (8000, 16000, 32000, 48000) + pcm_data = wf.readframes(wf.getnframes()) + return pcm_data, sample_rate + + +def write_wave(path, audio, sample_rate): + """Writes a .wav file. + + Takes path, PCM audio data, and sample rate. + """ + with contextlib.closing(wave.open(path, "wb")) as wf: + wf.setnchannels(1) + wf.setsampwidth(2) + wf.setframerate(sample_rate) + wf.writeframes(audio) + + +class Frame(object): + """Represents a "frame" of audio data.""" + + def __init__(self, _bytes, timestamp, duration): + self.bytes =_bytes + self.timestamp = timestamp + self.duration = duration + + +def frame_generator(frame_duration_ms, audio, sample_rate): + """Generates audio frames from PCM audio data. + + Takes the desired frame duration in milliseconds, the PCM data, and + the sample rate. + + Yields Frames of the requested duration. + """ + n = int(sample_rate * (frame_duration_ms / 1000.0) * 2) + offset = 0 + timestamp = 0.0 + duration = (float(n) / sample_rate) / 2.0 + while offset + n < len(audio): + yield Frame(audio[offset : offset + n], timestamp, duration) + timestamp += duration + offset += n + + +def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, frames): + """Filters out non-voiced audio frames. + + Given a webrtcvad.Vad and a source of audio frames, yields only + the voiced audio. + + Uses a padded, sliding window algorithm over the audio frames. + When more than 90% of the frames in the window are voiced (as + reported by the VAD), the collector triggers and begins yielding + audio frames. Then the collector waits until 90% of the frames in + the window are unvoiced to detrigger. + + The window is padded at the front and back to provide a small + amount of silence or the beginnings/endings of speech around the + voiced frames. + + Arguments: + + sample_rate - The audio sample rate, in Hz. + frame_duration_ms - The frame duration in milliseconds. + padding_duration_ms - The amount to pad the window, in milliseconds. + vad - An instance of webrtcvad.Vad. + frames - a source of audio frames (sequence or generator). + + Returns: A generator that yields PCM audio data. + """ + num_padding_frames = int(padding_duration_ms / frame_duration_ms) + # We use a deque for our sliding window/ring buffer. + ring_buffer = collections.deque(maxlen=num_padding_frames) + # We have two states: TRIGGERED and NOTTRIGGERED. We start in the + # NOTTRIGGERED state. + triggered = False + + voiced_frames = [] + for frame in frames: + is_speech = vad.is_speech(frame.bytes, sample_rate) + + # sys.stdout.write('1' if is_speech else '0') + if not triggered: + ring_buffer.append((frame, is_speech)) + num_voiced = len([f for f, speech in ring_buffer if speech]) + # If we're NOTTRIGGERED and more than 90% of the frames in + # the ring buffer are voiced frames, then enter the + # TRIGGERED state. 
+ if num_voiced > 0.9 * ring_buffer.maxlen: + triggered = True + # sys.stdout.write('+(%s)' % (ring_buffer[0][0].timestamp,)) + # We want to yield all the audio we see from now until + # we are NOTTRIGGERED, but we have to start with the + # audio that's already in the ring buffer. + for f, _ in ring_buffer: + voiced_frames.append(f) + ring_buffer.clear() + else: + # We're in the TRIGGERED state, so collect the audio data + # and add it to the ring buffer. + voiced_frames.append(frame) + ring_buffer.append((frame, is_speech)) + num_unvoiced = len([f for f, speech in ring_buffer if not speech]) + # If more than 90% of the frames in the ring buffer are + # unvoiced, then enter NOTTRIGGERED and yield whatever + # audio we've collected. + if num_unvoiced > 0.9 * ring_buffer.maxlen: + # sys.stdout.write('-(%s)' % (frame.timestamp + frame.duration)) + triggered = False + yield b"".join([f.bytes for f in voiced_frames]) + ring_buffer.clear() + voiced_frames = [] + # If we have any leftover voiced audio when we run out of input, + # yield it. + if voiced_frames: + yield b"".join([f.bytes for f in voiced_frames]) + +def get_vad_speech_segments(audio, sample_rate, aggressiveness=2, padding_duration_ms=300): + + vad = webrtcvad.Vad(int(aggressiveness)) + frames = list(frame_generator(30, audio, sample_rate)) + segments = vad_collector(sample_rate, 30, padding_duration_ms, vad, frames) + + return segments \ No newline at end of file From 5fc127bb7a40038d3748e455f046283a96e0037d Mon Sep 17 00:00:00 2001 From: Edresson Date: Mon, 22 Nov 2021 08:19:36 -0300 Subject: [PATCH 053/220] Remove the unusable fine-tuning model --- TTS/tts/layers/losses.py | 10 +-- TTS/tts/models/vits.py | 159 ++------------------------------------- 2 files changed, 10 insertions(+), 159 deletions(-) diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index 93a5bad2..acf750a0 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -599,7 +599,6 @@ class VitsGeneratorLoss(nn.Module): feats_disc_fake, feats_disc_real, loss_duration, - fine_tuning_mode=0, use_speaker_encoder_as_loss=False, gt_spk_emb=None, syn_spk_emb=None, @@ -623,14 +622,9 @@ class VitsGeneratorLoss(nn.Module): # compute mel spectrograms from the waveforms mel = self.stft(waveform) mel_hat = self.stft(waveform_hat) + # compute losses - - # ignore tts model loss if fine tunning mode is on - if fine_tuning_mode: - loss_kl = 0.0 - else: - loss_kl = self.kl_loss(z_p, logs_q, m_p, logs_p, z_mask.unsqueeze(1)) * self.kl_loss_alpha - + loss_kl = self.kl_loss(z_p, logs_q, m_p, logs_p, z_mask.unsqueeze(1)) * self.kl_loss_alpha loss_feat = self.feature_loss(feats_disc_fake, feats_disc_real) * self.feat_loss_alpha loss_gen = self.generator_loss(scores_disc_fake)[0] * self.gen_loss_alpha loss_mel = torch.nn.functional.l1_loss(mel, mel_hat) * self.mel_loss_alpha diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 9f895fc1..0abf0ca3 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -167,11 +167,6 @@ class VitsArgs(Coqpit): speaker_encoder_model_path (str): Path to the file speaker encoder checkpoint file, to use for SCL. Defaults to "". - fine_tuning_mode (int): - Fine tuning only the vocoder part of the model, while the rest will be frozen. Defaults to 0. - Mode 0: Disabled; - Mode 1: uses the distribution predicted by the encoder and It's recommended for TTS; - Mode 2: uses the distribution predicted by the encoder and It's recommended for voice conversion. 
""" num_chars: int = 100 @@ -219,7 +214,6 @@ class VitsArgs(Coqpit): use_speaker_encoder_as_loss: bool = False speaker_encoder_config_path: str = "" speaker_encoder_model_path: str = "" - fine_tuning_mode: int = 0 freeze_encoder: bool = False freeze_DP: bool = False freeze_PE: bool = False @@ -672,122 +666,6 @@ class Vits(BaseTTS): ) return outputs - def forward_fine_tuning( - self, - x: torch.tensor, - x_lengths: torch.tensor, - y: torch.tensor, - y_lengths: torch.tensor, - aux_input={"d_vectors": None, "speaker_ids": None, "language_ids": None}, - waveform=None, - ) -> Dict: - """Forward pass of the model. - - Args: - x (torch.tensor): Batch of input character sequence IDs. - x_lengths (torch.tensor): Batch of input character sequence lengths. - y (torch.tensor): Batch of input spectrograms. - y_lengths (torch.tensor): Batch of input spectrogram lengths. - aux_input (dict, optional): Auxiliary inputs for multi-speaker training. Defaults to {"d_vectors": None, "speaker_ids": None}. - - Returns: - Dict: model outputs keyed by the output name. - - Shapes: - - x: :math:`[B, T_seq]` - - x_lengths: :math:`[B]` - - y: :math:`[B, C, T_spec]` - - y_lengths: :math:`[B]` - - d_vectors: :math:`[B, C, 1]` - - speaker_ids: :math:`[B]` - """ - with torch.no_grad(): - outputs = {} - sid, g, lid = self._set_cond_input(aux_input) - # speaker embedding - if self.args.use_speaker_embedding and sid is not None and not self.use_d_vector: - g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] - - # language embedding - lang_emb = None - if self.args.use_language_embedding and lid is not None: - lang_emb = self.emb_l(lid).unsqueeze(-1) - - x, m_p, logs_p, x_mask = self.text_encoder(x, x_lengths, lang_emb=lang_emb) - - # posterior encoder - z, m_q, logs_q, y_mask = self.posterior_encoder(y, y_lengths, g=g) - - # flow layers - z_p = self.flow(z, y_mask, g=g) - - # find the alignment path - attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2) - with torch.no_grad(): - o_scale = torch.exp(-2 * logs_p) - logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - logs_p, [1]).unsqueeze(-1) # [b, t, 1] - logp2 = torch.einsum("klm, kln -> kmn", [o_scale, -0.5 * (z_p ** 2)]) - logp3 = torch.einsum("klm, kln -> kmn", [m_p * o_scale, z_p]) - logp4 = torch.sum(-0.5 * (m_p ** 2) * o_scale, [1]).unsqueeze(-1) # [b, t, 1] - logp = logp2 + logp3 + logp1 + logp4 - attn = maximum_path(logp, attn_mask.squeeze(1)).unsqueeze(1).detach() - - # expand prior - m_p = torch.einsum("klmn, kjm -> kjn", [attn, m_p]) - logs_p = torch.einsum("klmn, kjm -> kjn", [attn, logs_p]) - - # mode 1: like SC-GlowTTS paper; mode 2: recommended for voice conversion - if self.args.fine_tuning_mode == 1: - z_ft = m_p - elif self.args.fine_tuning_mode == 2: - z_ft = z_p - else: - raise RuntimeError(" [!] 
Invalid Fine Tunning Mode !") - - # inverse decoder and get the output - z_f_pred = self.flow(z_ft, y_mask, g=g, reverse=True) - z_slice, slice_ids = rand_segments(z_f_pred, y_lengths, self.spec_segment_size) - - o = self.waveform_decoder(z_slice, g=g) - - wav_seg = segment( - waveform.transpose(1, 2), - slice_ids * self.config.audio.hop_length, - self.args.spec_segment_size * self.config.audio.hop_length, - ) - - if self.args.use_speaker_encoder_as_loss and self.speaker_encoder is not None: - # concate generated and GT waveforms - wavs_batch = torch.cat((wav_seg, o), dim=0).squeeze(1) - - # resample audio to speaker encoder sample_rate - if self.audio_transform is not None: - wavs_batch = self.audio_transform(wavs_batch) - - pred_embs = self.speaker_encoder.forward(wavs_batch, l2_norm=True) - - # split generated and GT speaker embeddings - gt_spk_emb, syn_spk_emb = torch.chunk(pred_embs, 2, dim=0) - else: - gt_spk_emb, syn_spk_emb = None, None - - outputs.update( - { - "model_outputs": o, - "alignments": attn.squeeze(1), - "loss_duration": 0.0, - "z": z, - "z_p": z_p, - "m_p": m_p, - "logs_p": logs_p, - "m_q": m_q, - "logs_q": logs_q, - "waveform_seg": wav_seg, - "gt_spk_emb": gt_spk_emb, - "syn_spk_emb": syn_spk_emb, - } - ) - return outputs def inference(self, x, aux_input={"d_vectors": None, "speaker_ids": None, "language_ids": None}): """ @@ -869,15 +747,6 @@ class Vits(BaseTTS): if optimizer_idx not in [0, 1]: raise ValueError(" [!] Unexpected `optimizer_idx`.") - # generator pass - if self.args.fine_tuning_mode: - # ToDo: find better place fot it - # force eval mode - self.eval() - # restore train mode for the vocoder part - self.waveform_decoder.train() - self.disc.train() - if self.args.freeze_encoder: for param in self.text_encoder.parameters(): param.requires_grad = False @@ -913,25 +782,14 @@ class Vits(BaseTTS): waveform = batch["waveform"] # generator pass - if self.args.fine_tuning_mode: - # model forward - outputs = self.forward_fine_tuning( - text_input, - text_lengths, - linear_input.transpose(1, 2), - mel_lengths, - aux_input={"d_vectors": d_vectors, "speaker_ids": speaker_ids, "language_ids": language_ids}, - waveform=waveform, - ) - else: - outputs = self.forward( - text_input, - text_lengths, - linear_input.transpose(1, 2), - mel_lengths, - aux_input={"d_vectors": d_vectors, "speaker_ids": speaker_ids, "language_ids": language_ids}, - waveform=waveform, - ) + outputs = self.forward( + text_input, + text_lengths, + linear_input.transpose(1, 2), + mel_lengths, + aux_input={"d_vectors": d_vectors, "speaker_ids": speaker_ids, "language_ids": language_ids}, + waveform=waveform, + ) # cache tensors for the discriminator self.y_disc_cache = None @@ -958,7 +816,6 @@ class Vits(BaseTTS): feats_disc_fake=outputs["feats_disc_fake"], feats_disc_real=outputs["feats_disc_real"], loss_duration=outputs["loss_duration"], - fine_tuning_mode=self.args.fine_tuning_mode, use_speaker_encoder_as_loss=self.args.use_speaker_encoder_as_loss, gt_spk_emb=outputs["gt_spk_emb"], syn_spk_emb=outputs["syn_spk_emb"], From 86b2536491405756b63aea832ff079ed15105070 Mon Sep 17 00:00:00 2001 From: Edresson Date: Mon, 22 Nov 2021 08:48:56 -0300 Subject: [PATCH 054/220] Turn more clear the VITS loss function --- TTS/tts/layers/losses.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index acf750a0..9c219998 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -586,6 +586,11 @@ class 
VitsGeneratorLoss(nn.Module): l = kl / torch.sum(z_mask) return l + @staticmethod + def cosine_similarity_loss(gt_spk_emb, syn_spk_emb): + l = - torch.nn.functional.cosine_similarity(gt_spk_emb, syn_spk_emb).mean() + return l + def forward( self, waveform, @@ -632,9 +637,7 @@ class VitsGeneratorLoss(nn.Module): loss = loss_kl + loss_feat + loss_mel + loss_gen + loss_duration if use_speaker_encoder_as_loss: - loss_se = ( - -torch.nn.functional.cosine_similarity(gt_spk_emb, syn_spk_emb).mean() * self.spk_encoder_loss_alpha - ) + loss_se = self.cosine_similarity_loss(gt_spk_emb, syn_spk_emb) * self.spk_encoder_loss_alpha loss += loss_se return_dict["loss_spk_encoder"] = loss_se From ec31dacbb70d2356a34fe3ffa955028a8f9ca57d Mon Sep 17 00:00:00 2001 From: Edresson Date: Mon, 22 Nov 2021 08:57:13 -0300 Subject: [PATCH 055/220] Remove unusable speaker manager function --- TTS/tts/models/vits.py | 2 +- TTS/tts/utils/speakers.py | 14 -------------- 2 files changed, 1 insertion(+), 15 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 0abf0ca3..7b27bc73 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -519,7 +519,7 @@ class Vits(BaseTTS): if speaker_name is None: d_vector = self.speaker_manager.get_random_d_vector() else: - d_vector = self.speaker_manager.get_d_vector_by_speaker(speaker_name) + d_vector = self.speaker_manager.get_mean_d_vector(speaker_name, num_samples=1, randomize=False) elif config.use_speaker_embedding: if speaker_name is None: speaker_id = self.speaker_manager.get_random_speaker_id() diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 8c248658..828abede 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -194,20 +194,6 @@ class SpeakerManager: """ return [x["embedding"] for x in self.d_vectors.values() if x["name"] == speaker_idx] - def get_d_vector_by_speaker(self, speaker_idx: str) -> np.ndarray: - """Get a d_vector of a speaker. - - Args: - speaker_idx (str): Target speaker ID. - - Returns: - np.ndarray: d_vector. - """ - for x in self.d_vectors.values(): - if x["name"] == speaker_idx: - return x["embedding"] - return None - def get_mean_d_vector(self, speaker_idx: str, num_samples: int = None, randomize: bool = False) -> np.ndarray: """Get mean d_vector of a speaker ID. From cad82a92960228ba44a5b2a4b0213163865c24a9 Mon Sep 17 00:00:00 2001 From: Edresson Date: Mon, 22 Nov 2021 09:05:28 -0300 Subject: [PATCH 056/220] Remove the data from the set_d_vectors_from_file function --- TTS/tts/utils/speakers.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 828abede..c1eede3d 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -155,7 +155,7 @@ class SpeakerManager: """ self._save_json(file_path, self.d_vectors) - def set_d_vectors_from_file(self, file_path: str, data: List = None) -> None: + def set_d_vectors_from_file(self, file_path: str) -> None: """Load d_vectors from a json file. 
Args: @@ -163,12 +163,8 @@ class SpeakerManager: """ self.d_vectors = self._load_json(file_path) - # load speakers from data, because during the training we can just use some speakers from d_vector_file - if data is not None: - self.speaker_ids, _ = self.parse_speakers_from_data(data) - else: - speakers = sorted({x["name"] for x in self.d_vectors.values()}) - self.speaker_ids = {name: i for i, name in enumerate(speakers)} + speakers = sorted({x["name"] for x in self.d_vectors.values()}) + self.speaker_ids = {name: i for i, name in enumerate(speakers)} self.clip_ids = list(set(sorted(clip_name for clip_name in self.d_vectors.keys()))) @@ -386,7 +382,7 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, "You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.d_vector_file" ) speaker_manager.load_d_vectors_file(c.d_vector_file) - speaker_manager.set_d_vectors_from_file(speakers_file, data=data) + speaker_manager.set_d_vectors_from_file(speakers_file) elif not c.use_d_vector_file: # restor speaker manager with speaker ID file. speaker_ids_from_data = speaker_manager.speaker_ids speaker_manager.set_speaker_ids_from_file(speakers_file) @@ -395,7 +391,7 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, ), " [!] You cannot introduce new speakers to a pre-trained model." elif c.use_d_vector_file and c.d_vector_file: # new speaker manager with external speaker embeddings. - speaker_manager.set_d_vectors_from_file(c.d_vector_file, data=data) + speaker_manager.set_d_vectors_from_file(c.d_vector_file) elif c.use_d_vector_file and not c.d_vector_file: raise "use_d_vector_file is True, so you need pass a external speaker embedding file." elif c.use_speaker_embedding and "speakers_file" in c and c.speakers_file: From 5ba416e2e8a474c3b02746eb59f534795707274e Mon Sep 17 00:00:00 2001 From: Edresson Date: Mon, 22 Nov 2021 09:18:29 -0300 Subject: [PATCH 057/220] Remove the call to get_speaker_manager --- TTS/bin/extract_tts_spectrograms.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index 0af98ff1..e46e4a00 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -12,7 +12,7 @@ from tqdm import tqdm from TTS.config import load_config from TTS.tts.datasets import TTSDataset, load_tts_samples from TTS.tts.models import setup_model -from TTS.tts.utils.speakers import get_speaker_manager +from TTS.tts.utils.speakers import SpeakerManager from TTS.utils.audio import AudioProcessor from TTS.utils.generic_utils import count_parameters @@ -234,8 +234,13 @@ def main(args): # pylint: disable=redefined-outer-name # use eval and training partitions meta_data = meta_data_train + meta_data_eval - # parse speakers - speaker_manager = get_speaker_manager(c, args, meta_data_train) + # init speaker manager + if config.use_speaker_embedding: + speaker_manager = SpeakerManager(data_items=meta_data) + elif config.use_d_vector_file: + speaker_manager = SpeakerManager(d_vectors_file_path=c.d_vector_file) + else: + speaker_manager = None # setup model model = setup_model(c) From d0cb700e7d046a4c6562baf6c0f7003afcd6040c Mon Sep 17 00:00:00 2001 From: Edresson Date: Mon, 22 Nov 2021 09:25:32 -0300 Subject: [PATCH 058/220] Set the new_fields in copy_model_files as None by default --- TTS/trainer.py | 4 ++-- TTS/utils/io.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git 
a/TTS/trainer.py b/TTS/trainer.py index c151e716..b9026c8e 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -202,7 +202,7 @@ class Trainer: os.makedirs(output_path, exist_ok=True) # copy training assets to the output folder - copy_model_files(config, output_path, new_fields=None) + copy_model_files(config, output_path) # init class members self.args = args @@ -274,7 +274,7 @@ class Trainer: self.config.num_languages = self.model.language_manager.num_languages # update config file - copy_model_files(self.config, self.output_path, None) + copy_model_files(self.config, self.output_path) # setup criterion self.criterion = self.get_criterion(self.model) diff --git a/TTS/utils/io.py b/TTS/utils/io.py index a93f6118..54818ce9 100644 --- a/TTS/utils/io.py +++ b/TTS/utils/io.py @@ -26,7 +26,7 @@ class AttrDict(dict): self.__dict__ = self -def copy_model_files(config: Coqpit, out_path, new_fields): +def copy_model_files(config: Coqpit, out_path, new_fields=None): """Copy config.json and other model files to training folder and add new fields. From 8ae22725e178f2daf098802b22dc6c08874e1a48 Mon Sep 17 00:00:00 2001 From: Edresson Date: Mon, 22 Nov 2021 13:43:44 -0300 Subject: [PATCH 059/220] Fix the bug in extract tts spectrograms --- TTS/bin/extract_tts_spectrograms.py | 9 +++++---- TTS/bin/train_tts.py | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index e46e4a00..014ba4e8 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -37,8 +37,8 @@ def setup_loader(ap, r, verbose=False): enable_eos_bos=c.enable_eos_bos_chars, use_noise_augment=False, verbose=verbose, - speaker_id_mapping=speaker_manager.speaker_ids, - d_vector_mapping=speaker_manager.d_vectors if c.use_speaker_embedding and c.use_d_vector_file else None, + speaker_id_mapping=speaker_manager.speaker_ids if c.use_speaker_embedding else None, + d_vector_mapping=speaker_manager.d_vectors if c.use_d_vector_file else None, ) if c.use_phonemes and c.compute_input_seq_cache: @@ -235,13 +235,14 @@ def main(args): # pylint: disable=redefined-outer-name meta_data = meta_data_train + meta_data_eval # init speaker manager - if config.use_speaker_embedding: + if c.use_speaker_embedding: speaker_manager = SpeakerManager(data_items=meta_data) - elif config.use_d_vector_file: + elif c.use_d_vector_file: speaker_manager = SpeakerManager(d_vectors_file_path=c.d_vector_file) else: speaker_manager = None + # setup model model = setup_model(c) diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index 1a9faf02..e28e9dec 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -64,7 +64,7 @@ def main(): train_samples=train_samples, eval_samples=eval_samples, training_assets={"audio_processor": ap}, - parse_command_line_args=True, + parse_command_line_args=False, ) trainer.fit() From 9781e4d51680f22c23938329c5264082f73f554a Mon Sep 17 00:00:00 2001 From: Edresson Date: Mon, 22 Nov 2021 15:55:00 -0300 Subject: [PATCH 060/220] Lint fixs --- TTS/bin/find_unique_phonemes.py | 4 ++-- TTS/speaker_encoder/models/resnet.py | 1 + TTS/tts/configs/vits_config.py | 4 ++-- TTS/utils/vad.py | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/TTS/bin/find_unique_phonemes.py b/TTS/bin/find_unique_phonemes.py index a869df27..832ef082 100644 --- a/TTS/bin/find_unique_phonemes.py +++ b/TTS/bin/find_unique_phonemes.py @@ -7,7 +7,7 @@ from tqdm.contrib.concurrent import process_map from TTS.config import load_config from 
TTS.tts.datasets import load_tts_samples -from TTS.tts.utils.text import text2phone, phoneme_to_sequence +from TTS.tts.utils.text import text2phone def compute_phonemes(item): @@ -19,8 +19,8 @@ def compute_phonemes(item): return [] return list(set(ph)) - def main(): + # pylint: disable=W0601 global c # pylint: disable=bad-option-value parser = argparse.ArgumentParser( diff --git a/TTS/speaker_encoder/models/resnet.py b/TTS/speaker_encoder/models/resnet.py index 42f041b4..47b6f23f 100644 --- a/TTS/speaker_encoder/models/resnet.py +++ b/TTS/speaker_encoder/models/resnet.py @@ -176,6 +176,7 @@ class ResNetSpeakerEncoder(nn.Module): def forward(self, x, l2_norm=False): with torch.no_grad(): with torch.cuda.amp.autocast(enabled=False): + # if you torch spec compute it otherwise use the mel spec computed by the AP if self.use_torch_spec: x = self.torch_spec(x) else: diff --git a/TTS/tts/configs/vits_config.py b/TTS/tts/configs/vits_config.py index eeb74bbe..178992a7 100644 --- a/TTS/tts/configs/vits_config.py +++ b/TTS/tts/configs/vits_config.py @@ -154,6 +154,6 @@ class VitsConfig(BaseTTSConfig): d_vector_dim: int = None def __post_init__(self): - for key in self.model_args.keys(): + for key, val in self.model_args.items(): if hasattr(self, key): - self[key] = self.model_args[key] + self[key] = val diff --git a/TTS/utils/vad.py b/TTS/utils/vad.py index 4e61f490..33548087 100644 --- a/TTS/utils/vad.py +++ b/TTS/utils/vad.py @@ -139,4 +139,4 @@ def get_vad_speech_segments(audio, sample_rate, aggressiveness=2, padding_durati frames = list(frame_generator(30, audio, sample_rate)) segments = vad_collector(sample_rate, 30, padding_duration_ms, vad, frames) - return segments \ No newline at end of file + return segments From 1251d043871c58c109acbdf57cd6d1afbfbbd8e2 Mon Sep 17 00:00:00 2001 From: Edresson Date: Mon, 22 Nov 2021 16:41:46 -0300 Subject: [PATCH 061/220] Fix function name --- TTS/tts/models/vits.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 7b27bc73..ccd742b1 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -491,7 +491,7 @@ class Vits(BaseTTS): sid, g, lid = self._set_cond_input(aux_input) return {"speaker_id": sid, "style_wav": None, "d_vector": g, "language_id": lid} - def get_aux_input_from_test_setences(self, sentence_info): + def get_aux_input_from_test_sentences(self, sentence_info): if hasattr(self.config, "model_args"): config = self.config.model_args else: @@ -907,7 +907,7 @@ class Vits(BaseTTS): test_sentences = self.config.test_sentences for idx, s_info in enumerate(test_sentences): try: - aux_inputs = self.get_aux_input_from_test_setences(s_info) + aux_inputs = self.get_aux_input_from_test_sentences(s_info) wav, alignment, _, _ = synthesis( self, aux_inputs["text"], From 346bd931b2010df4180daf0ec4cbdf21164d4a93 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Mon, 22 Nov 2021 22:42:12 +0100 Subject: [PATCH 062/220] remove inference notebook --- .../VITS_d-vector_multilingual_exemple.ipynb | 223 ------------------ 1 file changed, 223 deletions(-) delete mode 100644 notebooks/VITS_d-vector_multilingual_exemple.ipynb diff --git a/notebooks/VITS_d-vector_multilingual_exemple.ipynb b/notebooks/VITS_d-vector_multilingual_exemple.ipynb deleted file mode 100644 index 41713295..00000000 --- a/notebooks/VITS_d-vector_multilingual_exemple.ipynb +++ /dev/null @@ -1,223 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "source": [ - "import IPython\n", - "import 
torch\n", - "\n", - "from IPython.display import Audio\n", - "\n", - "from TTS.config import load_config\n", - "from TTS.tts.models import setup_model\n", - "from TTS.tts.utils.synthesis import synthesis\n", - "from TTS.utils.audio import AudioProcessor" - ], - "outputs": [], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 2, - "source": [ - "GENERAL_PATH = '/home/julian/workspace/train/VITS-pt-en-fr-lr/vits-August-29-2021_01+20PM-c68d7fa25/'\n", - "MODEL_PATH = GENERAL_PATH + 'best_model.pth.tar'\n", - "CONFIG_PATH = GENERAL_PATH + 'config.json'\n", - "TTS_LANGUAGES = GENERAL_PATH + \"language_ids.json\"\n", - "TTS_SPEAKERS = GENERAL_PATH + \"speakers.json\"\n", - "USE_CUDA = torch.cuda.is_available()" - ], - "outputs": [], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 3, - "source": [ - "# load the config\n", - "C = load_config(CONFIG_PATH)\n", - "\n", - "# load the audio processor\n", - "ap = AudioProcessor(**C.audio)\n", - "\n", - "speaker_embedding = None\n", - "\n", - "C.model_args['d_vector_file'] = TTS_SPEAKERS\n", - "\n", - "model = setup_model(C)\n", - "model.language_manager.set_language_ids_from_file(TTS_LANGUAGES)\n", - "cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))\n", - "model.load_state_dict(cp['model'])\n", - "\n", - "\n", - "model.eval()\n", - "\n", - "if USE_CUDA:\n", - " model = model.cuda()\n", - "\n", - "use_griffin_lim = True" - ], - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - " > Setting up Audio Processor...\n", - " | > sample_rate:16000\n", - " | > resample:False\n", - " | > num_mels:80\n", - " | > min_level_db:-100\n", - " | > frame_shift_ms:None\n", - " | > frame_length_ms:None\n", - " | > ref_level_db:20\n", - " | > fft_size:1024\n", - " | > power:1.5\n", - " | > preemphasis:0.0\n", - " | > griffin_lim_iters:60\n", - " | > signal_norm:False\n", - " | > symmetric_norm:True\n", - " | > mel_fmin:0\n", - " | > mel_fmax:None\n", - " | > spec_gain:1.0\n", - " | > stft_pad_mode:reflect\n", - " | > max_norm:4.0\n", - " | > clip_norm:True\n", - " | > do_trim_silence:True\n", - " | > trim_db:45\n", - " | > do_sound_norm:False\n", - " | > do_amp_to_db_linear:False\n", - " | > do_amp_to_db_mel:True\n", - " | > stats_path:None\n", - " | > base:2.718281828459045\n", - " | > hop_length:256\n", - " | > win_length:1024\n", - " > Using model: vits\n", - " > Speaker manager is loaded with 421 speakers: ED, MLS_10032, MLS_10058, MLS_10065, MLS_10082, MLS_10087, MLS_10177, MLS_103, MLS_10620, MLS_10827, MLS_10957, MLS_112, MLS_11247, MLS_1127, MLS_115, MLS_11743, MLS_11772, MLS_11795, MLS_11822, MLS_11875, MLS_11954, MLS_12205, MLS_123, MLS_1243, MLS_125, MLS_12501, MLS_12512, MLS_12541, MLS_12709, MLS_12713, MLS_12823, MLS_12899, MLS_12968, MLS_12981, MLS_13142, MLS_13177, MLS_1329, MLS_13611, MLS_13634, MLS_13655, MLS_13658, MLS_14, MLS_1474, MLS_1579, MLS_1590, MLS_1591, MLS_1624, MLS_1649, MLS_1664, MLS_1745, MLS_177, MLS_1770, MLS_1798, MLS_1805, MLS_1817, MLS_1840, MLS_1844, MLS_1869, MLS_1887, MLS_1977, MLS_1989, MLS_2033, MLS_204, MLS_2155, MLS_2284, MLS_2297, MLS_2316, MLS_2506, MLS_2544, MLS_2587, MLS_2596, MLS_2607, MLS_27, MLS_2771, MLS_2776, MLS_28, MLS_2825, MLS_2904, MLS_2926, MLS_2946, MLS_30, MLS_3060, MLS_3182, MLS_3190, MLS_3204, MLS_3267, MLS_3270, MLS_3319, MLS_3344, MLS_3370, MLS_3464, MLS_3503, MLS_3595, MLS_3698, MLS_4018, MLS_4174, MLS_4193, MLS_4336, MLS_4396, MLS_4512, MLS_4609, MLS_4650, MLS_4699, MLS_4724, MLS_4744, MLS_4937, MLS_5021, MLS_5077, MLS_52, 
MLS_5232, MLS_5295, MLS_5525, MLS_5526, MLS_5553, MLS_5595, MLS_5612, MLS_5764, MLS_577, MLS_579, MLS_5830, MLS_5840, MLS_5968, MLS_6070, MLS_6128, MLS_62, MLS_6249, MLS_6318, MLS_6348, MLS_6362, MLS_6381, MLS_66, MLS_6856, MLS_694, MLS_7032, MLS_707, MLS_7142, MLS_7150, MLS_7193, MLS_7200, MLS_7239, MLS_7377, MLS_7423, MLS_7438, MLS_7439, MLS_753, MLS_7591, MLS_7601, MLS_7614, MLS_7679, MLS_78, MLS_7848, MLS_8102, MLS_8128, MLS_8582, MLS_8778, MLS_9121, MLS_9242, MLS_928, MLS_94, MLS_9804, MLS_9854, VCTK_p225, VCTK_p226, VCTK_p227, VCTK_p228, VCTK_p229, VCTK_p230, VCTK_p231, VCTK_p232, VCTK_p233, VCTK_p234, VCTK_p236, VCTK_p237, VCTK_p238, VCTK_p239, VCTK_p240, VCTK_p241, VCTK_p243, VCTK_p244, VCTK_p245, VCTK_p246, VCTK_p247, VCTK_p248, VCTK_p249, VCTK_p250, VCTK_p251, VCTK_p252, VCTK_p253, VCTK_p254, VCTK_p255, VCTK_p256, VCTK_p257, VCTK_p258, VCTK_p259, VCTK_p260, VCTK_p261, VCTK_p262, VCTK_p263, VCTK_p264, VCTK_p265, VCTK_p266, VCTK_p267, VCTK_p268, VCTK_p269, VCTK_p270, VCTK_p271, VCTK_p272, VCTK_p273, VCTK_p274, VCTK_p275, VCTK_p276, VCTK_p277, VCTK_p278, VCTK_p279, VCTK_p280, VCTK_p281, VCTK_p282, VCTK_p283, VCTK_p284, VCTK_p285, VCTK_p286, VCTK_p287, VCTK_p288, VCTK_p292, VCTK_p293, VCTK_p294, VCTK_p295, VCTK_p297, VCTK_p298, VCTK_p299, VCTK_p300, VCTK_p301, VCTK_p302, VCTK_p303, VCTK_p304, VCTK_p305, VCTK_p306, VCTK_p307, VCTK_p308, VCTK_p310, VCTK_p311, VCTK_p312, VCTK_p313, VCTK_p314, VCTK_p316, VCTK_p317, VCTK_p318, VCTK_p323, VCTK_p326, VCTK_p329, VCTK_p330, VCTK_p333, VCTK_p334, VCTK_p335, VCTK_p336, VCTK_p339, VCTK_p340, VCTK_p341, VCTK_p343, VCTK_p345, VCTK_p347, VCTK_p351, VCTK_p360, VCTK_p361, VCTK_p362, VCTK_p363, VCTK_p364, VCTK_p374, VCTK_p376, bernard, elodie, ezwa, gilles_g_le_blanc, nadine_eckert_boulet, openSLR_afr0184, openSLR_afr1919, openSLR_afr2418, openSLR_afr6590, openSLR_afr7130, openSLR_afr7214, openSLR_afr8148, openSLR_afr8924, openSLR_afr8963, openSLR_jvf00264, openSLR_jvf00658, openSLR_jvf01392, openSLR_jvf02059, openSLR_jvf02884, openSLR_jvf03187, openSLR_jvf04679, openSLR_jvf04715, openSLR_jvf04982, openSLR_jvf05540, openSLR_jvf06207, openSLR_jvf06510, openSLR_jvf06941, openSLR_jvf07335, openSLR_jvf07638, openSLR_jvf08002, openSLR_jvf08305, openSLR_jvf08736, openSLR_jvf09039, openSLR_jvm00027, openSLR_jvm01519, openSLR_jvm01932, openSLR_jvm02326, openSLR_jvm03314, openSLR_jvm03424, openSLR_jvm03727, openSLR_jvm04175, openSLR_jvm04285, openSLR_jvm04588, openSLR_jvm05219, openSLR_jvm05522, openSLR_jvm05667, openSLR_jvm05970, openSLR_jvm06080, openSLR_jvm06383, openSLR_jvm07765, openSLR_jvm07875, openSLR_jvm08178, openSLR_jvm09724, openSLR_sso0145, openSLR_sso0493, openSLR_sso0806, openSLR_sso1266, openSLR_sso1367, openSLR_sso1801, openSLR_sso2388, openSLR_sso2910, openSLR_sso4592, openSLR_sso5945, openSLR_sso6499, openSLR_sso7801, openSLR_sso7821, openSLR_sso7876, openSLR_sso7912, openSLR_sso7934, openSLR_sso8596, openSLR_sso8777, openSLR_sso9892, openSLR_suf00297, openSLR_suf00600, openSLR_suf00691, openSLR_suf00994, openSLR_suf01056, openSLR_suf01359, openSLR_suf02092, openSLR_suf02395, openSLR_suf02953, openSLR_suf03712, openSLR_suf03887, openSLR_suf04190, openSLR_suf04646, openSLR_suf04748, openSLR_suf05051, openSLR_suf05507, openSLR_suf06543, openSLR_suf07302, openSLR_suf08338, openSLR_suf08703, openSLR_sum00060, openSLR_sum00454, openSLR_sum01038, openSLR_sum01552, openSLR_sum01596, openSLR_sum01855, openSLR_sum01899, openSLR_sum02716, openSLR_sum03391, openSLR_sum03650, openSLR_sum03694, openSLR_sum04208, openSLR_sum04511, openSLR_sum05186, 
openSLR_sum06003, openSLR_sum06047, openSLR_sum07842, openSLR_sum08659, openSLR_sum09243, openSLR_sum09637, openSLR_sum09757, openSLR_tsn0045, openSLR_tsn0378, openSLR_tsn0441, openSLR_tsn1483, openSLR_tsn1498, openSLR_tsn1932, openSLR_tsn2839, openSLR_tsn3342, openSLR_tsn3629, openSLR_tsn4506, openSLR_tsn4850, openSLR_tsn5628, openSLR_tsn6116, openSLR_tsn6206, openSLR_tsn6234, openSLR_tsn6459, openSLR_tsn7674, openSLR_tsn7693, openSLR_tsn7866, openSLR_tsn7896, openSLR_tsn8333, openSLR_tsn8512, openSLR_tsn8532, openSLR_tsn8914, openSLR_tsn9061, openSLR_tsn9365, openSLR_xho0050, openSLR_xho0120, openSLR_xho1547, openSLR_xho3616, openSLR_xho4280, openSLR_xho4291, openSLR_xho5378, openSLR_xho5680, openSLR_xho6975, openSLR_xho7590, openSLR_xho7599, openSLR_xho9446, zeckou\n" - ] - } - ], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 4, - "source": [ - "#set speaker\n", - "d_vector = model.speaker_manager.get_mean_d_vector('VCTK_p260')" - ], - "outputs": [], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 5, - "source": [ - "model.language_manager.language_id_mapping" - ], - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "{'af': 0,\n", - " 'en': 1,\n", - " 'fr-fr': 2,\n", - " 'jv': 3,\n", - " 'pt-br': 4,\n", - " 'st': 5,\n", - " 'su': 6,\n", - " 'tn': 7,\n", - " 'xh': 8}" - ] - }, - "metadata": {}, - "execution_count": 5 - } - ], - "metadata": { - "scrolled": true - } - }, - { - "cell_type": "code", - "execution_count": 6, - "source": [ - "# set scales \n", - "model.noise_scale = 0.0 # defines the noise variance applied to the random z vector at inference.\n", - "model.length_scale = 1.0 # scaler for the duration predictor. The larger it is, the slower the speech.\n", - "model.noise_scale_w = 0.0 # defines the noise variance applied to the duration predictor z vector at inference.\n", - "model.inference_noise_scale = 0.5 # defines the noise variance applied to the random z vector at inference.\n", - "model.inference_noise_scale_dp = 0.6 # defines the noise variance applied to the duration predictor z vector at inference." 
- ], - "outputs": [], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 7, - "source": [ - "text = \"Il m'a fallu beaucoup de temps pour développer une voix, et maintenant que je l'ai, je ne vais pas me taire.\"\n", - "language_id = 2\n", - "wav, alignment, _, _ = synthesis(\n", - " model,\n", - " text,\n", - " C,\n", - " \"cuda\" in str(next(model.parameters()).device),\n", - " ap,\n", - " speaker_id=None,\n", - " d_vector=d_vector,\n", - " style_wav=None,\n", - " language_id=language_id,\n", - " enable_eos_bos_chars=C.enable_eos_bos_chars,\n", - " use_griffin_lim=True,\n", - " do_trim_silence=False,\n", - " ).values()\n", - "IPython.display.display(Audio(wav, rate=ap.sample_rate))" - ], - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {} - } - ], - "metadata": {} - } - ], - "metadata": { - "interpreter": { - "hash": "b925b73899c1545aa2d9bbcf4e8e1df4138a367d2daefc2707570579325ca4c0" - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3.8.10 64-bit ('TTS': conda)" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file From aa1a070d5858c322820f6e52989488746fdf5c79 Mon Sep 17 00:00:00 2001 From: Edresson Date: Mon, 22 Nov 2021 18:42:54 -0300 Subject: [PATCH 063/220] Rename ununsed_speakers to ignored_speakers --- TTS/config/shared_configs.py | 2 +- TTS/tts/datasets/__init__.py | 7 +++--- TTS/tts/datasets/formatters.py | 42 +++++++++++++++++----------------- 3 files changed, 25 insertions(+), 26 deletions(-) diff --git a/TTS/config/shared_configs.py b/TTS/config/shared_configs.py index f1ea2e0f..c52cfe8a 100644 --- a/TTS/config/shared_configs.py +++ b/TTS/config/shared_configs.py @@ -198,7 +198,7 @@ class BaseDatasetConfig(Coqpit): name: str = "" path: str = "" meta_file_train: str = "" - ununsed_speakers: List[str] = None + ignored_speakers: List[str] = None language: str = "" meta_file_val: str = "" meta_file_attn_mask: str = "" diff --git a/TTS/tts/datasets/__init__.py b/TTS/tts/datasets/__init__.py index 3673e188..40eed7e3 100644 --- a/TTS/tts/datasets/__init__.py +++ b/TTS/tts/datasets/__init__.py @@ -67,22 +67,21 @@ def load_tts_samples( root_path = dataset["path"] meta_file_train = dataset["meta_file_train"] meta_file_val = dataset["meta_file_val"] - ununsed_speakers = dataset["ununsed_speakers"] + ignored_speakers = dataset["ignored_speakers"] language = dataset["language"] # setup the right data processor if formatter is None: formatter = _get_formatter_by_name(name) # load train set - meta_data_train = formatter(root_path, meta_file_train, ununsed_speakers=ununsed_speakers) - # TODO: remove the loops and pass language as a parameter to preprocessor for faster load + meta_data_train = formatter(root_path, meta_file_train, ignored_speakers=ignored_speakers) meta_data_train = [[*item, language] for item in meta_data_train] print(f" | > Found {len(meta_data_train)} files in {Path(root_path).resolve()}") # load evaluation split if set if eval_split: if meta_file_val: - meta_data_eval = formatter(root_path, meta_file_val, ununsed_speakers=ununsed_speakers) + meta_data_eval = formatter(root_path, meta_file_val, ignored_speakers=ignored_speakers) meta_data_eval = [[*item, 
language] for item in meta_data_eval] else: meta_data_eval, meta_data_train = split_dataset(meta_data_train) diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index 49a1ced4..1f23f85e 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -59,7 +59,7 @@ def mozilla_de(root_path, meta_file, **kwargs): # pylint: disable=unused-argume return items -def mailabs(root_path, meta_files=None, ununsed_speakers=None): +def mailabs(root_path, meta_files=None, ignored_speakers=None): """Normalizes M-AI-Labs meta data files to TTS format Args: @@ -88,8 +88,8 @@ def mailabs(root_path, meta_files=None, ununsed_speakers=None): continue speaker_name = speaker_name_match.group("speaker_name") # ignore speakers - if isinstance(ununsed_speakers, list): - if speaker_name in ununsed_speakers: + if isinstance(ignored_speakers, list): + if speaker_name in ignored_speakers: continue print(" | > {}".format(csv_file)) with open(txt_file, "r", encoding="utf-8") as ttf: @@ -197,7 +197,7 @@ def nancy(root_path, meta_file, **kwargs): # pylint: disable=unused-argument return items -def common_voice(root_path, meta_file, ununsed_speakers=None): +def common_voice(root_path, meta_file, ignored_speakers=None): """Normalize the common voice meta data file to TTS format.""" txt_file = os.path.join(root_path, meta_file) items = [] @@ -209,15 +209,15 @@ def common_voice(root_path, meta_file, ununsed_speakers=None): text = cols[2] speaker_name = cols[0] # ignore speakers - if isinstance(ununsed_speakers, list): - if speaker_name in ununsed_speakers: + if isinstance(ignored_speakers, list): + if speaker_name in ignored_speakers: continue wav_file = os.path.join(root_path, "clips", cols[1].replace(".mp3", ".wav")) items.append([text, wav_file, "MCV_" + speaker_name]) return items -def libri_tts(root_path, meta_files=None, ununsed_speakers=None): +def libri_tts(root_path, meta_files=None, ignored_speakers=None): """https://ai.google/tools/datasets/libri-tts/""" items = [] if not meta_files: @@ -237,8 +237,8 @@ def libri_tts(root_path, meta_files=None, ununsed_speakers=None): wav_file = os.path.join(_root_path, file_name + ".wav") text = cols[2] # ignore speakers - if isinstance(ununsed_speakers, list): - if speaker_name in ununsed_speakers: + if isinstance(ignored_speakers, list): + if speaker_name in ignored_speakers: continue items.append([text, wav_file, "LTTS_" + speaker_name]) for item in items: @@ -265,7 +265,7 @@ def custom_turkish(root_path, meta_file, **kwargs): # pylint: disable=unused-ar # ToDo: add the dataset link when the dataset is released publicly -def brspeech(root_path, meta_file, ununsed_speakers=None): +def brspeech(root_path, meta_file, ignored_speakers=None): """BRSpeech 3.0 beta""" txt_file = os.path.join(root_path, meta_file) items = [] @@ -278,14 +278,14 @@ def brspeech(root_path, meta_file, ununsed_speakers=None): text = cols[2] speaker_id = cols[3] # ignore speakers - if isinstance(ununsed_speakers, list): - if speaker_id in ununsed_speakers: + if isinstance(ignored_speakers, list): + if speaker_id in ignored_speakers: continue items.append([text, wav_file, speaker_id]) return items -def vctk(root_path, meta_files=None, wavs_path="wav48", ununsed_speakers=None): +def vctk(root_path, meta_files=None, wavs_path="wav48", ignored_speakers=None): """homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz""" items = [] meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) @@ -293,8 +293,8 @@ def vctk(root_path, meta_files=None, 
wavs_path="wav48", ununsed_speakers=None): _, speaker_id, txt_file = os.path.relpath(meta_file, root_path).split(os.sep) file_id = txt_file.split(".")[0] # ignore speakers - if isinstance(ununsed_speakers, list): - if speaker_id in ununsed_speakers: + if isinstance(ignored_speakers, list): + if speaker_id in ignored_speakers: continue with open(meta_file, "r", encoding="utf-8") as file_text: text = file_text.readlines()[0] @@ -304,7 +304,7 @@ def vctk(root_path, meta_files=None, wavs_path="wav48", ununsed_speakers=None): return items -def vctk_slim(root_path, meta_files=None, wavs_path="wav48", ununsed_speakers=None): # pylint: disable=unused-argument +def vctk_slim(root_path, meta_files=None, wavs_path="wav48", ignored_speakers=None): # pylint: disable=unused-argument """homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz""" items = [] txt_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) @@ -312,8 +312,8 @@ def vctk_slim(root_path, meta_files=None, wavs_path="wav48", ununsed_speakers=No _, speaker_id, txt_file = os.path.relpath(text_file, root_path).split(os.sep) file_id = txt_file.split(".")[0] # ignore speakers - if isinstance(ununsed_speakers, list): - if speaker_id in ununsed_speakers: + if isinstance(ignored_speakers, list): + if speaker_id in ignored_speakers: continue wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id + ".wav") items.append([None, wav_file, "VCTK_" + speaker_id]) @@ -321,7 +321,7 @@ def vctk_slim(root_path, meta_files=None, wavs_path="wav48", ununsed_speakers=No return items -def mls(root_path, meta_files=None, ununsed_speakers=None): +def mls(root_path, meta_files=None, ignored_speakers=None): """http://www.openslr.org/94/""" items = [] with open(os.path.join(root_path, meta_files), "r", encoding="utf-8") as meta: @@ -331,8 +331,8 @@ def mls(root_path, meta_files=None, ununsed_speakers=None): speaker, book, *_ = file.split("_") wav_file = os.path.join(root_path, os.path.dirname(meta_files), "audio", speaker, book, file + ".wav") # ignore speakers - if isinstance(ununsed_speakers, list): - if speaker in ununsed_speakers: + if isinstance(ignored_speakers, list): + if speaker in ignored_speakers: continue items.append([text, wav_file, "MLS_" + speaker]) return items From 88f4369c636060ce7c7ee0740d631259bda6f424 Mon Sep 17 00:00:00 2001 From: Edresson Date: Mon, 22 Nov 2021 18:43:56 -0300 Subject: [PATCH 064/220] Fix the bug in multispeaker vits --- TTS/bin/train_tts.py | 9 +++++++++ tests/tts_tests/test_vits_multilingual_train.py | 6 ++++++ 2 files changed, 15 insertions(+) diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index e28e9dec..f39ed259 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -47,8 +47,17 @@ def main(): # init speaker manager if config.use_speaker_embedding: speaker_manager = SpeakerManager(data_items=train_samples + eval_samples) + if hasattr(config, "model_args"): + config.model_args.num_speakers = len(speaker_manager.speaker_ids) + else: + config.num_speakers = len(speaker_manager.speaker_ids) + elif config.use_d_vector_file: speaker_manager = SpeakerManager(d_vectors_file_path=config.d_vector_file) + if hasattr(config, "model_args"): + config.model_args.num_speakers = len(speaker_manager.speaker_ids) + else: + config.num_speakers = len(speaker_manager.speaker_ids) else: speaker_manager = None diff --git a/tests/tts_tests/test_vits_multilingual_train.py b/tests/tts_tests/test_vits_multilingual_train.py index 04b42e61..a280e8c5 100644 --- 
a/tests/tts_tests/test_vits_multilingual_train.py +++ b/tests/tts_tests/test_vits_multilingual_train.py @@ -53,9 +53,15 @@ config.audio.trim_db = 60 # active multilingual mode config.model_args.use_language_embedding = True +config.use_language_embedding = True # active multispeaker mode config.model_args.use_speaker_embedding = True +config.use_speaker_embedding = True +# config.num_speakers=1 +# config.model_args.num_speakers=1 + config.model_args.use_d_vector_file = False +config.use_d_vector_file = False # active language sampler config.use_language_weighted_sampler = True From 5782df8ffe99d65e4fc32bf8b455baaca993e31d Mon Sep 17 00:00:00 2001 From: Edresson Date: Mon, 22 Nov 2021 18:52:59 -0300 Subject: [PATCH 065/220] Get the number speaker from the Speaker Manager property --- TTS/bin/train_tts.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index f39ed259..a543a947 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -48,16 +48,15 @@ def main(): if config.use_speaker_embedding: speaker_manager = SpeakerManager(data_items=train_samples + eval_samples) if hasattr(config, "model_args"): - config.model_args.num_speakers = len(speaker_manager.speaker_ids) + config.model_args.num_speakers = speaker_manager.num_speakers else: - config.num_speakers = len(speaker_manager.speaker_ids) - + config.num_speakers = speaker_manager.num_speakers elif config.use_d_vector_file: speaker_manager = SpeakerManager(d_vectors_file_path=config.d_vector_file) if hasattr(config, "model_args"): - config.model_args.num_speakers = len(speaker_manager.speaker_ids) + config.model_args.num_speakers = speaker_manager.num_speakers else: - config.num_speakers = len(speaker_manager.speaker_ids) + config.num_speakers = speaker_manager.num_speakers else: speaker_manager = None From 87059e3bbba1219b980f0f19eb7302230f2296c8 Mon Sep 17 00:00:00 2001 From: Edresson Date: Mon, 22 Nov 2021 20:02:05 -0300 Subject: [PATCH 066/220] Add the language embedding dim in the duration predictor class --- TTS/tts/layers/glow_tts/duration_predictor.py | 5 +++++ TTS/tts/layers/vits/stochastic_duration_predictor.py | 6 +++++- TTS/tts/models/vits.py | 4 ++-- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/TTS/tts/layers/glow_tts/duration_predictor.py b/TTS/tts/layers/glow_tts/duration_predictor.py index f46c73a9..e766ed6a 100644 --- a/TTS/tts/layers/glow_tts/duration_predictor.py +++ b/TTS/tts/layers/glow_tts/duration_predictor.py @@ -20,6 +20,11 @@ class DurationPredictor(nn.Module): def __init__(self, in_channels, hidden_channels, kernel_size, dropout_p, cond_channels=None, language_emb_dim=None): super().__init__() + + # add language embedding dim in the input + if language_emb_dim: + in_channels += language_emb_dim + # class arguments self.in_channels = in_channels self.filter_channels = hidden_channels diff --git a/TTS/tts/layers/vits/stochastic_duration_predictor.py b/TTS/tts/layers/vits/stochastic_duration_predictor.py index 7c25156a..120d0944 100644 --- a/TTS/tts/layers/vits/stochastic_duration_predictor.py +++ b/TTS/tts/layers/vits/stochastic_duration_predictor.py @@ -185,10 +185,14 @@ class StochasticDurationPredictor(nn.Module): dropout_p: float, num_flows=4, cond_channels=0, - language_emb_dim=None, + language_emb_dim=0, ): super().__init__() + # add language embedding dim in the input + if language_emb_dim: + in_channels += language_emb_dim + # condition encoder text self.pre = nn.Conv1d(in_channels, hidden_channels, 1) self.convs 
= DilatedDepthSeparableConv(hidden_channels, kernel_size, num_layers=3, dropout_p=dropout_p) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index ccd742b1..6b1dd325 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -321,7 +321,7 @@ class Vits(BaseTTS): if args.use_sdp: self.duration_predictor = StochasticDurationPredictor( - args.hidden_channels + self.embedded_language_dim, + args.hidden_channels, 192, 3, args.dropout_p_duration_predictor, @@ -331,7 +331,7 @@ class Vits(BaseTTS): ) else: self.duration_predictor = DurationPredictor( - args.hidden_channels + self.embedded_language_dim, + args.hidden_channels, 256, 3, args.dropout_p_duration_predictor, From 65b5ed4427981b90d355df7c0083d14fb32f7b0e Mon Sep 17 00:00:00 2001 From: Edresson Date: Mon, 22 Nov 2021 20:19:37 -0300 Subject: [PATCH 067/220] Add VITS multilingual d-vectors unit test --- .../test_vits_multilingual_train-d_vectors.py | 93 +++++++++++++++++++ .../tts_tests/test_vits_multilingual_train.py | 8 +- 2 files changed, 99 insertions(+), 2 deletions(-) create mode 100644 tests/tts_tests/test_vits_multilingual_train-d_vectors.py diff --git a/tests/tts_tests/test_vits_multilingual_train-d_vectors.py b/tests/tts_tests/test_vits_multilingual_train-d_vectors.py new file mode 100644 index 00000000..f426e383 --- /dev/null +++ b/tests/tts_tests/test_vits_multilingual_train-d_vectors.py @@ -0,0 +1,93 @@ +import glob +import os +import shutil + +from tests import get_device_id, get_tests_output_path, run_cli +from TTS.config.shared_configs import BaseDatasetConfig +from TTS.tts.configs.vits_config import VitsConfig + +config_path = os.path.join(get_tests_output_path(), "test_model_config.json") +output_path = os.path.join(get_tests_output_path(), "train_outputs") + + +dataset_config1 = BaseDatasetConfig( + name="ljspeech", + meta_file_train="metadata.csv", + meta_file_val="metadata.csv", + path="tests/data/ljspeech", + language="en", +) + +dataset_config2 = BaseDatasetConfig( + name="ljspeech", + meta_file_train="metadata.csv", + meta_file_val="metadata.csv", + path="tests/data/ljspeech", + language="en2", +) + +config = VitsConfig( + batch_size=2, + eval_batch_size=2, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + use_espeak_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + ["Be a voice, not an echo.", "ljspeech-0", None, "en"], + ["Be a voice, not an echo.", "ljspeech-1", None, "en2"], + ], + datasets=[dataset_config1, dataset_config2], +) +# set audio config +config.audio.do_trim_silence = True +config.audio.trim_db = 60 + +# active multilingual mode +config.model_args.use_language_embedding = True +config.use_language_embedding = True + +# deactivate multispeaker mode +config.model_args.use_speaker_embedding = False +config.use_speaker_embedding = False + +# active multispeaker d-vec mode +config.model_args.use_d_vector_file = True +config.use_d_vector_file = True +config.model_args.d_vector_file = "tests/data/ljspeech/speakers.json" +config.d_vector_file = "tests/data/ljspeech/speakers.json" +config.model_args.d_vector_dim = 256 +config.d_vector_dim = 256 + +# duration predictor +config.model_args.use_sdp = True +config.use_sdp = True + +# deactivate language sampler +config.use_language_weighted_sampler = False + +config.save_json(config_path) + +# train the model for one epoch 
+command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.test_delay_epochs 0" +) +run_cli(command_train) + +# Find latest folder +continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + +# restore the model and continue training for one more epoch +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " +run_cli(command_train) +shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_vits_multilingual_train.py b/tests/tts_tests/test_vits_multilingual_train.py index a280e8c5..90f589d0 100644 --- a/tests/tts_tests/test_vits_multilingual_train.py +++ b/tests/tts_tests/test_vits_multilingual_train.py @@ -57,11 +57,15 @@ config.use_language_embedding = True # active multispeaker mode config.model_args.use_speaker_embedding = True config.use_speaker_embedding = True -# config.num_speakers=1 -# config.model_args.num_speakers=1 +# deactivate multispeaker d-vec mode config.model_args.use_d_vector_file = False config.use_d_vector_file = False + +# duration predictor +config.model_args.use_sdp = False +config.use_sdp = False + # active language sampler config.use_language_weighted_sampler = True From 9d8d4e6fb3048239eb812cb5667e08e8b8aa43d3 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Tue, 23 Nov 2021 14:50:19 +0100 Subject: [PATCH 068/220] Update docstring --- TTS/tts/models/vits.py | 18 +++++++++++++++++- TTS/tts/utils/synthesis.py | 6 ++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 6b1dd325..a9d00213 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -167,6 +167,20 @@ class VitsArgs(Coqpit): speaker_encoder_model_path (str): Path to the file speaker encoder checkpoint file, to use for SCL. Defaults to "". + freeze_encoder (bool): + Freeze the encoder weigths during training. Defaults to False. + + freeze_DP (bool): + Freeze the duration predictor weigths during training. Defaults to False. + + freeze_PE (bool): + Freeze the posterior encoder weigths during training. Defaults to False. + + freeze_flow_encoder (bool): + Freeze the flow encoder weigths during training. Defaults to False. + + freeze_waveform_decoder (bool): + Freeze the waveform decoder weigths during training. Defaults to False. """ num_chars: int = 100 @@ -555,7 +569,8 @@ class Vits(BaseTTS): x_lengths (torch.tensor): Batch of input character sequence lengths. y (torch.tensor): Batch of input spectrograms. y_lengths (torch.tensor): Batch of input spectrogram lengths. - aux_input (dict, optional): Auxiliary inputs for multi-speaker training. Defaults to {"d_vectors": None, "speaker_ids": None}. + aux_input (dict, optional): Auxiliary inputs for multi-speaker and multi-lingual training. + Defaults to {"d_vectors": None, "speaker_ids": None, "language_ids": None}. Returns: Dict: model outputs keyed by the output name. @@ -567,6 +582,7 @@ class Vits(BaseTTS): - y_lengths: :math:`[B]` - d_vectors: :math:`[B, C, 1]` - speaker_ids: :math:`[B]` + - language_ids: :math:`[B]` """ outputs = {} sid, g, lid = self._set_cond_input(aux_input) diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 102914c5..24b747be 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -249,6 +249,12 @@ def synthesis( d_vector (torch.Tensor): d-vector for multi-speaker models in share :math:`[1, D]`. 
Defaults to None. + language_id (int): + Language ID passed to the language embedding layer in multi-langual model. Defaults to None. + + language_name (str): + Language name corresponding to the language code used by the phonemizer. Defaults to None. + backend (str): tf or torch. Defaults to "torch". """ From 36ddf3297277964bfe602fb95b19206eb09f3df1 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Tue, 23 Nov 2021 15:24:03 +0100 Subject: [PATCH 069/220] Fix trailing whitespace --- TTS/tts/models/vits.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index a9d00213..ac0f5d69 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -569,7 +569,7 @@ class Vits(BaseTTS): x_lengths (torch.tensor): Batch of input character sequence lengths. y (torch.tensor): Batch of input spectrograms. y_lengths (torch.tensor): Batch of input spectrogram lengths. - aux_input (dict, optional): Auxiliary inputs for multi-speaker and multi-lingual training. + aux_input (dict, optional): Auxiliary inputs for multi-speaker and multi-lingual training. Defaults to {"d_vectors": None, "speaker_ids": None, "language_ids": None}. Returns: @@ -592,7 +592,7 @@ class Vits(BaseTTS): # language embedding lang_emb = None - if self.args.use_language_embedding and lid is not None: + if hasattr(self, "emb_l"): lang_emb = self.emb_l(lid).unsqueeze(-1) x, m_p, logs_p, x_mask = self.text_encoder(x, x_lengths, lang_emb=lang_emb) From 0359cab4fa81cd80f14fce624cda76d99ddcb6d7 Mon Sep 17 00:00:00 2001 From: Edresson Date: Tue, 23 Nov 2021 11:24:36 -0300 Subject: [PATCH 070/220] Remove torchaudio requeriment --- TTS/speaker_encoder/models/resnet.py | 26 ++++++++++++++++++++++---- TTS/tts/models/vits.py | 21 ++++++++++++--------- TTS/utils/audio.py | 12 +++++++++++- requirements.txt | 2 -- 4 files changed, 45 insertions(+), 16 deletions(-) diff --git a/TTS/speaker_encoder/models/resnet.py b/TTS/speaker_encoder/models/resnet.py index 47b6f23f..8f0a8809 100644 --- a/TTS/speaker_encoder/models/resnet.py +++ b/TTS/speaker_encoder/models/resnet.py @@ -1,7 +1,10 @@ import numpy as np import torch -import torchaudio -import torch.nn as nn +from torch import nn + +# import torchaudio + +from TTS.utils.audio import TorchSTFT from TTS.utils.io import load_fsspec @@ -110,14 +113,29 @@ class ResNetSpeakerEncoder(nn.Module): if self.use_torch_spec: self.torch_spec = torch.nn.Sequential( PreEmphasis(audio_config["preemphasis"]), - torchaudio.transforms.MelSpectrogram( + TorchSTFT( + n_fft=audio_config["fft_size"], + hop_length=audio_config["hop_length"], + win_length=audio_config["win_length"], + sample_rate=audio_config["sample_rate"], + window="hamming_window", + mel_fmin=0.0, + mel_fmax=None, + use_htk=True, + do_amp_to_db=False, + n_mels=audio_config["num_mels"], + power=2.0, + use_mel=True, + mel_norm=None + ), + '''torchaudio.transforms.MelSpectrogram( sample_rate=audio_config["sample_rate"], n_fft=audio_config["fft_size"], win_length=audio_config["win_length"], hop_length=audio_config["hop_length"], window_fn=torch.hamming_window, n_mels=audio_config["num_mels"], - ), + ),''' ) else: self.torch_spec = None diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index ac0f5d69..4eb12b3b 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -4,7 +4,7 @@ from itertools import chain from typing import Dict, List, Tuple import torch -import torchaudio +# import torchaudio from coqpit import Coqpit from torch import nn from 
torch.cuda.amp.autocast_mode import autocast @@ -395,7 +395,7 @@ class Vits(BaseTTS): if config.use_speaker_encoder_as_loss: if not config.speaker_encoder_model_path or not config.speaker_encoder_config_path: raise RuntimeError( - " [!] To use the speaker encoder loss you need to specify speaker_encoder_model_path and speaker_encoder_config_path !!" + " [!] To use the speaker consistency loss (SCL) you need to specify speaker_encoder_model_path and speaker_encoder_config_path !!" ) self.speaker_manager.init_speaker_encoder( config.speaker_encoder_model_path, config.speaker_encoder_config_path @@ -410,14 +410,17 @@ class Vits(BaseTTS): hasattr(self.speaker_encoder, "audio_config") and self.audio_config["sample_rate"] != self.speaker_encoder.audio_config["sample_rate"] ): - self.audio_transform = torchaudio.transforms.Resample( + raise RuntimeError( + " [!] To use the speaker consistency loss (SCL) you need to have the TTS model sampling rate ({}) equal to the speaker encoder sampling rate ({}) !".format(self.audio_config["sample_rate"], self.speaker_encoder.audio_config["sample_rate"]) + ) + '''self.audio_transform = torchaudio.transforms.Resample( orig_freq=self.audio_config["sample_rate"], new_freq=self.speaker_encoder.audio_config["sample_rate"], - ) - else: - self.audio_transform = None + ) + else: + self.audio_transform = None''' else: - self.audio_transform = None + # self.audio_transform = None self.speaker_encoder = None def _init_speaker_embedding(self, config): @@ -655,8 +658,8 @@ class Vits(BaseTTS): wavs_batch = torch.cat((wav_seg, o), dim=0).squeeze(1) # resample audio to speaker encoder sample_rate - if self.audio_transform is not None: - wavs_batch = self.audio_transform(wavs_batch) + '''if self.audio_transform is not None: + wavs_batch = self.audio_transform(wavs_batch)''' pred_embs = self.speaker_encoder.forward(wavs_batch, l2_norm=True) diff --git a/TTS/utils/audio.py b/TTS/utils/audio.py index e64b95e0..d650c288 100644 --- a/TTS/utils/audio.py +++ b/TTS/utils/audio.py @@ -32,6 +32,9 @@ class TorchSTFT(nn.Module): # pylint: disable=abstract-method use_mel=False, do_amp_to_db=False, spec_gain=1.0, + power=None, + use_htk=False, + mel_norm="slaney" ): super().__init__() self.n_fft = n_fft @@ -45,6 +48,9 @@ class TorchSTFT(nn.Module): # pylint: disable=abstract-method self.use_mel = use_mel self.do_amp_to_db = do_amp_to_db self.spec_gain = spec_gain + self.power = power + self.use_htk = use_htk + self.mel_norm = mel_norm self.window = nn.Parameter(getattr(torch, window)(win_length), requires_grad=False) self.mel_basis = None if use_mel: @@ -83,6 +89,10 @@ class TorchSTFT(nn.Module): # pylint: disable=abstract-method M = o[:, :, :, 0] P = o[:, :, :, 1] S = torch.sqrt(torch.clamp(M ** 2 + P ** 2, min=1e-8)) + + if self.power is not None: + S = S ** self.power + if self.use_mel: S = torch.matmul(self.mel_basis.to(x), S) if self.do_amp_to_db: @@ -91,7 +101,7 @@ class TorchSTFT(nn.Module): # pylint: disable=abstract-method def _build_mel_basis(self): mel_basis = librosa.filters.mel( - self.sample_rate, self.n_fft, n_mels=self.n_mels, fmin=self.mel_fmin, fmax=self.mel_fmax + self.sample_rate, self.n_fft, n_mels=self.n_mels, fmin=self.mel_fmin, fmax=self.mel_fmax, htk=self.use_htk, norm=self.mel_norm ) self.mel_basis = torch.from_numpy(mel_basis).float() diff --git a/requirements.txt b/requirements.txt index cf4798b2..3ec33ceb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -26,5 +26,3 @@ unidic-lite==1.0.8 gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=2.0.0 fsspec>=2021.04.0 pyworld 
-webrtcvad -torchaudio>=0.7 From 93dbb67c525224ebeff8ffff14c29d8fa90389f5 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Tue, 23 Nov 2021 15:40:03 +0100 Subject: [PATCH 071/220] Remove self.audio_config from VITS --- TTS/tts/models/vits.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 4eb12b3b..6acf2000 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -270,7 +270,6 @@ class Vits(BaseTTS): super().__init__(config) self.END2END = True - self.speaker_manager = speaker_manager self.audio_config = config["audio"] if config.__class__.__name__ == "VitsConfig": @@ -408,7 +407,7 @@ class Vits(BaseTTS): if ( hasattr(self.speaker_encoder, "audio_config") - and self.audio_config["sample_rate"] != self.speaker_encoder.audio_config["sample_rate"] + and self.config.audio["sample_rate"] != self.speaker_encoder.audio_config["sample_rate"] ): raise RuntimeError( " [!] To use the speaker consistency loss (SCL) you need to have the TTS model sampling rate ({}) equal to the speaker encoder sampling rate ({}) !".format(self.audio_config["sample_rate"], self.speaker_encoder.audio_config["sample_rate"]) From 79f75924de350bc2f7070a903aa5afe11798d72e Mon Sep 17 00:00:00 2001 From: Edresson Date: Tue, 23 Nov 2021 11:43:08 -0300 Subject: [PATCH 072/220] Fix pylint checks --- TTS/tts/models/vits.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 6acf2000..cc86e119 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -412,12 +412,14 @@ class Vits(BaseTTS): raise RuntimeError( " [!] To use the speaker consistency loss (SCL) you need to have the TTS model sampling rate ({}) equal to the speaker encoder sampling rate ({}) !".format(self.audio_config["sample_rate"], self.speaker_encoder.audio_config["sample_rate"]) ) - '''self.audio_transform = torchaudio.transforms.Resample( - orig_freq=self.audio_config["sample_rate"], - new_freq=self.speaker_encoder.audio_config["sample_rate"], - ) - else: - self.audio_transform = None''' + # pylint: disable=W0101,W0105 + """ self.audio_transform = torchaudio.transforms.Resample( + orig_freq=self.audio_config["sample_rate"], + new_freq=self.speaker_encoder.audio_config["sample_rate"], + ) + else: + self.audio_transform = None + """ else: # self.audio_transform = None self.speaker_encoder = None @@ -657,8 +659,9 @@ class Vits(BaseTTS): wavs_batch = torch.cat((wav_seg, o), dim=0).squeeze(1) # resample audio to speaker encoder sample_rate - '''if self.audio_transform is not None: - wavs_batch = self.audio_transform(wavs_batch)''' + # pylint: disable=W0105 + """if self.audio_transform is not None: + wavs_batch = self.audio_transform(wavs_batch)""" pred_embs = self.speaker_encoder.forward(wavs_batch, l2_norm=True) From 9c1bec86a4631adaeb3295fc4b779f62c8ba1fca Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Tue, 23 Nov 2021 16:00:38 +0100 Subject: [PATCH 073/220] Fix tests --- TTS/tts/models/vits.py | 2 +- .../test_vits_multilingual_train-d_vectors.py | 10 +++++----- tests/tts_tests/test_vits_multilingual_train.py | 10 +++++----- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index cc86e119..1b6d29d4 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -596,7 +596,7 @@ class Vits(BaseTTS): # language embedding lang_emb = None - if hasattr(self, "emb_l"): + if self.args.use_language_embedding and lid is not None: 
lang_emb = self.emb_l(lid).unsqueeze(-1) x, m_p, logs_p, x_mask = self.text_encoder(x, x_lengths, lang_emb=lang_emb) diff --git a/tests/tts_tests/test_vits_multilingual_train-d_vectors.py b/tests/tts_tests/test_vits_multilingual_train-d_vectors.py index f426e383..0e9827f1 100644 --- a/tests/tts_tests/test_vits_multilingual_train-d_vectors.py +++ b/tests/tts_tests/test_vits_multilingual_train-d_vectors.py @@ -10,7 +10,7 @@ config_path = os.path.join(get_tests_output_path(), "test_model_config.json") output_path = os.path.join(get_tests_output_path(), "train_outputs") -dataset_config1 = BaseDatasetConfig( +dataset_config_en = BaseDatasetConfig( name="ljspeech", meta_file_train="metadata.csv", meta_file_val="metadata.csv", @@ -18,12 +18,12 @@ dataset_config1 = BaseDatasetConfig( language="en", ) -dataset_config2 = BaseDatasetConfig( +dataset_config_pt = BaseDatasetConfig( name="ljspeech", meta_file_train="metadata.csv", meta_file_val="metadata.csv", path="tests/data/ljspeech", - language="en2", + language="pt-br", ) config = VitsConfig( @@ -43,9 +43,9 @@ config = VitsConfig( print_eval=True, test_sentences=[ ["Be a voice, not an echo.", "ljspeech-0", None, "en"], - ["Be a voice, not an echo.", "ljspeech-1", None, "en2"], + ["Be a voice, not an echo.", "ljspeech-1", None, "pt-br"], ], - datasets=[dataset_config1, dataset_config2], + datasets=[dataset_config_en, dataset_config_pt], ) # set audio config config.audio.do_trim_silence = True diff --git a/tests/tts_tests/test_vits_multilingual_train.py b/tests/tts_tests/test_vits_multilingual_train.py index 90f589d0..50cccca5 100644 --- a/tests/tts_tests/test_vits_multilingual_train.py +++ b/tests/tts_tests/test_vits_multilingual_train.py @@ -10,7 +10,7 @@ config_path = os.path.join(get_tests_output_path(), "test_model_config.json") output_path = os.path.join(get_tests_output_path(), "train_outputs") -dataset_config1 = BaseDatasetConfig( +dataset_config_en = BaseDatasetConfig( name="ljspeech", meta_file_train="metadata.csv", meta_file_val="metadata.csv", @@ -18,12 +18,12 @@ dataset_config1 = BaseDatasetConfig( language="en", ) -dataset_config2 = BaseDatasetConfig( +dataset_config_pt = BaseDatasetConfig( name="ljspeech", meta_file_train="metadata.csv", meta_file_val="metadata.csv", path="tests/data/ljspeech", - language="en2", + language="pt-br", ) config = VitsConfig( @@ -43,9 +43,9 @@ config = VitsConfig( print_eval=True, test_sentences=[ ["Be a voice, not an echo.", "ljspeech", None, "en"], - ["Be a voice, not an echo.", "ljspeech", None, "en2"], + ["Be a voice, not an echo.", "ljspeech", None, "pt-br"], ], - datasets=[dataset_config1, dataset_config2], + datasets=[dataset_config_en, dataset_config_pt], ) # set audio config config.audio.do_trim_silence = True From eff0a5ca106546c97acc8fec4981b0a5cc4f9733 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Tue, 23 Nov 2021 16:25:21 +0100 Subject: [PATCH 074/220] Fix merge bug --- TTS/tts/models/vits.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 1b6d29d4..8ccbce36 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -410,7 +410,8 @@ class Vits(BaseTTS): and self.config.audio["sample_rate"] != self.speaker_encoder.audio_config["sample_rate"] ): raise RuntimeError( - " [!] To use the speaker consistency loss (SCL) you need to have the TTS model sampling rate ({}) equal to the speaker encoder sampling rate ({}) !".format(self.audio_config["sample_rate"], self.speaker_encoder.audio_config["sample_rate"]) + ' [!] 
To use the speaker consistency loss (SCL) you need to have matching sample rates between the TTS model ({}) and the speaker encoder ({})!' + .format(self.config.audio["sample_rate"], self.speaker_encoder.audio_config["sample_rate"]) ) # pylint: disable=W0101,W0105 """ self.audio_transform = torchaudio.transforms.Resample( From 4a8c344fc820afb7e37bb4e19ef942934d10c960 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Tue, 23 Nov 2021 16:29:13 +0100 Subject: [PATCH 075/220] Make a multilingual test use chars --- tests/tts_tests/test_vits_multilingual_train-d_vectors.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/tts_tests/test_vits_multilingual_train-d_vectors.py b/tests/tts_tests/test_vits_multilingual_train-d_vectors.py index 0e9827f1..1ca57d93 100644 --- a/tests/tts_tests/test_vits_multilingual_train-d_vectors.py +++ b/tests/tts_tests/test_vits_multilingual_train-d_vectors.py @@ -32,9 +32,7 @@ config = VitsConfig( num_loader_workers=0, num_eval_loader_workers=0, text_cleaner="english_cleaners", - use_phonemes=True, - use_espeak_phonemes=True, - phoneme_language="en-us", + use_phonemes=False, phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", run_eval=True, test_delay_epochs=-1, From be8f4446362674ac3c830e8a86b66f4f7ab0c2a6 Mon Sep 17 00:00:00 2001 From: Edresson Date: Tue, 23 Nov 2021 13:37:14 -0300 Subject: [PATCH 076/220] Add the SCL resample TODO --- TTS/tts/models/vits.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 8ccbce36..c1c29980 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -409,6 +409,7 @@ class Vits(BaseTTS): hasattr(self.speaker_encoder, "audio_config") and self.config.audio["sample_rate"] != self.speaker_encoder.audio_config["sample_rate"] ): + # TODO: change this with torchaudio Resample raise RuntimeError( ' [!] To use the speaker consistency loss (SCL) you need to have matching sample rates between the TTS model ({}) and the speaker encoder ({})!' 
.format(self.config.audio["sample_rate"], self.speaker_encoder.audio_config["sample_rate"]) @@ -418,8 +419,8 @@ class Vits(BaseTTS): orig_freq=self.audio_config["sample_rate"], new_freq=self.speaker_encoder.audio_config["sample_rate"], ) - else: - self.audio_transform = None + else: + self.audio_transform = None """ else: # self.audio_transform = None From c382a3f6a9cfe627242ebd1064d89f9091573dd8 Mon Sep 17 00:00:00 2001 From: Edresson Date: Fri, 26 Nov 2021 17:14:29 -0300 Subject: [PATCH 077/220] Add find unique phonemes unit tests --- tests/aux_tests/test_find_unique_phonemes.py | 84 ++++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 tests/aux_tests/test_find_unique_phonemes.py diff --git a/tests/aux_tests/test_find_unique_phonemes.py b/tests/aux_tests/test_find_unique_phonemes.py new file mode 100644 index 00000000..33fad9ba --- /dev/null +++ b/tests/aux_tests/test_find_unique_phonemes.py @@ -0,0 +1,84 @@ +import os +import unittest + +import torch + +from tests import get_tests_output_path, run_cli +from TTS.config.shared_configs import BaseDatasetConfig +from TTS.tts.configs.vits_config import VitsConfig + +torch.manual_seed(1) + +config_path = os.path.join(get_tests_output_path(), "test_model_config.json") + +dataset_config_en = BaseDatasetConfig( + name="ljspeech", + meta_file_train="metadata.csv", + meta_file_val="metadata.csv", + path="tests/data/ljspeech", + language="en", +) + +dataset_config_pt = BaseDatasetConfig( + name="ljspeech", + meta_file_train="metadata.csv", + meta_file_val="metadata.csv", + path="tests/data/ljspeech", + language="pt-br", +) + +# pylint: disable=protected-access +class TestFindUniquePhonemes(unittest.TestCase): + @staticmethod + def test_espeak_phonemes(): + # prepare the config + config = VitsConfig( + batch_size=2, + eval_batch_size=2, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + use_espeak_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + datasets=[dataset_config_en, dataset_config_pt], + ) + config.save_json(config_path) + + # run test + run_cli( + f'CUDA_VISIBLE_DEVICES="" python TTS/bin/find_unique_phonemes.py --config_path "{config_path}"' + ) + + @staticmethod + def test_no_espeak_phonemes(): + # prepare the config + config = VitsConfig( + batch_size=2, + eval_batch_size=2, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + use_espeak_phonemes=False, + phoneme_language="en-us", + phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + datasets=[dataset_config_en, dataset_config_pt], + ) + config.save_json(config_path) + + # run test + run_cli( + f'CUDA_VISIBLE_DEVICES="" python TTS/bin/find_unique_phonemes.py --config_path "{config_path}"' + ) From d79531c1770bd9301a84de0cf32a7dfb91c2cfa5 Mon Sep 17 00:00:00 2001 From: Edresson Date: Fri, 26 Nov 2021 17:14:58 -0300 Subject: [PATCH 078/220] Add remove silence vad script Unit test --- .../test_remove_silence_vad_script.py | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 tests/aux_tests/test_remove_silence_vad_script.py diff --git a/tests/aux_tests/test_remove_silence_vad_script.py b/tests/aux_tests/test_remove_silence_vad_script.py new file mode 100644 index 00000000..c934e065 --- /dev/null +++ 
b/tests/aux_tests/test_remove_silence_vad_script.py @@ -0,0 +1,29 @@ +import os +import unittest + +import torch + +from tests import get_tests_input_path, get_tests_output_path, run_cli + +torch.manual_seed(1) + +# pylint: disable=protected-access +class TestRemoveSilenceVAD(unittest.TestCase): + @staticmethod + def test(): + # set paths + wav_path = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs") + output_path = os.path.join(get_tests_output_path(), "output_wavs_removed_silence/") + output_resample_path = os.path.join(get_tests_output_path(), "output_ljspeech_16khz/") + + # resample audios + run_cli( + f'CUDA_VISIBLE_DEVICES="" python TTS/bin/resample.py --input_dir "{wav_path}" --output_dir "{output_resample_path}" --output_sr 16000' + ) + + # run test + run_cli( + f'CUDA_VISIBLE_DEVICES="" python TTS/bin/remove_silence_using_vad.py --input_dir "{output_resample_path}" --output_dir "{output_path}"' + ) + run_cli(f'rm -rf "{output_resample_path}"') + run_cli(f'rm -rf "{output_path}"') From b909a3b63eec7d08701726a2c9a740ea1b8afccb Mon Sep 17 00:00:00 2001 From: Edresson Date: Fri, 26 Nov 2021 17:42:26 -0300 Subject: [PATCH 079/220] Add Docstring for TorchSTFT --- TTS/utils/audio.py | 56 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/TTS/utils/audio.py b/TTS/utils/audio.py index d650c288..10c9ec7e 100644 --- a/TTS/utils/audio.py +++ b/TTS/utils/audio.py @@ -16,6 +16,60 @@ class TorchSTFT(nn.Module): # pylint: disable=abstract-method """Some of the audio processing funtions using Torch for faster batch processing. TODO: Merge this with audio.py + + Args: + + n_fft (int): + FFT window size for STFT. + + hop_length (int): + number of frames between STFT columns. + + win_length (int, optional): + STFT window length. + + pad_wav (bool, optional): + If True pad the audio with (n_fft - hop_length) / 2). Defaults to False. + + window (str, optional): + The name of a function to create a window tensor that is applied/multiplied to each frame/window. Defaults to "hann_window" + + sample_rate (int, optional): + target audio sampling rate. Defaults to None. + + mel_fmin (int, optional): + minimum filter frequency for computing melspectrograms. Defaults to None. + + mel_fmax (int, optional): + maximum filter frequency for computing melspectrograms. Defaults to None. + + n_mels (int, optional): + number of melspectrogram dimensions. Defaults to None. + + use_mel (bool, optional): + If True compute the melspectrograms otherwise. Defaults to False. + + do_amp_to_db_linear (bool, optional): + enable/disable amplitude to dB conversion of linear spectrograms. Defaults to False. + + spec_gain (float, optional): + gain applied when converting amplitude to DB. Defaults to 1.0. + + power (float, optional): + Exponent for the magnitude spectrogram, e.g., 1 for energy, 2 for power, etc. Defaults to None. + + use_htk (bool, optional): + Use HTK formula in mel filter instead of Slaney. + + mel_norm (None, 'slaney', or number, optional): + If 'slaney', divide the triangular mel weights by the width of the mel band + (area normalization). + + If numeric, use `librosa.util.normalize` to normalize each filter by to unit l_p norm. + See `librosa.util.normalize` for a full description of supported norm values + (including `+-np.inf`). + + Otherwise, leave all the triangles aiming for a peak value of 1.0. Defaults to "slaney". """ def __init__( @@ -177,7 +231,7 @@ class AudioProcessor(object): minimum filter frequency for computing melspectrograms. 
Defaults to None. mel_fmax (int, optional): - maximum filter frequency for computing melspectrograms.. Defaults to None. + maximum filter frequency for computing melspectrograms. Defaults to None. spec_gain (int, optional): gain applied when converting amplitude to DB. Defaults to 20. From 352b4be1049c876a4a501c9f7326f8a3411f6820 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Sat, 27 Nov 2021 22:55:21 +0100 Subject: [PATCH 080/220] Move multilingual logic out of the trainer --- TTS/bin/train_tts.py | 12 +++++- TTS/trainer.py | 16 -------- TTS/tts/configs/vits_config.py | 8 ++++ TTS/tts/models/__init__.py | 11 ++++- TTS/tts/models/base_tts.py | 13 +++++- TTS/tts/models/vits.py | 48 ++++++++-------------- TTS/tts/utils/languages.py | 73 ++++++++++++---------------------- 7 files changed, 82 insertions(+), 99 deletions(-) diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index a543a947..5330649a 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -5,6 +5,7 @@ from TTS.trainer import Trainer, TrainingArgs from TTS.tts.datasets import load_tts_samples from TTS.tts.models import setup_model from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.languages import LanguageManager from TTS.utils.audio import AudioProcessor @@ -60,8 +61,17 @@ def main(): else: speaker_manager = None + if hasattr(config, "use_language_embedding") and config.use_language_embedding: + language_manager = LanguageManager(config=config) + if hasattr(config, "model_args"): + config.model_args.num_languages = language_manager.num_languages + else: + config.num_languages = language_manager.num_languages + else: + language_manager = None + # init the model from config - model = setup_model(config, speaker_manager) + model = setup_model(config, speaker_manager, language_manager) # init the trainer and 🚀 trainer = Trainer( diff --git a/TTS/trainer.py b/TTS/trainer.py index b9026c8e..7bffb386 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -260,22 +260,6 @@ class Trainer: else: self.run_get_model(self.config, get_model) - if hasattr(self.model, "init_multilingual"): - self.model.init_multilingual(self.config, self.train_samples + self.eval_samples) - config = self.config.model_args if hasattr(self.config, "model_args") else self.config - # save speakers json - if config.use_language_embedding and self.model.language_manager.num_languages > 1: - self.model.language_manager.save_language_ids_to_file( - os.path.join(self.output_path, "language_ids.json") - ) - if hasattr(self.config, "model_args"): - self.config.model_args["num_languages"] = self.model.language_manager.num_languages - else: - self.config.num_languages = self.model.language_manager.num_languages - - # update config file - copy_model_files(self.config, self.output_path) - # setup criterion self.criterion = self.get_criterion(self.model) diff --git a/TTS/tts/configs/vits_config.py b/TTS/tts/configs/vits_config.py index 178992a7..32a69bca 100644 --- a/TTS/tts/configs/vits_config.py +++ b/TTS/tts/configs/vits_config.py @@ -85,6 +85,12 @@ class VitsConfig(BaseTTSConfig): test_sentences (List[List]): List of sentences with speaker and language information to be used for testing. + language_ids_file (str): + Path to the language ids file. + + use_language_embedding (bool): + If true, language embedding is used. Defaults to `False`. + Note: Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the inherited parameters. 
@@ -147,6 +153,8 @@ class VitsConfig(BaseTTSConfig): use_speaker_embedding: bool = False speakers_file: str = None speaker_embedding_channels: int = 256 + language_ids_file: str = None + use_language_embedding: bool = False # use d-vectors use_d_vector_file: bool = False diff --git a/TTS/tts/models/__init__.py b/TTS/tts/models/__init__.py index 780f22cd..acd89110 100644 --- a/TTS/tts/models/__init__.py +++ b/TTS/tts/models/__init__.py @@ -2,7 +2,11 @@ from TTS.tts.utils.text.symbols import make_symbols, parse_symbols from TTS.utils.generic_utils import find_module -def setup_model(config, speaker_manager: "SpeakerManager" = None): +def setup_model( + config, + speaker_manager: "SpeakerManager" = None, + language_manager: "LanguageManager" = None + ): print(" > Using model: {}".format(config.model)) # fetch the right model implementation. if "base_model" in config and config["base_model"] is not None: @@ -31,7 +35,10 @@ def setup_model(config, speaker_manager: "SpeakerManager" = None): config.model_params.num_chars = num_chars if "model_args" in config: config.model_args.num_chars = num_chars - model = MyModel(config, speaker_manager=speaker_manager) + if config.model.lower() in ["vits"]: # If model supports multiple languages + model = MyModel(config, speaker_manager=speaker_manager, language_manager=language_manager) + else: + model = MyModel(config, speaker_manager=speaker_manager) return model diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index 707fc9c3..14bc9180 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -419,8 +419,7 @@ class BaseTTS(BaseModel): return test_figures, test_audios def on_init_start(self, trainer): - """Save the speaker.json at the beginning of the training. And update the config.json with the - speakers.json file path.""" + """Save the speaker.json and language_ids.json at the beginning of the training. 
Also update both paths.""" if self.speaker_manager is not None: output_path = os.path.join(trainer.output_path, "speakers.json") self.speaker_manager.save_speaker_ids_to_file(output_path) @@ -431,3 +430,13 @@ class BaseTTS(BaseModel): trainer.config.save_json(os.path.join(trainer.output_path, "config.json")) print(f" > `speakers.json` is saved to {output_path}.") print(" > `speakers_file` is updated in the config.json.") + + if hasattr(self, "language_manager") and self.language_manager is not None: + output_path = os.path.join(trainer.output_path, "language_ids.json") + self.language_manager.save_language_ids_to_file(output_path) + trainer.config.language_ids_file = output_path + if hasattr(trainer.config, "model_args"): + trainer.config.model_args.language_ids_file = output_path + trainer.config.save_json(os.path.join(trainer.output_path, "config.json")) + print(f" > `language_ids.json` is saved to {output_path}.") + print(" > `language_ids_file` is updated in the config.json.") diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index c1c29980..ca110eb0 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -16,8 +16,8 @@ from TTS.tts.layers.vits.networks import PosteriorEncoder, ResidualCouplingBlock from TTS.tts.layers.vits.stochastic_duration_predictor import StochasticDurationPredictor from TTS.tts.models.base_tts import BaseTTS from TTS.tts.utils.helpers import generate_path, maximum_path, rand_segments, segment, sequence_mask -from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.languages import LanguageManager +from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.visual import plot_alignment from TTS.utils.trainer_utils import get_optimizer, get_scheduler @@ -158,6 +158,9 @@ class VitsArgs(Coqpit): num_languages (int): Number of languages for the language embedding layer. Defaults to 0. + language_ids_file (str): + Path to the language mapping file for the Language Manager. Defaults to None. + use_speaker_encoder_as_loss (bool): Enable/Disable Speaker Consistency Loss (SCL). Defaults to False. @@ -225,6 +228,7 @@ class VitsArgs(Coqpit): use_language_embedding: bool = False embedded_language_dim: int = 4 num_languages: int = 0 + language_ids_file: str = None use_speaker_encoder_as_loss: bool = False speaker_encoder_config_path: str = "" speaker_encoder_model_path: str = "" @@ -265,13 +269,18 @@ class Vits(BaseTTS): # pylint: disable=dangerous-default-value - def __init__(self, config: Coqpit, speaker_manager: SpeakerManager = None): + def __init__( + self, + config: Coqpit, + speaker_manager: SpeakerManager = None, + language_manager: LanguageManager = None, + ): super().__init__(config) self.END2END = True self.speaker_manager = speaker_manager - self.audio_config = config["audio"] + self.language_manager = language_manager if config.__class__.__name__ == "VitsConfig": # loading from VitsConfig if "num_chars" not in config: @@ -443,43 +452,20 @@ class Vits(BaseTTS): self.speaker_manager = SpeakerManager(d_vectors_file_path=config.d_vector_file) self.embedded_speaker_dim = config.d_vector_dim - if config.use_speaker_encoder_as_loss: - if not config.speaker_encoder_model_path or not config.speaker_encoder_config_path: - raise RuntimeError(" [!] 
To use the speaker encoder loss you need to specify speaker_encoder_model_path and speaker_encoder_config_path !!") - self.speaker_manager.init_speaker_encoder(config.speaker_encoder_model_path, config.speaker_encoder_config_path) - self.speaker_encoder = self.speaker_manager.speaker_encoder.train() - for param in self.speaker_encoder.parameters(): - param.requires_grad = False - - print(" > External Speaker Encoder Loaded !!") - - if hasattr(self.speaker_encoder, "audio_config") and self.audio_config["sample_rate"] != self.speaker_encoder.audio_config["sample_rate"]: - self.audio_transform = torchaudio.transforms.Resample(orig_freq=self.audio_config["sample_rate"], new_freq=self.speaker_encoder.audio_config["sample_rate"]) - else: - self.audio_transform = None - else: - self.audio_transform = None - self.speaker_encoder = None - - def init_multilingual(self, config: Coqpit, data: List = None): + def init_multilingual(self, config: Coqpit): """Initialize multilingual modules of a model. Args: config (Coqpit): Model configuration. - data (List, optional): Dataset items to infer number of speakers. Defaults to None. """ if hasattr(config, "model_args"): config = config.model_args - # init language manager - self.language_manager = LanguageManager(config, data=data) - # init language embedding layer - if config.use_language_embedding: - if config.num_languages > 0 and self.language_manager.num_languages == 0: - self.num_languages = config.num_languages - else: - self.num_languages = self.language_manager.num_languages + if config.language_ids_file is not None: + self.language_manager = LanguageManager(language_ids_file_path=config.language_ids_file) + if config.use_language_embedding and self.language_manager: + self.num_languages = self.language_manager.num_languages self.embedded_language_dim = config.embedded_language_dim self.emb_l = nn.Embedding(self.num_languages, self.embedded_language_dim) torch.nn.init.xavier_uniform_(self.emb_l.weight) diff --git a/TTS/tts/utils/languages.py b/TTS/tts/utils/languages.py index 5bacc259..451b10f9 100644 --- a/TTS/tts/utils/languages.py +++ b/TTS/tts/utils/languages.py @@ -1,6 +1,6 @@ import json import os -from typing import Dict, List, Tuple +from typing import Dict, List import fsspec import numpy as np @@ -14,11 +14,13 @@ class LanguageManager: in a way that can be queried by language. Args: - language_id_file_path (str, optional): Path to the metafile that maps language names to ids used by + language_ids_file_path (str, optional): Path to the metafile that maps language names to ids used by TTS models. Defaults to "". + config (Coqpit, optional): Coqpit config that contains the language information in the datasets filed. + Defaults to None. 
Examples: - >>> manager = LanguageManager(language_id_file_path=language_id_file_path) + >>> manager = LanguageManager(language_ids_file_path=language_ids_file_path) >>> language_id_mapper = manager.language_ids """ @@ -26,10 +28,14 @@ class LanguageManager: def __init__( self, - language_id_file_path: str = "", + language_ids_file_path: str = "", + config: Coqpit = None, ): - if language_id_file_path: - self.set_language_ids_from_file(language_id_file_path) + if language_ids_file_path: + self.set_language_ids_from_file(language_ids_file_path) + + if config: + self.set_language_ids_from_config(config) @staticmethod def _load_json(json_file_path: str) -> Dict: @@ -50,27 +56,30 @@ class LanguageManager: return list(self.language_id_mapping.keys()) @staticmethod - def parse_languages_from_data(items: list) -> Tuple[Dict, int]: - """Parse language IDs from data samples retured by `load_meta_data()`. + def parse_language_ids_from_config(c: Coqpit) -> Dict: + """Set language id from config. Args: - items (list): Data sampled returned by `load_meta_data()`. + c (Coqpit): Config Returns: - Tuple[Dict, int]: language IDs and number of languages. + Tuple[Dict, int]: Language ID mapping and the number of languages. """ - languages = sorted({item[3] for item in items}) - language_ids = {name: i for i, name in enumerate(languages)} - num_languages = len(language_ids) - return language_ids, num_languages + languages = set({}) + for dataset in c.datasets: + if "language" in dataset: + languages.add(dataset["language"]) + else: + raise ValueError(f"Dataset {dataset['name']} has no language specified.") + return {name: i for i, name in enumerate(sorted(list(languages)))} - def set_language_ids_from_data(self, items: List) -> None: - """Set language IDs from data samples. + def set_language_ids_from_config(self, c: Coqpit) -> None: + """Set language IDs from config samples. Args: items (List): Data sampled returned by `load_meta_data()`. """ - self.language_id_mapping, _ = self.parse_languages_from_data(items) + self.language_id_mapping = self.parse_language_ids_from_config(c) def set_language_ids_from_file(self, file_path: str) -> None: """Load language ids from a json file. @@ -102,36 +111,6 @@ def _set_file_path(path): return None -def get_language_manager(c: Coqpit, data: List = None, restore_path: str = None) -> LanguageManager: - """Initiate a `LanguageManager` instance by the provided config. - - Args: - c (Coqpit): Model configuration. - restore_path (str): Path to a previous training folder. - data (List): Data sampled returned by `load_meta_data()`. Defaults to None. - out_path (str, optional): Save the generated language IDs to a output path. Defaults to None. - - Returns: - SpeakerManager: initialized and ready to use instance. - """ - language_manager = LanguageManager() - if c.use_language_embedding: - if data is not None: - language_manager.set_language_ids_from_data(data) - if restore_path: - language_file = _set_file_path(restore_path) - # restoring language manager from a previous run. 
- if language_file: - language_manager.set_language_ids_from_file(language_file) - if language_manager.num_languages > 0: - print( - " > Language manager is loaded with {} languages: {}".format( - language_manager.num_languages, ", ".join(language_manager.language_names) - ) - ) - return language_manager - - def get_language_weighted_sampler(items: list): language_names = np.array([item[3] for item in items]) unique_language_names = np.unique(language_names).tolist() From 4001322e50ac06f0dd6933b43854e69fdf6de1ec Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Sat, 27 Nov 2021 23:41:55 +0100 Subject: [PATCH 081/220] Fix trailing space --- TTS/tts/configs/vits_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/tts/configs/vits_config.py b/TTS/tts/configs/vits_config.py index 32a69bca..36c948af 100644 --- a/TTS/tts/configs/vits_config.py +++ b/TTS/tts/configs/vits_config.py @@ -87,7 +87,7 @@ class VitsConfig(BaseTTSConfig): language_ids_file (str): Path to the language ids file. - + use_language_embedding (bool): If true, language embedding is used. Defaults to `False`. From 0f64d45e04f254945d64886cd0e87daa20d862bd Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Sun, 28 Nov 2021 00:23:55 +0100 Subject: [PATCH 082/220] Revert init multispeaker change --- TTS/tts/models/base_tts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index 14bc9180..f1fdbd33 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -102,7 +102,7 @@ class BaseTTS(BaseModel): config.d_vector_dim if "d_vector_dim" in config and config.d_vector_dim is not None else 512 ) # init speaker embedding layer - if config.use_speaker_embedding: + if config.use_speaker_embedding and not config.use_d_vector_file:: print(" > Init speaker_embedding layer.") self.speaker_embedding = nn.Embedding(self.num_speakers, self.embedded_speaker_dim) self.speaker_embedding.weight.data.normal_(0, 0.3) From 3f3505c1cafa9c906d29451dc817d8a271e36f12 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Sun, 28 Nov 2021 00:48:53 +0100 Subject: [PATCH 083/220] Prevent weighted sampler use when num_gpus > 1 --- TTS/tts/models/base_tts.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index f1fdbd33..1f92bfc7 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -102,7 +102,7 @@ class BaseTTS(BaseModel): config.d_vector_dim if "d_vector_dim" in config and config.d_vector_dim is not None else 512 ) # init speaker embedding layer - if config.use_speaker_embedding and not config.use_d_vector_file:: + if config.use_speaker_embedding and not config.use_d_vector_file: print(" > Init speaker_embedding layer.") self.speaker_embedding = nn.Embedding(self.num_speakers, self.embedded_speaker_dim) self.speaker_embedding.weight.data.normal_(0, 0.3) @@ -337,8 +337,15 @@ class BaseTTS(BaseModel): if config.compute_f0: dataset.pitch_extractor.load_pitch_stats(config.get("f0_cache_path", None)) + + # sampler for DDP sampler = DistributedSampler(dataset) if num_gpus > 1 else None + + # Weighted samplers + assert not (num_gpus > 1 and getattr(config, "use_language_weighted_sampler", False)), "language_weighted_sampler is not supported with DistributedSampler" + assert not (num_gpus > 1 and getattr(config, "use_speaker_weighted_sampler", False)), "speaker_weighted_sampler is not supported with DistributedSampler" + if sampler is None: if 
getattr(config, "use_language_weighted_sampler", False): print(" > Using Language weighted sampler") From 22c7be5f442320435c511774da9cf392784c53b2 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Wed, 1 Dec 2021 23:36:29 +0100 Subject: [PATCH 084/220] Add test for language_weighted_sampler --- tests/data_tests/test_samplers.py | 53 +++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 tests/data_tests/test_samplers.py diff --git a/tests/data_tests/test_samplers.py b/tests/data_tests/test_samplers.py new file mode 100644 index 00000000..11e9082f --- /dev/null +++ b/tests/data_tests/test_samplers.py @@ -0,0 +1,53 @@ +from torch.utils.data import RandomSampler +from TTS.tts.datasets import load_tts_samples +from TTS.config.shared_configs import BaseDatasetConfig +from TTS.tts.utils.languages import get_language_weighted_sampler + +import functools + +dataset_config_en = BaseDatasetConfig( + name="ljspeech", + meta_file_train="metadata.csv", + meta_file_val="metadata.csv", + path="tests/data/ljspeech", + language="en", +) + +dataset_config_pt = BaseDatasetConfig( + name="ljspeech", + meta_file_train="metadata.csv", + meta_file_val="metadata.csv", + path="tests/data/ljspeech", + language="pt-br", +) + +# Adding the EN samples twice to create an unbalanced dataset +train_samples, eval_samples = load_tts_samples( + [dataset_config_en, dataset_config_en, dataset_config_pt], + eval_split=True +) + +def is_balanced(lang_1, lang_2): + return 0.9 < lang_1/lang_2 < 1.1 + +random_sampler = RandomSampler(train_samples) +ids = functools.reduce(lambda a, b: a + b, [list(random_sampler) for i in range(100)]) +en, pt = 0, 0 +for id in ids: + if train_samples[id][3] == 'en': + en += 1 + else: + pt += 1 + +assert not is_balanced(en, pt), "Random sampler is supposed to be unbalanced" + +weighted_sampler = get_language_weighted_sampler(train_samples) +ids = functools.reduce(lambda a, b: a + b, [list(weighted_sampler) for i in range(100)]) +en, pt = 0, 0 +for id in ids: + if train_samples[id][3] == 'en': + en += 1 + else: + pt += 1 + +assert is_balanced(en, pt), "Weighted sampler is supposed to be balanced" \ No newline at end of file From 7b81c1643402e85fec2950703ac43365279f963f Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Wed, 1 Dec 2021 23:48:38 +0100 Subject: [PATCH 085/220] Fix seed in test_samplers to avoid random fails --- tests/data_tests/test_samplers.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/tests/data_tests/test_samplers.py b/tests/data_tests/test_samplers.py index 11e9082f..5e4e4151 100644 --- a/tests/data_tests/test_samplers.py +++ b/tests/data_tests/test_samplers.py @@ -1,10 +1,12 @@ -from torch.utils.data import RandomSampler from TTS.tts.datasets import load_tts_samples from TTS.config.shared_configs import BaseDatasetConfig from TTS.tts.utils.languages import get_language_weighted_sampler - +import torch import functools +# Fixing random state to avoid random fails +torch.manual_seed(0) + dataset_config_en = BaseDatasetConfig( name="ljspeech", meta_file_train="metadata.csv", @@ -23,18 +25,18 @@ dataset_config_pt = BaseDatasetConfig( # Adding the EN samples twice to create an unbalanced dataset train_samples, eval_samples = load_tts_samples( - [dataset_config_en, dataset_config_en, dataset_config_pt], + [dataset_config_en, dataset_config_en, dataset_config_pt], eval_split=True ) def is_balanced(lang_1, lang_2): - return 0.9 < lang_1/lang_2 < 1.1 + return 0.85 < lang_1/lang_2 < 1.2 -random_sampler = 
RandomSampler(train_samples) +random_sampler = torch.utils.data.RandomSampler(train_samples) ids = functools.reduce(lambda a, b: a + b, [list(random_sampler) for i in range(100)]) en, pt = 0, 0 -for id in ids: - if train_samples[id][3] == 'en': +for index in ids: + if train_samples[index][3] == 'en': en += 1 else: pt += 1 @@ -44,10 +46,10 @@ assert not is_balanced(en, pt), "Random sampler is supposed to be unbalanced" weighted_sampler = get_language_weighted_sampler(train_samples) ids = functools.reduce(lambda a, b: a + b, [list(weighted_sampler) for i in range(100)]) en, pt = 0, 0 -for id in ids: - if train_samples[id][3] == 'en': +for index in ids: + if train_samples[index][3] == 'en': en += 1 else: pt += 1 -assert is_balanced(en, pt), "Weighted sampler is supposed to be balanced" \ No newline at end of file +assert is_balanced(en, pt), "Weighted sampler is supposed to be balanced" From 4706583452bba8b5b79dad5266d4739d608e2a02 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Wed, 8 Dec 2021 19:34:36 +0100 Subject: [PATCH 086/220] Add support for multi-lingual models in CLI --- TTS/bin/synthesize.py | 30 +++++++++++++++++++-- TTS/tts/utils/languages.py | 1 + TTS/utils/synthesizer.py | 54 ++++++++++++++++++++++++++++++++++++-- 3 files changed, 81 insertions(+), 4 deletions(-) diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index fb2e41b4..07a9adb8 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -148,12 +148,19 @@ def main(): # args for multi-speaker synthesis parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None) + parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None) parser.add_argument( "--speaker_idx", type=str, help="Target speaker ID for a multi-speaker TTS model.", default=None, ) + parser.add_argument( + "--language_idx", + type=str, + help="Target language ID for a multi-lingual TTS model.", + default=None, + ) parser.add_argument( "--speaker_wav", nargs="+", @@ -169,6 +176,14 @@ def main(): const=True, default=False, ) + parser.add_argument( + "--list_language_idxs", + help="List available language ids for the defined multi-lingual model.", + type=str2bool, + nargs="?", + const=True, + default=False, + ) # aux args parser.add_argument( "--save_spectogram", @@ -180,7 +195,7 @@ def main(): args = parser.parse_args() # print the description if either text or list_models is not set - if args.text is None and not args.list_models and not args.list_speaker_idxs: + if args.text is None and not args.list_models and not args.list_speaker_idxs and not args.list_language_idxs: parser.parse_args(["-h"]) # load model manager @@ -190,6 +205,7 @@ def main(): model_path = None config_path = None speakers_file_path = None + language_ids_file_path = None vocoder_path = None vocoder_config_path = None encoder_path = None @@ -213,6 +229,7 @@ def main(): model_path = args.model_path config_path = args.config_path speakers_file_path = args.speakers_file_path + language_ids_file_path = args.language_ids_file_path if args.vocoder_path is not None: vocoder_path = args.vocoder_path @@ -227,6 +244,7 @@ def main(): model_path, config_path, speakers_file_path, + language_ids_file_path, vocoder_path, vocoder_config_path, encoder_path, @@ -242,6 +260,14 @@ def main(): print(synthesizer.tts_model.speaker_manager.speaker_ids) return + # query langauge ids of a multi-lingual model. 
+ if args.list_language_idxs: + print( + " > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model." + ) + print(synthesizer.tts_model.language_manager.language_id_mapping) + return + # check the arguments against a multi-speaker model. if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav): print( @@ -254,7 +280,7 @@ def main(): print(" > Text: {}".format(args.text)) # kick it - wav = synthesizer.tts(args.text, args.speaker_idx, args.speaker_wav, args.gst_style) + wav = synthesizer.tts(args.text, args.speaker_idx, args.language_idx, args.speaker_wav) # save the results print(" > Saving output to {}".format(args.out_path)) diff --git a/TTS/tts/utils/languages.py b/TTS/tts/utils/languages.py index 451b10f9..fc7eec57 100644 --- a/TTS/tts/utils/languages.py +++ b/TTS/tts/utils/languages.py @@ -31,6 +31,7 @@ class LanguageManager: language_ids_file_path: str = "", config: Coqpit = None, ): + self.language_id_mapping = {} if language_ids_file_path: self.set_language_ids_from_file(language_ids_file_path) diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 043c4982..ea8ce6d1 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -8,6 +8,7 @@ import torch from TTS.config import load_config from TTS.tts.models import setup_model as setup_tts_model from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.languages import LanguageManager # pylint: disable=unused-wildcard-import # pylint: disable=wildcard-import @@ -23,6 +24,7 @@ class Synthesizer(object): tts_checkpoint: str, tts_config_path: str, tts_speakers_file: str = "", + tts_languages_file: str = "", vocoder_checkpoint: str = "", vocoder_config: str = "", encoder_checkpoint: str = "", @@ -52,6 +54,7 @@ class Synthesizer(object): self.tts_checkpoint = tts_checkpoint self.tts_config_path = tts_config_path self.tts_speakers_file = tts_speakers_file + self.tts_languages_file = tts_languages_file self.vocoder_checkpoint = vocoder_checkpoint self.vocoder_config = vocoder_config self.encoder_checkpoint = encoder_checkpoint @@ -63,6 +66,9 @@ class Synthesizer(object): self.speaker_manager = None self.num_speakers = 0 self.tts_speakers = {} + self.language_manager = None + self.num_languages = 0 + self.tts_languages = {} self.d_vector_dim = 0 self.seg = self._get_segmenter("en") self.use_cuda = use_cuda @@ -110,8 +116,13 @@ class Synthesizer(object): self.ap = AudioProcessor(verbose=False, **self.tts_config.audio) speaker_manager = self._init_speaker_manager() + language_manager = self._init_language_manager() - self.tts_model = setup_tts_model(config=self.tts_config, speaker_manager=speaker_manager) + self.tts_model = setup_tts_model( + config=self.tts_config, + speaker_manager=speaker_manager, + language_manager=language_manager, + ) self.tts_model.load_checkpoint(self.tts_config, tts_checkpoint, eval=True) if use_cuda: self.tts_model.cuda() @@ -133,6 +144,17 @@ class Synthesizer(object): speaker_manager = SpeakerManager(d_vectors_file_path=self.tts_config.d_vector_file) return speaker_manager + def _init_language_manager(self): + """Initialize the LanguageManager""" + # setup if multi-lingual settings are in the global model config + language_manager = None + if hasattr(self.tts_config, "use_language_embedding") and self.tts_config.use_language_embedding is True: + if self.tts_languages_file: + language_manager = LanguageManager(language_ids_file_path=self.tts_languages_file) + elif self.tts_config.get("language_ids_file", 
None): + language_manager = LanguageManager(language_ids_file_path=self.tts_config.language_ids_file) + return language_manager + def _load_vocoder(self, model_file: str, model_config: str, use_cuda: bool) -> None: """Load the vocoder model. @@ -174,12 +196,20 @@ class Synthesizer(object): wav = np.array(wav) self.ap.save_wav(wav, path, self.output_sample_rate) - def tts(self, text: str, speaker_idx: str = "", speaker_wav=None, style_wav=None) -> List[int]: + def tts( + self, + text: str, + speaker_idx: str = "", + language_idx: str = "", + speaker_wav=None, + style_wav=None + ) -> List[int]: """🐸 TTS magic. Run all the models and generate speech. Args: text (str): input text. speaker_idx (str, optional): spekaer id for multi-speaker models. Defaults to "". + language_idx (str, optional): language id for multi-language models. Defaults to "". speaker_wav (): style_wav ([type], optional): style waveform for GST. Defaults to None. @@ -219,6 +249,24 @@ class Synthesizer(object): "Define path for speaker.json if it is a multi-speaker model or remove defined speaker idx. " ) + # handle multi-lingaul + language_id = None + if self.tts_languages_file or hasattr(self.tts_model.language_manager, "language_id_mapping"): + if language_idx and isinstance(language_idx, str): + language_id = self.tts_model.language_manager.language_id_mapping[language_idx] + + elif not language_idx: + raise ValueError( + " [!] Look like you use a multi-lingual model. " + "You need to define either a `language_idx` or a `style_wav` to use a multi-lingual model." + ) + + else: + raise ValueError( + f" [!] Missing language_ids.json file path for selecting language {language_idx}." + "Define path for language_ids.json if it is a multi-lingual model or remove defined language idx. " + ) + # compute a new d_vector from the given clip. 
if speaker_wav is not None: speaker_embedding = self.tts_model.speaker_manager.compute_d_vector_from_clip(speaker_wav) @@ -234,6 +282,8 @@ class Synthesizer(object): use_cuda=self.use_cuda, ap=self.ap, speaker_id=speaker_id, + language_id=language_id, + language_name=language_idx, style_wav=style_wav, enable_eos_bos_chars=self.tts_config.enable_eos_bos_chars, use_griffin_lim=use_gl, From 846b4a14c648af31e96b843167f2c91783385b43 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Wed, 8 Dec 2021 19:42:45 +0100 Subject: [PATCH 087/220] Add recipe for multi-lingual VITS --- .../multilingual/vits_tts/train_vits_tts.py | 117 ++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 recipes/multilingual/vits_tts/train_vits_tts.py diff --git a/recipes/multilingual/vits_tts/train_vits_tts.py b/recipes/multilingual/vits_tts/train_vits_tts.py new file mode 100644 index 00000000..6beaef38 --- /dev/null +++ b/recipes/multilingual/vits_tts/train_vits_tts.py @@ -0,0 +1,117 @@ +import os +from glob import glob + +from TTS.config.shared_configs import BaseAudioConfig +from TTS.trainer import Trainer, TrainingArgs +from TTS.tts.configs.shared_configs import BaseDatasetConfig +from TTS.tts.configs.vits_config import VitsConfig +from TTS.tts.datasets import load_tts_samples +from TTS.tts.models.vits import Vits, VitsArgs +from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.languages import LanguageManager +from TTS.utils.audio import AudioProcessor + +output_path = os.path.dirname(os.path.abspath(__file__)) + +mailabs_path = '/home/julian/workspace/mailabs/**' +dataset_paths = glob(mailabs_path) +dataset_config = [BaseDatasetConfig(name="mailabs", meta_file_train=None, path=path, language=path.split('/')[-1]) for path in dataset_paths] + +audio_config = BaseAudioConfig( + sample_rate=16000, + win_length=1024, + hop_length=256, + num_mels=80, + preemphasis=0.0, + ref_level_db=20, + log_func="np.log", + do_trim_silence=False, + trim_db=23.0, + mel_fmin=0, + mel_fmax=None, + spec_gain=1.0, + signal_norm=True, + do_amp_to_db_linear=False, + resample=False, +) + +vitsArgs = VitsArgs( + use_language_embedding=True, + embedded_language_dim=4, + use_speaker_embedding=True, + use_sdp=False, +) + +config = VitsConfig( + model_args=vitsArgs, + audio=audio_config, + run_name="vits_vctk", + use_speaker_embedding=True, + batch_size=32, + eval_batch_size=16, + batch_group_size=0, + num_loader_workers=4, + num_eval_loader_workers=4, + run_eval=True, + test_delay_epochs=-1, + epochs=1000, + text_cleaner="multilingual_cleaners", + use_phonemes=False, + phoneme_language="en-us", + phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), + compute_input_seq_cache=True, + print_step=25, + use_language_weighted_sampler= True, + print_eval=False, + mixed_precision=False, + sort_by_audio_len=True, + min_seq_len=32 * 256 * 4, + max_seq_len=160000, + output_path=output_path, + datasets=dataset_config, + characters= { + "pad": "_", + "eos": "&", + "bos": "*", + "characters": "!¡'(),-.:;¿?abcdefghijklmnopqrstuvwxyzµßàáâäåæçèéêëìíîïñòóôöùúûüąćęłńœśşźżƒабвгдежзийклмнопрстуфхцчшщъыьэюяёєіїґӧ «°±µ»$%&‘’‚“`”„", + "punctuations": "!¡'(),-.:;¿? 
", + "phonemes": None, + "unique": True + }, + test_sentences=[ + ["It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", 'mary_ann', None, 'en_US'], + ["Il m'a fallu beaucoup de temps pour d\u00e9velopper une voix, et maintenant que je l'ai, je ne vais pas me taire.", "ezwa", None, 'fr_FR'], + ["Ich finde, dieses Startup ist wirklich unglaublich.", "eva_k", None, 'de_DE'], + ["Я думаю, что этот стартап действительно удивительный.", "oblomov", None, 'ru_RU'], + ] +) + +# init audio processor +ap = AudioProcessor(**config.audio.to_dict()) + +# load training samples +train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) + +# init speaker manager for multi-speaker training +# it maps speaker-id to speaker-name in the model and data-loader +speaker_manager = SpeakerManager() +speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) +config.model_args.num_speakers = speaker_manager.num_speakers + +language_manager = LanguageManager(config=config) +config.model_args.num_languages = language_manager.num_languages + +# init model +model = Vits(config, speaker_manager, language_manager) + +# init the trainer and 🚀 +trainer = Trainer( + TrainingArgs(), + config, + output_path, + model=model, + train_samples=train_samples, + eval_samples=eval_samples, + training_assets={"audio_processor": ap}, +) +trainer.fit() From b4bb0ace70b60061de6c52406e2bb1e8f7d207a6 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Thu, 9 Dec 2021 12:42:38 +0100 Subject: [PATCH 088/220] Fix zoo tests --- TTS/utils/synthesizer.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index ea8ce6d1..e6df6561 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -118,11 +118,14 @@ class Synthesizer(object): speaker_manager = self._init_speaker_manager() language_manager = self._init_language_manager() - self.tts_model = setup_tts_model( - config=self.tts_config, - speaker_manager=speaker_manager, - language_manager=language_manager, - ) + if language_manager is not None: + self.tts_model = setup_tts_model( + config=self.tts_config, + speaker_manager=speaker_manager, + language_manager=language_manager, + ) + else: + self.tts_model = setup_tts_model(config=self.tts_config, speaker_manager=speaker_manager) self.tts_model.load_checkpoint(self.tts_config, tts_checkpoint, eval=True) if use_cuda: self.tts_model.cuda() @@ -251,7 +254,7 @@ class Synthesizer(object): # handle multi-lingaul language_id = None - if self.tts_languages_file or hasattr(self.tts_model.language_manager, "language_id_mapping"): + if self.tts_languages_file or (hasattr(self.tts_model, "language_manager") and self.tts_model.language_manager is not None): if language_idx and isinstance(language_idx, str): language_id = self.tts_model.language_manager.language_id_mapping[language_idx] From 66b6e9bc99b7f43d54b3da06c2419a4d70aefe93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 10 Dec 2021 07:53:10 +0000 Subject: [PATCH 089/220] Make style --- TTS/bin/extract_tts_spectrograms.py | 1 - TTS/bin/find_unique_phonemes.py | 1 + TTS/bin/remove_silence_using_vad.py | 14 ++++---- TTS/bin/train_tts.py | 2 +- TTS/server/server.py | 2 +- TTS/speaker_encoder/models/resnet.py | 11 +++--- TTS/tts/datasets/dataset.py | 2 +- TTS/tts/layers/losses.py | 2 +- TTS/tts/models/__init__.py | 8 ++--- TTS/tts/models/base_tts.py | 21 +++++++---- TTS/tts/models/vits.py | 7 ++-- 
TTS/utils/audio.py | 10 ++++-- TTS/utils/synthesizer.py | 13 +++---- TTS/utils/vad.py | 10 +++--- .../multilingual/vits_tts/train_vits_tts.py | 35 +++++++++++++------ requirements.txt | 1 + tests/aux_tests/test_find_unique_phonemes.py | 10 ++---- tests/data_tests/test_samplers.py | 21 ++++++----- 18 files changed, 96 insertions(+), 75 deletions(-) diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index 014ba4e8..7b489fd6 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -242,7 +242,6 @@ def main(args): # pylint: disable=redefined-outer-name else: speaker_manager = None - # setup model model = setup_model(c) diff --git a/TTS/bin/find_unique_phonemes.py b/TTS/bin/find_unique_phonemes.py index 832ef082..d3143ca3 100644 --- a/TTS/bin/find_unique_phonemes.py +++ b/TTS/bin/find_unique_phonemes.py @@ -19,6 +19,7 @@ def compute_phonemes(item): return [] return list(set(ph)) + def main(): # pylint: disable=W0601 global c diff --git a/TTS/bin/remove_silence_using_vad.py b/TTS/bin/remove_silence_using_vad.py index a32f0f45..9070f2da 100755 --- a/TTS/bin/remove_silence_using_vad.py +++ b/TTS/bin/remove_silence_using_vad.py @@ -1,12 +1,13 @@ -import os -import glob -import pathlib import argparse +import glob import multiprocessing +import os +import pathlib from tqdm.contrib.concurrent import process_map -from TTS.utils.vad import read_wave, write_wave, get_vad_speech_segments +from TTS.utils.vad import get_vad_speech_segments, read_wave, write_wave + def remove_silence(filepath): output_path = filepath.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, "")) @@ -69,10 +70,7 @@ if __name__ == "__main__": parser.add_argument( "-o", "--output_dir", type=str, default="../VCTK-Corpus-removed-silence", help="Output Dataset dir" ) - parser.add_argument("-f", "--force", - default=False, - action='store_true', - help='Force the replace of exists files') + parser.add_argument("-f", "--force", default=False, action="store_true", help="Force the replace of exists files") parser.add_argument( "-g", "--glob", diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index 5330649a..191cba00 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -4,8 +4,8 @@ from TTS.config import load_config, register_config from TTS.trainer import Trainer, TrainingArgs from TTS.tts.datasets import load_tts_samples from TTS.tts.models import setup_model -from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.languages import LanguageManager +from TTS.tts.utils.speakers import SpeakerManager from TTS.utils.audio import AudioProcessor diff --git a/TTS/server/server.py b/TTS/server/server.py index c6d67141..f7bc79c4 100644 --- a/TTS/server/server.py +++ b/TTS/server/server.py @@ -100,7 +100,7 @@ if args.vocoder_path is not None: # load models synthesizer = Synthesizer( - model_path, config_path, speakers_file_path, vocoder_path, vocoder_config_path, use_cuda=args.use_cuda + model_path, config_path, speakers_file_path, None, vocoder_path, vocoder_config_path, use_cuda=args.use_cuda ) use_multi_speaker = hasattr(synthesizer.tts_model, "num_speakers") and synthesizer.tts_model.num_speakers > 1 diff --git a/TTS/speaker_encoder/models/resnet.py b/TTS/speaker_encoder/models/resnet.py index 8f0a8809..7bd507fb 100644 --- a/TTS/speaker_encoder/models/resnet.py +++ b/TTS/speaker_encoder/models/resnet.py @@ -2,11 +2,12 @@ import numpy as np import torch from torch import nn +from TTS.utils.audio import TorchSTFT +from 
TTS.utils.io import load_fsspec + # import torchaudio -from TTS.utils.audio import TorchSTFT -from TTS.utils.io import load_fsspec class PreEmphasis(torch.nn.Module): @@ -126,16 +127,16 @@ class ResNetSpeakerEncoder(nn.Module): n_mels=audio_config["num_mels"], power=2.0, use_mel=True, - mel_norm=None + mel_norm=None, ), - '''torchaudio.transforms.MelSpectrogram( + """torchaudio.transforms.MelSpectrogram( sample_rate=audio_config["sample_rate"], n_fft=audio_config["fft_size"], win_length=audio_config["win_length"], hop_length=audio_config["hop_length"], window_fn=torch.hamming_window, n_mels=audio_config["num_mels"], - ),''' + ),""", ) else: self.torch_spec = None diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index 000393ea..843cea58 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -531,7 +531,7 @@ class TTSDataset(Dataset): "waveform": wav_padded, "raw_text": batch["raw_text"], "pitch": pitch, - "language_ids": language_ids + "language_ids": language_ids, } raise TypeError( diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index 9c219998..7de45041 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -588,7 +588,7 @@ class VitsGeneratorLoss(nn.Module): @staticmethod def cosine_similarity_loss(gt_spk_emb, syn_spk_emb): - l = - torch.nn.functional.cosine_similarity(gt_spk_emb, syn_spk_emb).mean() + l = -torch.nn.functional.cosine_similarity(gt_spk_emb, syn_spk_emb).mean() return l def forward( diff --git a/TTS/tts/models/__init__.py b/TTS/tts/models/__init__.py index acd89110..4cc8b658 100644 --- a/TTS/tts/models/__init__.py +++ b/TTS/tts/models/__init__.py @@ -2,11 +2,7 @@ from TTS.tts.utils.text.symbols import make_symbols, parse_symbols from TTS.utils.generic_utils import find_module -def setup_model( - config, - speaker_manager: "SpeakerManager" = None, - language_manager: "LanguageManager" = None - ): +def setup_model(config, speaker_manager: "SpeakerManager" = None, language_manager: "LanguageManager" = None): print(" > Using model: {}".format(config.model)) # fetch the right model implementation. 
if "base_model" in config and config["base_model"] is not None: @@ -35,7 +31,7 @@ def setup_model( config.model_params.num_chars = num_chars if "model_args" in config: config.model_args.num_chars = num_chars - if config.model.lower() in ["vits"]: # If model supports multiple languages + if config.model.lower() in ["vits"]: # If model supports multiple languages model = MyModel(config, speaker_manager=speaker_manager, language_manager=language_manager) else: model = MyModel(config, speaker_manager=speaker_manager) diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index 1f92bfc7..e52cd765 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -12,8 +12,8 @@ from torch.utils.data.distributed import DistributedSampler from TTS.model import BaseModel from TTS.tts.configs.shared_configs import CharactersConfig from TTS.tts.datasets.dataset import TTSDataset -from TTS.tts.utils.speakers import SpeakerManager, get_speaker_manager, get_speaker_weighted_sampler from TTS.tts.utils.languages import LanguageManager, get_language_weighted_sampler +from TTS.tts.utils.speakers import SpeakerManager, get_speaker_manager, get_speaker_weighted_sampler from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.text import make_symbols from TTS.tts.utils.visual import plot_alignment, plot_spectrogram @@ -150,7 +150,13 @@ class BaseTTS(BaseModel): if hasattr(self, "language_manager") and config.use_language_embedding and language_name is not None: language_id = self.language_manager.language_id_mapping[language_name] - return {"text": text, "speaker_id": speaker_id, "style_wav": style_wav, "d_vector": d_vector, "language_id": language_id} + return { + "text": text, + "speaker_id": speaker_id, + "style_wav": style_wav, + "d_vector": d_vector, + "language_id": language_id, + } def format_batch(self, batch: Dict) -> Dict: """Generic batch formatting for `TTSDataset`. 
@@ -337,14 +343,16 @@ class BaseTTS(BaseModel): if config.compute_f0: dataset.pitch_extractor.load_pitch_stats(config.get("f0_cache_path", None)) - - # sampler for DDP sampler = DistributedSampler(dataset) if num_gpus > 1 else None # Weighted samplers - assert not (num_gpus > 1 and getattr(config, "use_language_weighted_sampler", False)), "language_weighted_sampler is not supported with DistributedSampler" - assert not (num_gpus > 1 and getattr(config, "use_speaker_weighted_sampler", False)), "speaker_weighted_sampler is not supported with DistributedSampler" + assert not ( + num_gpus > 1 and getattr(config, "use_language_weighted_sampler", False) + ), "language_weighted_sampler is not supported with DistributedSampler" + assert not ( + num_gpus > 1 and getattr(config, "use_speaker_weighted_sampler", False) + ), "speaker_weighted_sampler is not supported with DistributedSampler" if sampler is None: if getattr(config, "use_language_weighted_sampler", False): @@ -354,7 +362,6 @@ class BaseTTS(BaseModel): print(" > Using Language weighted sampler") sampler = get_speaker_weighted_sampler(dataset.items) - loader = DataLoader( dataset, batch_size=config.eval_batch_size if is_eval else config.batch_size, diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index ca110eb0..5b4725b3 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -4,6 +4,7 @@ from itertools import chain from typing import Dict, List, Tuple import torch + # import torchaudio from coqpit import Coqpit from torch import nn @@ -420,8 +421,9 @@ class Vits(BaseTTS): ): # TODO: change this with torchaudio Resample raise RuntimeError( - ' [!] To use the speaker consistency loss (SCL) you need to have matching sample rates between the TTS model ({}) and the speaker encoder ({})!' - .format(self.config.audio["sample_rate"], self.speaker_encoder.audio_config["sample_rate"]) + " [!] 
To use the speaker consistency loss (SCL) you need to have matching sample rates between the TTS model ({}) and the speaker encoder ({})!".format( + self.config.audio["sample_rate"], self.speaker_encoder.audio_config["sample_rate"] + ) ) # pylint: disable=W0101,W0105 """ self.audio_transform = torchaudio.transforms.Resample( @@ -675,7 +677,6 @@ class Vits(BaseTTS): ) return outputs - def inference(self, x, aux_input={"d_vectors": None, "speaker_ids": None, "language_ids": None}): """ Shapes: diff --git a/TTS/utils/audio.py b/TTS/utils/audio.py index 10c9ec7e..d01196c4 100644 --- a/TTS/utils/audio.py +++ b/TTS/utils/audio.py @@ -88,7 +88,7 @@ class TorchSTFT(nn.Module): # pylint: disable=abstract-method spec_gain=1.0, power=None, use_htk=False, - mel_norm="slaney" + mel_norm="slaney", ): super().__init__() self.n_fft = n_fft @@ -155,7 +155,13 @@ class TorchSTFT(nn.Module): # pylint: disable=abstract-method def _build_mel_basis(self): mel_basis = librosa.filters.mel( - self.sample_rate, self.n_fft, n_mels=self.n_mels, fmin=self.mel_fmin, fmax=self.mel_fmax, htk=self.use_htk, norm=self.mel_norm + self.sample_rate, + self.n_fft, + n_mels=self.n_mels, + fmin=self.mel_fmin, + fmax=self.mel_fmax, + htk=self.use_htk, + norm=self.mel_norm, ) self.mel_basis = torch.from_numpy(mel_basis).float() diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index e6df6561..d64c0936 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -7,8 +7,8 @@ import torch from TTS.config import load_config from TTS.tts.models import setup_model as setup_tts_model -from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.languages import LanguageManager +from TTS.tts.utils.speakers import SpeakerManager # pylint: disable=unused-wildcard-import # pylint: disable=wildcard-import @@ -200,12 +200,7 @@ class Synthesizer(object): self.ap.save_wav(wav, path, self.output_sample_rate) def tts( - self, - text: str, - speaker_idx: str = "", - language_idx: str = "", - speaker_wav=None, - style_wav=None + self, text: str, speaker_idx: str = "", language_idx: str = "", speaker_wav=None, style_wav=None ) -> List[int]: """🐸 TTS magic. Run all the models and generate speech. 
@@ -254,7 +249,9 @@ class Synthesizer(object): # handle multi-lingaul language_id = None - if self.tts_languages_file or (hasattr(self.tts_model, "language_manager") and self.tts_model.language_manager is not None): + if self.tts_languages_file or ( + hasattr(self.tts_model, "language_manager") and self.tts_model.language_manager is not None + ): if language_idx and isinstance(language_idx, str): language_id = self.tts_model.language_manager.language_id_mapping[language_idx] diff --git a/TTS/utils/vad.py b/TTS/utils/vad.py index 33548087..923544d0 100644 --- a/TTS/utils/vad.py +++ b/TTS/utils/vad.py @@ -1,8 +1,9 @@ # This code is adpated from: https://github.com/wiseman/py-webrtcvad/blob/master/example.py -import wave -import webrtcvad -import contextlib import collections +import contextlib +import wave + +import webrtcvad def read_wave(path): @@ -37,7 +38,7 @@ class Frame(object): """Represents a "frame" of audio data.""" def __init__(self, _bytes, timestamp, duration): - self.bytes =_bytes + self.bytes = _bytes self.timestamp = timestamp self.duration = duration @@ -133,6 +134,7 @@ def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, fram if voiced_frames: yield b"".join([f.bytes for f in voiced_frames]) + def get_vad_speech_segments(audio, sample_rate, aggressiveness=2, padding_duration_ms=300): vad = webrtcvad.Vad(int(aggressiveness)) diff --git a/recipes/multilingual/vits_tts/train_vits_tts.py b/recipes/multilingual/vits_tts/train_vits_tts.py index 6beaef38..be4747df 100644 --- a/recipes/multilingual/vits_tts/train_vits_tts.py +++ b/recipes/multilingual/vits_tts/train_vits_tts.py @@ -7,15 +7,18 @@ from TTS.tts.configs.shared_configs import BaseDatasetConfig from TTS.tts.configs.vits_config import VitsConfig from TTS.tts.datasets import load_tts_samples from TTS.tts.models.vits import Vits, VitsArgs -from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.languages import LanguageManager +from TTS.tts.utils.speakers import SpeakerManager from TTS.utils.audio import AudioProcessor output_path = os.path.dirname(os.path.abspath(__file__)) -mailabs_path = '/home/julian/workspace/mailabs/**' +mailabs_path = "/home/julian/workspace/mailabs/**" dataset_paths = glob(mailabs_path) -dataset_config = [BaseDatasetConfig(name="mailabs", meta_file_train=None, path=path, language=path.split('/')[-1]) for path in dataset_paths] +dataset_config = [ + BaseDatasetConfig(name="mailabs", meta_file_train=None, path=path, language=path.split("/")[-1]) + for path in dataset_paths +] audio_config = BaseAudioConfig( sample_rate=16000, @@ -61,7 +64,7 @@ config = VitsConfig( phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), compute_input_seq_cache=True, print_step=25, - use_language_weighted_sampler= True, + use_language_weighted_sampler=True, print_eval=False, mixed_precision=False, sort_by_audio_len=True, @@ -69,21 +72,31 @@ config = VitsConfig( max_seq_len=160000, output_path=output_path, datasets=dataset_config, - characters= { + characters={ "pad": "_", "eos": "&", "bos": "*", "characters": "!¡'(),-.:;¿?abcdefghijklmnopqrstuvwxyzµßàáâäåæçèéêëìíîïñòóôöùúûüąćęłńœśşźżƒабвгдежзийклмнопрстуфхцчшщъыьэюяёєіїґӧ «°±µ»$%&‘’‚“`”„", "punctuations": "!¡'(),-.:;¿? 
", "phonemes": None, - "unique": True + "unique": True, }, test_sentences=[ - ["It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", 'mary_ann', None, 'en_US'], - ["Il m'a fallu beaucoup de temps pour d\u00e9velopper une voix, et maintenant que je l'ai, je ne vais pas me taire.", "ezwa", None, 'fr_FR'], - ["Ich finde, dieses Startup ist wirklich unglaublich.", "eva_k", None, 'de_DE'], - ["Я думаю, что этот стартап действительно удивительный.", "oblomov", None, 'ru_RU'], - ] + [ + "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", + "mary_ann", + None, + "en_US", + ], + [ + "Il m'a fallu beaucoup de temps pour d\u00e9velopper une voix, et maintenant que je l'ai, je ne vais pas me taire.", + "ezwa", + None, + "fr_FR", + ], + ["Ich finde, dieses Startup ist wirklich unglaublich.", "eva_k", None, "de_DE"], + ["Я думаю, что этот стартап действительно удивительный.", "oblomov", None, "ru_RU"], + ], ) # init audio processor diff --git a/requirements.txt b/requirements.txt index 3ec33ceb..453c3ec4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -26,3 +26,4 @@ unidic-lite==1.0.8 gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=2.0.0 fsspec>=2021.04.0 pyworld +webrtcvad diff --git a/tests/aux_tests/test_find_unique_phonemes.py b/tests/aux_tests/test_find_unique_phonemes.py index 33fad9ba..fa0abe4b 100644 --- a/tests/aux_tests/test_find_unique_phonemes.py +++ b/tests/aux_tests/test_find_unique_phonemes.py @@ -31,7 +31,7 @@ dataset_config_pt = BaseDatasetConfig( class TestFindUniquePhonemes(unittest.TestCase): @staticmethod def test_espeak_phonemes(): - # prepare the config + # prepare the config config = VitsConfig( batch_size=2, eval_batch_size=2, @@ -52,9 +52,7 @@ class TestFindUniquePhonemes(unittest.TestCase): config.save_json(config_path) # run test - run_cli( - f'CUDA_VISIBLE_DEVICES="" python TTS/bin/find_unique_phonemes.py --config_path "{config_path}"' - ) + run_cli(f'CUDA_VISIBLE_DEVICES="" python TTS/bin/find_unique_phonemes.py --config_path "{config_path}"') @staticmethod def test_no_espeak_phonemes(): @@ -79,6 +77,4 @@ class TestFindUniquePhonemes(unittest.TestCase): config.save_json(config_path) # run test - run_cli( - f'CUDA_VISIBLE_DEVICES="" python TTS/bin/find_unique_phonemes.py --config_path "{config_path}"' - ) + run_cli(f'CUDA_VISIBLE_DEVICES="" python TTS/bin/find_unique_phonemes.py --config_path "{config_path}"') diff --git a/tests/data_tests/test_samplers.py b/tests/data_tests/test_samplers.py index 5e4e4151..3d8d6c75 100644 --- a/tests/data_tests/test_samplers.py +++ b/tests/data_tests/test_samplers.py @@ -1,9 +1,11 @@ -from TTS.tts.datasets import load_tts_samples -from TTS.config.shared_configs import BaseDatasetConfig -from TTS.tts.utils.languages import get_language_weighted_sampler -import torch import functools +import torch + +from TTS.config.shared_configs import BaseDatasetConfig +from TTS.tts.datasets import load_tts_samples +from TTS.tts.utils.languages import get_language_weighted_sampler + # Fixing random state to avoid random fails torch.manual_seed(0) @@ -25,18 +27,19 @@ dataset_config_pt = BaseDatasetConfig( # Adding the EN samples twice to create an unbalanced dataset train_samples, eval_samples = load_tts_samples( - [dataset_config_en, dataset_config_en, dataset_config_pt], - eval_split=True + [dataset_config_en, dataset_config_en, dataset_config_pt], eval_split=True ) + def is_balanced(lang_1, lang_2): - return 0.85 < lang_1/lang_2 < 1.2 + return 0.85 < lang_1 / 
lang_2 < 1.2 + random_sampler = torch.utils.data.RandomSampler(train_samples) ids = functools.reduce(lambda a, b: a + b, [list(random_sampler) for i in range(100)]) en, pt = 0, 0 for index in ids: - if train_samples[index][3] == 'en': + if train_samples[index][3] == "en": en += 1 else: pt += 1 @@ -47,7 +50,7 @@ weighted_sampler = get_language_weighted_sampler(train_samples) ids = functools.reduce(lambda a, b: a + b, [list(weighted_sampler) for i in range(100)]) en, pt = 0, 0 for index in ids: - if train_samples[index][3] == 'en': + if train_samples[index][3] == "en": en += 1 else: pt += 1 From bbea9b3f9fde59df695953da9cc4fb07ee5c836c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 10 Dec 2021 07:53:19 +0000 Subject: [PATCH 090/220] Remove redundant code --- TTS/tts/models/vits.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 5b4725b3..7f83f452 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -830,15 +830,6 @@ class Vits(BaseTTS): gt_spk_emb=outputs["gt_spk_emb"], syn_spk_emb=outputs["syn_spk_emb"], ) - # ignore duration loss if fine tuning mode is on - if not self.args.fine_tuning_mode: - # handle the duration loss - if self.args.use_sdp: - loss_dict["nll_duration"] = outputs["nll_duration"] - loss_dict["loss"] += outputs["nll_duration"] - else: - loss_dict["loss_duration"] = outputs["loss_duration"] - loss_dict["loss"] += outputs["loss_duration"] elif optimizer_idx == 1: # discriminator pass From 84bbe0283925651798a3549a291a4ec77d6b7247 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 10 Dec 2021 09:12:03 +0000 Subject: [PATCH 091/220] =?UTF-8?q?Add=20=F0=9F=91=91YourTTS=20docs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/source/models/vits.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/source/models/vits.md b/docs/source/models/vits.md index 5c0e92f6..0c303f7a 100644 --- a/docs/source/models/vits.md +++ b/docs/source/models/vits.md @@ -3,10 +3,15 @@ VITS (Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech ) is an End-to-End (encoder -> vocoder together) TTS model that takes advantage of SOTA DL techniques like GANs, VAE, Normalizing Flows. It does not require external alignment annotations and learns the text-to-audio alignment -using MAS as explained in the paper. The model architecture is a combination of GlowTTS encoder and HiFiGAN vocoder. +using MAS, as explained in the paper. The model architecture is a combination of GlowTTS encoder and HiFiGAN vocoder. It is a feed-forward model with x67.12 real-time factor on a GPU. +🐸 YourTTS is a multi-speaker and multi-lingual TTS model that can perform voice conversion and zero-shot speaker adaptation. +It can also learn a new language or voice with a ~ 1 minute long audio clip. This is a big open gate for training +TTS models in low-resources languages. 🐸 YourTTS uses VITS as the backbone architecture coupled with a speaker encoder model. 
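A rough sketch of the zero-shot use case described above, assuming a trained multi-speaker, multi-lingual checkpoint loaded through `TTS.utils.synthesizer.Synthesizer` (the constructor keywords and the `speaker_wav`/`language_name` argument names follow the form they take later in this patch series; all paths and IDs are placeholders):

```python
from TTS.utils.synthesizer import Synthesizer

# Placeholder paths to a trained YourTTS/VITS checkpoint, its config and the id-mapping files.
synthesizer = Synthesizer(
    tts_checkpoint="best_model.pth.tar",
    tts_config_path="config.json",
    tts_speakers_file="speakers.json",
    tts_languages_file="language_ids.json",
    use_cuda=False,
)

# Clone the voice of a short reference clip (zero-shot speaker adaptation).
wav = synthesizer.tts(
    "This is a test sentence.",
    language_name="en",
    speaker_wav="reference_speaker.wav",
)
synthesizer.save_wav(wav, "output.wav")
```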
+ ## Important resources & papers +- 🐸 YourTTS: https://arxiv.org/abs/2112.02418 - VITS: https://arxiv.org/pdf/2106.06103.pdf - Neural Spline Flows: https://arxiv.org/abs/1906.04032 - Variational Autoencoder: https://arxiv.org/pdf/1312.6114.pdf From 4b06e3e23252e224d03536bc596c81af563f92da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 13 Dec 2021 16:23:57 +0000 Subject: [PATCH 092/220] Use torchaudio for ResNet speaker encoder --- TTS/speaker_encoder/models/resnet.py | 40 +++++++++++++--------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/TTS/speaker_encoder/models/resnet.py b/TTS/speaker_encoder/models/resnet.py index 7bd507fb..3b96f270 100644 --- a/TTS/speaker_encoder/models/resnet.py +++ b/TTS/speaker_encoder/models/resnet.py @@ -5,12 +5,10 @@ from torch import nn from TTS.utils.audio import TorchSTFT from TTS.utils.io import load_fsspec -# import torchaudio +import torchaudio - - -class PreEmphasis(torch.nn.Module): +class PreEmphasis(nn.Module): def __init__(self, coefficient=0.97): super().__init__() self.coefficient = coefficient @@ -114,29 +112,29 @@ class ResNetSpeakerEncoder(nn.Module): if self.use_torch_spec: self.torch_spec = torch.nn.Sequential( PreEmphasis(audio_config["preemphasis"]), - TorchSTFT( - n_fft=audio_config["fft_size"], - hop_length=audio_config["hop_length"], - win_length=audio_config["win_length"], - sample_rate=audio_config["sample_rate"], - window="hamming_window", - mel_fmin=0.0, - mel_fmax=None, - use_htk=True, - do_amp_to_db=False, - n_mels=audio_config["num_mels"], - power=2.0, - use_mel=True, - mel_norm=None, - ), - """torchaudio.transforms.MelSpectrogram( + # TorchSTFT( + # n_fft=audio_config["fft_size"], + # hop_length=audio_config["hop_length"], + # win_length=audio_config["win_length"], + # sample_rate=audio_config["sample_rate"], + # window="hamming_window", + # mel_fmin=0.0, + # mel_fmax=None, + # use_htk=True, + # do_amp_to_db=False, + # n_mels=audio_config["num_mels"], + # power=2.0, + # use_mel=True, + # mel_norm=None, + # ) + torchaudio.transforms.MelSpectrogram( sample_rate=audio_config["sample_rate"], n_fft=audio_config["fft_size"], win_length=audio_config["win_length"], hop_length=audio_config["hop_length"], window_fn=torch.hamming_window, n_mels=audio_config["num_mels"], - ),""", + ) ) else: self.torch_spec = None From f73573c21505a5ff3729bba781f5c76fa555cbac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 13 Dec 2021 16:27:36 +0000 Subject: [PATCH 093/220] Fix synthesizer reading `use_language_embedding` --- TTS/utils/synthesizer.py | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index d64c0936..bd90dd8c 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -151,7 +151,10 @@ class Synthesizer(object): """Initialize the LanguageManager""" # setup if multi-lingual settings are in the global model config language_manager = None - if hasattr(self.tts_config, "use_language_embedding") and self.tts_config.use_language_embedding is True: + if ( + hasattr(self.tts_config.model_args, "use_language_embedding") + and self.tts_config.model_args.use_language_embedding is True + ): if self.tts_languages_file: language_manager = LanguageManager(language_ids_file_path=self.tts_languages_file) elif self.tts_config.get("language_ids_file", None): @@ -200,14 +203,14 @@ class Synthesizer(object): self.ap.save_wav(wav, path, self.output_sample_rate) def tts( - 
self, text: str, speaker_idx: str = "", language_idx: str = "", speaker_wav=None, style_wav=None + self, text: str, speaker_name: str = "", language_name: str = "", speaker_wav=None, style_wav=None ) -> List[int]: """🐸 TTS magic. Run all the models and generate speech. Args: text (str): input text. - speaker_idx (str, optional): spekaer id for multi-speaker models. Defaults to "". - language_idx (str, optional): language id for multi-language models. Defaults to "". + speaker_name (str, optional): spekaer id for multi-speaker models. Defaults to "". + language_name (str, optional): language id for multi-language models. Defaults to "". speaker_wav (): style_wav ([type], optional): style waveform for GST. Defaults to None. @@ -224,26 +227,26 @@ class Synthesizer(object): speaker_embedding = None speaker_id = None if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "speaker_ids"): - if speaker_idx and isinstance(speaker_idx, str): + if speaker_name and isinstance(speaker_name, str): if self.tts_config.use_d_vector_file: # get the speaker embedding from the saved d_vectors. - speaker_embedding = self.tts_model.speaker_manager.get_d_vectors_by_speaker(speaker_idx)[0] + speaker_embedding = self.tts_model.speaker_manager.get_d_vectors_by_speaker(speaker_name)[0] speaker_embedding = np.array(speaker_embedding)[None, :] # [1 x embedding_dim] else: # get speaker idx from the speaker name - speaker_id = self.tts_model.speaker_manager.speaker_ids[speaker_idx] + speaker_id = self.tts_model.speaker_manager.speaker_ids[speaker_name] - elif not speaker_idx and not speaker_wav: + elif not speaker_name and not speaker_wav: raise ValueError( " [!] Look like you use a multi-speaker model. " - "You need to define either a `speaker_idx` or a `style_wav` to use a multi-speaker model." + "You need to define either a `speaker_name` or a `style_wav` to use a multi-speaker model." ) else: speaker_embedding = None else: - if speaker_idx: + if speaker_name: raise ValueError( - f" [!] Missing speakers.json file path for selecting speaker {speaker_idx}." + f" [!] Missing speakers.json file path for selecting speaker {speaker_name}." "Define path for speaker.json if it is a multi-speaker model or remove defined speaker idx. " ) @@ -252,18 +255,18 @@ class Synthesizer(object): if self.tts_languages_file or ( hasattr(self.tts_model, "language_manager") and self.tts_model.language_manager is not None ): - if language_idx and isinstance(language_idx, str): - language_id = self.tts_model.language_manager.language_id_mapping[language_idx] + if language_name and isinstance(language_name, str): + language_id = self.tts_model.language_manager.language_id_mapping[language_name] - elif not language_idx: + elif not language_name: raise ValueError( " [!] Look like you use a multi-lingual model. " - "You need to define either a `language_idx` or a `style_wav` to use a multi-lingual model." + "You need to define either a `language_name` or a `style_wav` to use a multi-lingual model." ) else: raise ValueError( - f" [!] Missing language_ids.json file path for selecting language {language_idx}." + f" [!] Missing language_ids.json file path for selecting language {language_name}." "Define path for language_ids.json if it is a multi-lingual model or remove defined language idx. 
" ) @@ -283,7 +286,7 @@ class Synthesizer(object): ap=self.ap, speaker_id=speaker_id, language_id=language_id, - language_name=language_idx, + language_name=language_name, style_wav=style_wav, enable_eos_bos_chars=self.tts_config.enable_eos_bos_chars, use_griffin_lim=use_gl, From 6d7199d5594b660919d6eff1b1def1d56e2d1122 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 13 Dec 2021 16:28:54 +0000 Subject: [PATCH 094/220] Rename setup_model to setup_speaker_encoder_model --- TTS/tts/utils/speakers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index c1eede3d..8f15aada 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -10,7 +10,7 @@ from coqpit import Coqpit from torch.utils.data.sampler import WeightedRandomSampler from TTS.config import load_config -from TTS.speaker_encoder.utils.generic_utils import setup_model +from TTS.speaker_encoder.utils.generic_utils import setup_speaker_encoder_model from TTS.utils.audio import AudioProcessor @@ -252,7 +252,7 @@ class SpeakerManager: config_path (str): Model config file path. """ self.speaker_encoder_config = load_config(config_path) - self.speaker_encoder = setup_model(self.speaker_encoder_config) + self.speaker_encoder = setup_speaker_encoder_model(self.speaker_encoder_config) self.speaker_encoder.load_checkpoint(config_path, model_path, eval=True, use_cuda=self.use_cuda) self.speaker_encoder_ap = AudioProcessor(**self.speaker_encoder_config.audio) # normalize the input audio level and trim silences From 6274d5e438086f4138e03e9cadc5ff208a6b0000 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 13 Dec 2021 16:29:19 +0000 Subject: [PATCH 095/220] Fixup --- TTS/speaker_encoder/utils/generic_utils.py | 24 +++++++++++----------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/TTS/speaker_encoder/utils/generic_utils.py b/TTS/speaker_encoder/utils/generic_utils.py index c926e215..dab79f3c 100644 --- a/TTS/speaker_encoder/utils/generic_utils.py +++ b/TTS/speaker_encoder/utils/generic_utils.py @@ -170,21 +170,21 @@ def to_camel(text): return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text) -def setup_model(c): - if c.model_params["model_name"].lower() == "lstm": +def setup_speaker_encoder_model(config: "Coqpit"): + if config.model_params["model_name"].lower() == "lstm": model = LSTMSpeakerEncoder( - c.model_params["input_dim"], - c.model_params["proj_dim"], - c.model_params["lstm_dim"], - c.model_params["num_lstm_layers"], + config.model_params["input_dim"], + config.model_params["proj_dim"], + config.model_params["lstm_dim"], + config.model_params["num_lstm_layers"], ) - elif c.model_params["model_name"].lower() == "resnet": + elif config.model_params["model_name"].lower() == "resnet": model = ResNetSpeakerEncoder( - input_dim=c.model_params["input_dim"], - proj_dim=c.model_params["proj_dim"], - log_input=c.model_params.get("log_input", False), - use_torch_spec=c.model_params.get("use_torch_spec", False), - audio_config=c.audio, + input_dim=config.model_params["input_dim"], + proj_dim=config.model_params["proj_dim"], + log_input=config.model_params.get("log_input", False), + use_torch_spec=config.model_params.get("use_torch_spec", False), + audio_config=config.audio, ) return model From 9ec6238f4ad08d502ed37a27089c8e63d2ff5fc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 13 Dec 2021 16:30:15 +0000 Subject: [PATCH 096/220] Fixup --- 
TTS/speaker_encoder/models/resnet.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/TTS/speaker_encoder/models/resnet.py b/TTS/speaker_encoder/models/resnet.py index 3b96f270..92d34494 100644 --- a/TTS/speaker_encoder/models/resnet.py +++ b/TTS/speaker_encoder/models/resnet.py @@ -1,12 +1,11 @@ import numpy as np import torch +import torchaudio from torch import nn from TTS.utils.audio import TorchSTFT from TTS.utils.io import load_fsspec -import torchaudio - class PreEmphasis(nn.Module): def __init__(self, coefficient=0.97): @@ -134,7 +133,7 @@ class ResNetSpeakerEncoder(nn.Module): hop_length=audio_config["hop_length"], window_fn=torch.hamming_window, n_mels=audio_config["num_mels"], - ) + ), ) else: self.torch_spec = None From abedfd586d5f78409b216a6b22ffbdcff1b6d77f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 16 Dec 2021 14:53:57 +0000 Subject: [PATCH 097/220] Add functions to get and check and argument in config and config.model_args --- TTS/bin/train_tts.py | 8 ++++---- TTS/config/__init__.py | 24 ++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index 191cba00..3360a940 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -1,6 +1,6 @@ import os -from TTS.config import load_config, register_config +from TTS.config import check_config_and_model_args, get_from_config_or_model_args, load_config, register_config from TTS.trainer import Trainer, TrainingArgs from TTS.tts.datasets import load_tts_samples from TTS.tts.models import setup_model @@ -46,14 +46,14 @@ def main(): ap = AudioProcessor(**config.audio) # init speaker manager - if config.use_speaker_embedding: + if check_config_and_model_args(config, "use_speaker_embedding", True): speaker_manager = SpeakerManager(data_items=train_samples + eval_samples) if hasattr(config, "model_args"): config.model_args.num_speakers = speaker_manager.num_speakers else: config.num_speakers = speaker_manager.num_speakers - elif config.use_d_vector_file: - speaker_manager = SpeakerManager(d_vectors_file_path=config.d_vector_file) + elif check_config_and_model_args(config, "use_d_vector_file", True): + speaker_manager = SpeakerManager(d_vectors_file_path=get_from_config_or_model_args(config, "d_vector_file")) if hasattr(config, "model_args"): config.model_args.num_speakers = speaker_manager.num_speakers else: diff --git a/TTS/config/__init__.py b/TTS/config/__init__.py index f626163f..65950de6 100644 --- a/TTS/config/__init__.py +++ b/TTS/config/__init__.py @@ -95,3 +95,27 @@ def load_config(config_path: str) -> None: config = config_class() config.from_dict(config_dict) return config + + +def check_config_and_model_args(config, arg_name, value): + """Check the give argument in `config.model_args` if exist or in `config` for + the given value. + + It is to patch up the compatibility between models with and without `model_args`. + + TODO: Remove this in the future with a unified approach. + """ + if hasattr(config, "model_args"): + if arg_name in config.model_args: + return config.model_args[arg_name] == value + if hasattr(config, arg_name): + return config[arg_name] == value + raise ValueError(f" [!] 
{arg_name} is not found in config or config.model_args") + + +def get_from_config_or_model_args(config, arg_name): + """Get the given argument from `config.model_args` if exist or in `config`.""" + if hasattr(config, "model_args"): + if arg_name in config.model_args: + return config.model_args[arg_name] + return config[arg_name] From 08a1cf3dcb07133c1d106a73b5a456595bdffd26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 16 Dec 2021 14:55:43 +0000 Subject: [PATCH 098/220] Change speaker_idx to speaker_name --- TTS/server/server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/server/server.py b/TTS/server/server.py index f7bc79c4..2c6bebfd 100644 --- a/TTS/server/server.py +++ b/TTS/server/server.py @@ -165,7 +165,7 @@ def tts(): style_wav = style_wav_uri_to_dict(style_wav) print(" > Model input: {}".format(text)) - wavs = synthesizer.tts(text, speaker_idx=speaker_idx, style_wav=style_wav) + wavs = synthesizer.tts(text, speaker_name=speaker_idx, style_wav=style_wav) out = io.BytesIO() synthesizer.save_wav(wavs, out) return send_file(out, mimetype="audio/wav") From 1ddf245b08482bf7833941f2b929aff18e84e656 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 16 Dec 2021 14:56:34 +0000 Subject: [PATCH 099/220] Use speaker_encoder from speaker manager in Vits --- TTS/speaker_encoder/models/resnet.py | 3 +-- TTS/tts/models/vits.py | 31 +++++++++++----------------- 2 files changed, 13 insertions(+), 21 deletions(-) diff --git a/TTS/speaker_encoder/models/resnet.py b/TTS/speaker_encoder/models/resnet.py index 92d34494..7a384ef5 100644 --- a/TTS/speaker_encoder/models/resnet.py +++ b/TTS/speaker_encoder/models/resnet.py @@ -3,7 +3,7 @@ import torch import torchaudio from torch import nn -from TTS.utils.audio import TorchSTFT +# from TTS.utils.audio import TorchSTFT from TTS.utils.io import load_fsspec @@ -258,7 +258,6 @@ class ResNetSpeakerEncoder(nn.Module): if return_mean: embeddings = torch.mean(embeddings, dim=0, keepdim=True) - return embeddings def load_checkpoint(self, config: dict, checkpoint_path: str, eval: bool = False, use_cuda: bool = False): diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 7f83f452..ddf6800f 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -406,42 +406,32 @@ class Vits(BaseTTS): raise RuntimeError( " [!] To use the speaker consistency loss (SCL) you need to specify speaker_encoder_model_path and speaker_encoder_config_path !!" ) - self.speaker_manager.init_speaker_encoder( - config.speaker_encoder_model_path, config.speaker_encoder_config_path - ) - self.speaker_encoder = self.speaker_manager.speaker_encoder.train() - for param in self.speaker_encoder.parameters(): - param.requires_grad = False + self.speaker_manager.speaker_encoder.eval() print(" > External Speaker Encoder Loaded !!") if ( - hasattr(self.speaker_encoder, "audio_config") - and self.config.audio["sample_rate"] != self.speaker_encoder.audio_config["sample_rate"] + hasattr(self.speaker_manager.speaker_encoder, "audio_config") + and self.config.audio["sample_rate"] != self.speaker_manager.speaker_encoder.audio_config["sample_rate"] ): # TODO: change this with torchaudio Resample raise RuntimeError( " [!] 
To use the speaker consistency loss (SCL) you need to have matching sample rates between the TTS model ({}) and the speaker encoder ({})!".format( - self.config.audio["sample_rate"], self.speaker_encoder.audio_config["sample_rate"] + self.config.audio["sample_rate"], + self.speaker_manager.speaker_encoder.audio_config["sample_rate"], ) ) # pylint: disable=W0101,W0105 """ self.audio_transform = torchaudio.transforms.Resample( orig_freq=self.audio_config["sample_rate"], - new_freq=self.speaker_encoder.audio_config["sample_rate"], + new_freq=self.speaker_manager.speaker_encoder.audio_config["sample_rate"], ) else: self.audio_transform = None """ - else: - # self.audio_transform = None - self.speaker_encoder = None def _init_speaker_embedding(self, config): # pylint: disable=attribute-defined-outside-init - if config.speakers_file is not None: - self.speaker_manager = SpeakerManager(speaker_id_file_path=config.speakers_file) - if self.num_speakers > 0: print(" > initialization of speaker-embedding layers.") self.embedded_speaker_dim = config.speaker_embedding_channels @@ -451,7 +441,6 @@ class Vits(BaseTTS): # pylint: disable=attribute-defined-outside-init if hasattr(self, "emb_g"): raise ValueError("[!] Speaker embedding layer already initialized before d_vector settings.") - self.speaker_manager = SpeakerManager(d_vectors_file_path=config.d_vector_file) self.embedded_speaker_dim = config.d_vector_dim def init_multilingual(self, config: Coqpit): @@ -644,7 +633,7 @@ class Vits(BaseTTS): self.args.spec_segment_size * self.config.audio.hop_length, ) - if self.args.use_speaker_encoder_as_loss and self.speaker_encoder is not None: + if self.args.use_speaker_encoder_as_loss and self.speaker_manager.speaker_encoder is not None: # concate generated and GT waveforms wavs_batch = torch.cat((wav_seg, o), dim=0).squeeze(1) @@ -653,7 +642,7 @@ class Vits(BaseTTS): """if self.audio_transform is not None: wavs_batch = self.audio_transform(wavs_batch)""" - pred_embs = self.speaker_encoder.forward(wavs_batch, l2_norm=True) + pred_embs = self.speaker_manager.speaker_encoder.forward(wavs_batch, l2_norm=True) # split generated and GT speaker embeddings gt_spk_emb, syn_spk_emb = torch.chunk(pred_embs, 2, dim=0) @@ -1024,6 +1013,10 @@ class Vits(BaseTTS): ): # pylint: disable=unused-argument, redefined-builtin """Load the model checkpoint and setup for training or inference""" state = torch.load(checkpoint_path, map_location=torch.device("cpu")) + # compat band-aid for the pre-trained models to not use the encoder baked into the model + # TODO: consider baking the speaker encoder into the model and call it from there. + # as it is probably easier for model distribution. 
+ state["model"] = {k: v for k, v in state["model"].items() if "speaker_encoder" not in k} self.load_state_dict(state["model"]) if eval: self.eval() From 95ca2ef77356cc17793d02963760d50552aa87b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 16 Dec 2021 14:57:24 +0000 Subject: [PATCH 100/220] Implement init_speaker_encoder and change arg names --- TTS/utils/synthesizer.py | 40 +++++++++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index bd90dd8c..62540ae2 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -1,5 +1,5 @@ import time -from typing import List +from typing import List, Union import numpy as np import pysbd @@ -117,6 +117,7 @@ class Synthesizer(object): speaker_manager = self._init_speaker_manager() language_manager = self._init_language_manager() + speaker_manager = self._init_speaker_encoder(speaker_manager) if language_manager is not None: self.tts_model = setup_tts_model( @@ -130,23 +131,47 @@ class Synthesizer(object): if use_cuda: self.tts_model.cuda() + def _is_use_speaker_embedding(self): + """Check if the speaker embedding is used in the model""" + # some models use model_args some don't + if hasattr(self.tts_config, "model_args"): + config = self.tts_config.model_args + else: + config = self.tts_config + return hasattr(config, "use_speaker_embedding") and config.use_speaker_embedding is True + + def _is_use_d_vector_file(self): + """Check if the d-vector file is used in the model""" + # some models use model_args some don't + if hasattr(self.tts_config, "model_args"): + config = self.tts_config.model_args + else: + config = self.tts_config + return hasattr(config, "use_d_vector_file") and config.use_d_vector_file is True + def _init_speaker_manager(self): """Initialize the SpeakerManager""" # setup if multi-speaker settings are in the global model config speaker_manager = None - if hasattr(self.tts_config, "use_speaker_embedding") and self.tts_config.use_speaker_embedding is True: + if self._is_use_speaker_embedding(): if self.tts_speakers_file: speaker_manager = SpeakerManager(speaker_id_file_path=self.tts_speakers_file) if self.tts_config.get("speakers_file", None): speaker_manager = SpeakerManager(speaker_id_file_path=self.tts_config.speakers_file) - if hasattr(self.tts_config, "use_d_vector_file") and self.tts_config.use_speaker_embedding is True: + if self._is_use_d_vector_file(): if self.tts_speakers_file: speaker_manager = SpeakerManager(d_vectors_file_path=self.tts_speakers_file) if self.tts_config.get("d_vector_file", None): speaker_manager = SpeakerManager(d_vectors_file_path=self.tts_config.d_vector_file) return speaker_manager + def _init_speaker_encoder(self, speaker_manager): + """Initialize the SpeakerEncoder""" + if self.encoder_checkpoint is not None: + speaker_manager.init_speaker_encoder(self.encoder_checkpoint, self.encoder_config) + return speaker_manager + def _init_language_manager(self): """Initialize the LanguageManager""" # setup if multi-lingual settings are in the global model config @@ -203,7 +228,12 @@ class Synthesizer(object): self.ap.save_wav(wav, path, self.output_sample_rate) def tts( - self, text: str, speaker_name: str = "", language_name: str = "", speaker_wav=None, style_wav=None + self, + text: str, + speaker_name: str = "", + language_name: str = "", + speaker_wav: Union[str, List[str]] = None, + style_wav=None, ) -> List[int]: """🐸 TTS magic. Run all the models and generate speech. 
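The `_is_use_speaker_embedding` / `_is_use_d_vector_file` helpers above, like `check_config_and_model_args` from the earlier config patch, all paper over the fact that newer configs nest model settings under `model_args` while older ones keep them at the top level. A framework-free sketch of that lookup pattern (the function name here is illustrative, not part of the codebase):

```python
def resolve_setting(config, name, default=None):
    """Prefer `config.model_args.<name>` when present, otherwise fall back to `config.<name>`."""
    model_args = getattr(config, "model_args", None)
    if model_args is not None and hasattr(model_args, name):
        return getattr(model_args, name)
    return getattr(config, name, default)


# e.g. resolve_setting(tts_config, "use_speaker_embedding", False)
```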
@@ -211,7 +241,7 @@ class Synthesizer(object): text (str): input text. speaker_name (str, optional): spekaer id for multi-speaker models. Defaults to "". language_name (str, optional): language id for multi-language models. Defaults to "". - speaker_wav (): + speaker_wav (Union[str, List[str]], optional): path to the speaker wav. Defaults to None. style_wav ([type], optional): style waveform for GST. Defaults to None. Returns: From 223ffe29549a902d96ff8bcaa490be09f64a5a4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 16 Dec 2021 14:57:54 +0000 Subject: [PATCH 101/220] Add torchaudio to requirements.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 453c3ec4..ddb6def9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,3 +27,4 @@ gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=2.0.0 fsspec>=2021.04.0 pyworld webrtcvad +torchaudio From eb3e8affe1141d16b95d123b7ef119e3f1095d93 Mon Sep 17 00:00:00 2001 From: Edresson Date: Thu, 12 Aug 2021 07:52:44 -0300 Subject: [PATCH 102/220] Save speakers embeddings/ids before starting training --- TTS/tts/models/vits.py | 1 - requirements.txt | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index bc459b7f..d1755b47 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -402,7 +402,6 @@ class Vits(BaseTTS): # speaker embedding if self.num_speakers > 1 and sid is not None: g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] - # posterior encoder z, m_q, logs_q, y_mask = self.posterior_encoder(y, y_lengths, g=g) diff --git a/requirements.txt b/requirements.txt index 3ec33ceb..d21a972f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,7 +13,7 @@ pyyaml scipy>=0.19.0 soundfile tensorboardX -torch>=1.7 +torch>=1.9.0 tqdm numba==0.53 umap-learn==0.5.1 From e0ad8380665e1391eb70a09564329e46a30cb1ef Mon Sep 17 00:00:00 2001 From: Edresson Date: Thu, 12 Aug 2021 21:52:12 -0300 Subject: [PATCH 103/220] Select randomly a speaker from the speaker manager for the test setences --- TTS/tts/models/vits.py | 18 ++++++++---------- TTS/tts/utils/speakers.py | 8 ++++++++ 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index d1755b47..ae607c47 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -402,6 +402,7 @@ class Vits(BaseTTS): # speaker embedding if self.num_speakers > 1 and sid is not None: g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] + # posterior encoder z, m_q, logs_q, y_mask = self.posterior_encoder(y, y_lengths, g=g) @@ -638,7 +639,7 @@ class Vits(BaseTTS): return self._log(ap, batch, outputs, "eval") @torch.no_grad() - def test_run(self, ap) -> Tuple[Dict, Dict]: + def test_run(self, ap, eval_loader=None) -> Tuple[Dict, Dict]: """Generic test run for `tts` models used by `Trainer`. You can override this for a different behaviour. 
@@ -650,16 +651,13 @@ class Vits(BaseTTS): test_audios = {} test_figures = {} test_sentences = self.config.test_sentences - aux_inputs = { - "speaker_id": None - if not self.config.use_speaker_embedding - else random.sample(sorted(self.speaker_manager.speaker_ids.values()), 1), - "d_vector": None - if not self.config.use_d_vector_file - else random.samples(sorted(self.speaker_manager.d_vectors.values()), 1), - "style_wav": None, - } + if hasattr(self, "speaker_manager"): + aux_inputs = self.speaker_manager.get_random_speaker_aux_input() + else: + aux_inputs = self.get_aux_input() + for idx, sen in enumerate(test_sentences): + wav, alignment, _, _ = synthesis( self, sen, diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 13696a20..ae001155 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -209,6 +209,14 @@ class SpeakerManager: d_vectors = np.stack(d_vectors[:num_samples]).mean(0) return d_vectors + def get_random_speaker_aux_input(self) -> Dict: + if self.d_vectors: + return {"speaker_id": None, "style_wav": None, "d_vector": self.d_vectors[random.choices(list(self.d_vectors.keys()))[0]]["embedding"]} + elif self.speaker_ids: + return {"speaker_id": self.speaker_ids[random.choices(list(self.speaker_ids.keys()))[0]], "style_wav": None, "d_vector": None} + else: + return {"speaker_id": None, "style_wav": None, "d_vector": None} + def get_speakers(self) -> List: return self.speaker_ids From 6a7db67a9105d59e1b1d79e33ddbee7f283161e9 Mon Sep 17 00:00:00 2001 From: Edresson Date: Thu, 12 Aug 2021 22:48:30 -0300 Subject: [PATCH 104/220] Allow ignore speakers for all multispeaker datasets --- TTS/tts/datasets/__init__.py | 5 ++- TTS/tts/datasets/formatters.py | 79 +++++++++++++++++++++------------- 2 files changed, 53 insertions(+), 31 deletions(-) diff --git a/TTS/tts/datasets/__init__.py b/TTS/tts/datasets/__init__.py index 4fae974f..741f92fd 100644 --- a/TTS/tts/datasets/__init__.py +++ b/TTS/tts/datasets/__init__.py @@ -67,16 +67,17 @@ def load_tts_samples( root_path = dataset["path"] meta_file_train = dataset["meta_file_train"] meta_file_val = dataset["meta_file_val"] + ununsed_speakers = dataset["ununsed_speakers"] # setup the right data processor if formatter is None: formatter = _get_formatter_by_name(name) # load train set - meta_data_train = formatter(root_path, meta_file_train) + meta_data_train = formatter(root_path, meta_file_train, ununsed_speakers=ununsed_speakers) print(f" | > Found {len(meta_data_train)} files in {Path(root_path).resolve()}") # load evaluation split if set if eval_split: if meta_file_val: - meta_data_eval = formatter(root_path, meta_file_val) + meta_data_eval = formatter(root_path, meta_file_val, ununsed_speakers=ununsed_speakers) else: meta_data_eval, meta_data_train = split_dataset(meta_data_train) meta_data_eval_all += meta_data_eval diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index 425eb0cd..bcbbb369 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -12,7 +12,7 @@ from tqdm import tqdm ######################## -def tweb(root_path, meta_file): +def tweb(root_path, meta_file, **kwargs): """Normalize TWEB dataset. 
https://www.kaggle.com/bryanpark/the-world-english-bible-speech-dataset """ @@ -28,7 +28,7 @@ def tweb(root_path, meta_file): return items -def mozilla(root_path, meta_file): +def mozilla(root_path, meta_file, **kwargs): """Normalizes Mozilla meta data files to TTS format""" txt_file = os.path.join(root_path, meta_file) items = [] @@ -43,7 +43,7 @@ def mozilla(root_path, meta_file): return items -def mozilla_de(root_path, meta_file): +def mozilla_de(root_path, meta_file, **kwargs): """Normalizes Mozilla meta data files to TTS format""" txt_file = os.path.join(root_path, meta_file) items = [] @@ -82,6 +82,10 @@ def mailabs(root_path, meta_files=None): if speaker_name_match is None: continue speaker_name = speaker_name_match.group("speaker_name") + # ignore speakers + if isinstance(ununsed_speakers, list): + if speaker_name in ununsed_speakers: + continue print(" | > {}".format(csv_file)) with open(txt_file, "r", encoding="utf-8") as ttf: for line in ttf: @@ -98,7 +102,7 @@ def mailabs(root_path, meta_files=None): return items -def ljspeech(root_path, meta_file): +def ljspeech(root_path, meta_file, **kwargs): """Normalizes the LJSpeech meta data file to TTS format https://keithito.com/LJ-Speech-Dataset/""" txt_file = os.path.join(root_path, meta_file) @@ -113,7 +117,7 @@ def ljspeech(root_path, meta_file): return items -def ljspeech_test(root_path, meta_file): +def ljspeech_test(root_path, meta_file, **kwargs): """Normalizes the LJSpeech meta data file for TTS testing https://keithito.com/LJ-Speech-Dataset/""" txt_file = os.path.join(root_path, meta_file) @@ -127,7 +131,7 @@ def ljspeech_test(root_path, meta_file): return items -def sam_accenture(root_path, meta_file): +def sam_accenture(root_path, meta_file, **kwargs): """Normalizes the sam-accenture meta data file to TTS format https://github.com/Sam-Accenture-Non-Binary-Voice/non-binary-voice-files""" xml_file = os.path.join(root_path, "voice_over_recordings", meta_file) @@ -144,12 +148,12 @@ def sam_accenture(root_path, meta_file): return items -def ruslan(root_path, meta_file): +def ruslan(root_path, meta_file, **kwargs): """Normalizes the RUSLAN meta data file to TTS format https://ruslan-corpus.github.io/""" txt_file = os.path.join(root_path, meta_file) items = [] - speaker_name = "ljspeech" + speaker_name = "ruslan" with open(txt_file, "r", encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") @@ -159,11 +163,11 @@ def ruslan(root_path, meta_file): return items -def css10(root_path, meta_file): +def css10(root_path, meta_file, **kwargs): """Normalizes the CSS10 dataset file to TTS format""" txt_file = os.path.join(root_path, meta_file) items = [] - speaker_name = "ljspeech" + speaker_name = "css10" with open(txt_file, "r", encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") @@ -173,7 +177,7 @@ def css10(root_path, meta_file): return items -def nancy(root_path, meta_file): +def nancy(root_path, meta_file, **kwargs): """Normalizes the Nancy meta data file to TTS format""" txt_file = os.path.join(root_path, meta_file) items = [] @@ -187,7 +191,7 @@ def nancy(root_path, meta_file): return items -def common_voice(root_path, meta_file): +def common_voice(root_path, meta_file, ununsed_speakers=None): """Normalize the common voice meta data file to TTS format.""" txt_file = os.path.join(root_path, meta_file) items = [] @@ -198,12 +202,16 @@ def common_voice(root_path, meta_file): cols = line.split("\t") text = cols[2] speaker_name = cols[0] + # ignore speakers + if isinstance(ununsed_speakers, list): + if 
speaker_name in ununsed_speakers: + continue wav_file = os.path.join(root_path, "clips", cols[1].replace(".mp3", ".wav")) items.append([text, wav_file, "MCV_" + speaker_name]) return items -def libri_tts(root_path, meta_files=None): +def libri_tts(root_path, meta_files=None, ununsed_speakers=None): """https://ai.google/tools/datasets/libri-tts/""" items = [] if meta_files is None: @@ -222,13 +230,17 @@ def libri_tts(root_path, meta_files=None): _root_path = os.path.join(root_path, f"{speaker_name}/{chapter_id}") wav_file = os.path.join(_root_path, file_name + ".wav") text = cols[2] + # ignore speakers + if isinstance(ununsed_speakers, list): + if speaker_name in ununsed_speakers: + continue items.append([text, wav_file, "LTTS_" + speaker_name]) for item in items: assert os.path.exists(item[1]), f" [!] wav files don't exist - {item[1]}" return items -def custom_turkish(root_path, meta_file): +def custom_turkish(root_path, meta_file, **kwargs): txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "turkish-female" @@ -247,7 +259,7 @@ def custom_turkish(root_path, meta_file): # ToDo: add the dataset link when the dataset is released publicly -def brspeech(root_path, meta_file): +def brspeech(root_path, meta_file, ununsed_speakers=None): """BRSpeech 3.0 beta""" txt_file = os.path.join(root_path, meta_file) items = [] @@ -258,21 +270,25 @@ def brspeech(root_path, meta_file): cols = line.split("|") wav_file = os.path.join(root_path, cols[0]) text = cols[2] - speaker_name = cols[3] - items.append([text, wav_file, speaker_name]) + speaker_id = cols[3] + # ignore speakers + if isinstance(ununsed_speakers, list): + if speaker_id in ununsed_speakers: + continue + items.append([text, wav_file, speaker_id]) return items -def vctk(root_path, meta_files=None, wavs_path="wav48"): +def vctk(root_path, meta_files=None, wavs_path="wav48", ununsed_speakers=None): """homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz""" - test_speakers = meta_files items = [] meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) for meta_file in meta_files: _, speaker_id, txt_file = os.path.relpath(meta_file, root_path).split(os.sep) file_id = txt_file.split(".")[0] - if isinstance(test_speakers, list): # if is list ignore this speakers ids - if speaker_id in test_speakers: + # ignore speakers + if isinstance(ununsed_speakers, list): + if speaker_id in ununsed_speakers: continue with open(meta_file, "r", encoding="utf-8") as file_text: text = file_text.readlines()[0] @@ -282,15 +298,16 @@ def vctk(root_path, meta_files=None, wavs_path="wav48"): return items -def vctk_slim(root_path, meta_files=None, wavs_path="wav48"): +def vctk_slim(root_path, meta_files=None, wavs_path="wav48", ununsed_speakers=None): """homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz""" items = [] txt_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) for text_file in txt_files: _, speaker_id, txt_file = os.path.relpath(text_file, root_path).split(os.sep) file_id = txt_file.split(".")[0] - if isinstance(meta_files, list): # if is list ignore this speakers ids - if speaker_id in meta_files: + # ignore speakers + if isinstance(ununsed_speakers, list): + if speaker_id in ununsed_speakers: continue wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id + ".wav") items.append([None, wav_file, "VCTK_" + speaker_id]) @@ -298,7 +315,7 @@ def vctk_slim(root_path, meta_files=None, wavs_path="wav48"): return items -def mls(root_path, meta_files=None): +def mls(root_path, 
meta_files=None, ununsed_speakers=None): """http://www.openslr.org/94/""" items = [] with open(os.path.join(root_path, meta_files), "r", encoding="utf-8") as meta: @@ -307,19 +324,23 @@ def mls(root_path, meta_files=None): text = text[:-1] speaker, book, *_ = file.split("_") wav_file = os.path.join(root_path, os.path.dirname(meta_files), "audio", speaker, book, file + ".wav") + # ignore speakers + if isinstance(ununsed_speakers, list): + if speaker in ununsed_speakers: + continue items.append([text, wav_file, "MLS_" + speaker]) return items # ======================================== VOX CELEB =========================================== -def voxceleb2(root_path, meta_file=None): +def voxceleb2(root_path, meta_file=None, **kwargs): """ :param meta_file Used only for consistency with load_tts_samples api """ return _voxcel_x(root_path, meta_file, voxcel_idx="2") -def voxceleb1(root_path, meta_file=None): +def voxceleb1(root_path, meta_file=None, **kwargs): """ :param meta_file Used only for consistency with load_tts_samples api """ @@ -361,7 +382,7 @@ def _voxcel_x(root_path, meta_file, voxcel_idx): return [x.strip().split("|") for x in f.readlines()] -def baker(root_path: str, meta_file: str) -> List[List[str]]: +def baker(root_path: str, meta_file: str, **kwargs) -> List[List[str]]: """Normalizes the Baker meta data file to TTS format Args: @@ -381,7 +402,7 @@ def baker(root_path: str, meta_file: str) -> List[List[str]]: return items -def kokoro(root_path, meta_file): +def kokoro(root_path, meta_file, **kwargs): """Japanese single-speaker dataset from https://github.com/kaiidams/Kokoro-Speech-Dataset""" txt_file = os.path.join(root_path, meta_file) items = [] From d91c595c5a10c264b92ac855fd60dd27f4c21cdf Mon Sep 17 00:00:00 2001 From: Edresson Date: Fri, 13 Aug 2021 00:49:38 -0300 Subject: [PATCH 105/220] Implement training support with d_vecs in the VITS model --- TTS/tts/models/vits.py | 9 +++++++-- TTS/tts/utils/speakers.py | 16 +++++++++++----- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index ae607c47..73dfbb2e 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -8,6 +8,7 @@ import torch from coqpit import Coqpit from torch import nn from torch.cuda.amp.autocast_mode import autocast +from torch.nn import functional as F from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor from TTS.tts.layers.vits.discriminator import VitsDiscriminator @@ -138,6 +139,9 @@ class VitsArgs(Coqpit): use_d_vector_file (bool): Enable/Disable the use of d-vectors for multi-speaker training. Defaults to False. + d_vector_file (str): + Path to the file including pre-computed speaker embeddings. Defaults to None. + d_vector_dim (int): Number of d-vector channels. Defaults to 0. 
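The arguments documented above are what switch VITS from a learned speaker-embedding table to external d-vectors. A sketch of a config using them, assuming pre-computed embeddings in a `d_vectors.json`-style file (the file path and the 512-dim embedding size are placeholders; the remaining `VitsConfig` fields are left at their defaults):

```python
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.models.vits import VitsArgs

model_args = VitsArgs(
    use_d_vector_file=True,
    d_vector_file="d_vectors.json",  # pre-computed speaker embeddings (placeholder path)
    d_vector_dim=512,                # must match the speaker encoder's output size
    use_speaker_embedding=False,     # d-vectors replace the learned speaker-id embedding
)

config = VitsConfig(model_args=model_args)
```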
@@ -179,6 +183,7 @@ class VitsArgs(Coqpit): use_speaker_embedding: bool = False num_speakers: int = 0 speakers_file: str = None + d_vector_file: str = None speaker_embedding_channels: int = 256 use_d_vector_file: bool = False d_vector_file: str = None @@ -360,7 +365,7 @@ class Vits(BaseTTS): if sid.ndim == 0: sid = sid.unsqueeze_(0) if "d_vectors" in aux_input and aux_input["d_vectors"] is not None: - g = aux_input["d_vectors"] + g = F.normalize(aux_input["d_vectors"]).unsqueeze(-1) return sid, g def get_aux_input(self, aux_input: Dict): @@ -400,7 +405,7 @@ class Vits(BaseTTS): x, m_p, logs_p, x_mask = self.text_encoder(x, x_lengths) # speaker embedding - if self.num_speakers > 1 and sid is not None: + if self.num_speakers > 1 and sid is not None and not self.use_d_vector: g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] # posterior encoder diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index ae001155..3d8590cc 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -154,15 +154,21 @@ class SpeakerManager: """ self._save_json(file_path, self.d_vectors) - def set_d_vectors_from_file(self, file_path: str) -> None: + def set_d_vectors_from_file(self, file_path: str, data: List = None) -> None: """Load d_vectors from a json file. Args: file_path (str): Path to the target json file. """ self.d_vectors = self._load_json(file_path) - speakers = sorted({x["name"] for x in self.d_vectors.values()}) - self.speaker_ids = {name: i for i, name in enumerate(speakers)} + + # load speakers from data, because during the training we can just use some speakers from d_vector_file + if data is not None: + self.speaker_ids, _ = self.parse_speakers_from_data(data) + else: + speakers = sorted({x["name"] for x in self.d_vectors.values()}) + self.speaker_ids = {name: i for i, name in enumerate(speakers)} + self.clip_ids = list(set(sorted(clip_name for clip_name in self.d_vectors.keys()))) def get_d_vector_by_clip(self, clip_idx: str) -> List: @@ -357,7 +363,7 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, "You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.d_vector_file" ) speaker_manager.load_d_vectors_file(c.d_vector_file) - speaker_manager.set_d_vectors_from_file(speakers_file) + speaker_manager.set_d_vectors_from_file(speakers_file, data=data) elif not c.use_d_vector_file: # restor speaker manager with speaker ID file. speaker_ids_from_data = speaker_manager.speaker_ids speaker_manager.set_speaker_ids_from_file(speakers_file) @@ -366,7 +372,7 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, ), " [!] You cannot introduce new speakers to a pre-trained model." elif c.use_d_vector_file and c.d_vector_file: # new speaker manager with external speaker embeddings. - speaker_manager.set_d_vectors_from_file(c.d_vector_file) + speaker_manager.set_d_vectors_from_file(c.d_vector_file, data=data) elif c.use_d_vector_file and not c.d_vector_file: raise "use_d_vector_file is True, so you need pass a external speaker embedding file." 
elif c.use_speaker_embedding and "speakers_file" in c and c.speakers_file: From 5f1c18187f35eb8648dca8de5f8941d921aeb77b Mon Sep 17 00:00:00 2001 From: Edresson Date: Fri, 13 Aug 2021 03:38:54 -0300 Subject: [PATCH 106/220] Fix pylint issues --- TTS/tts/datasets/formatters.py | 30 +++++++++++++++--------------- TTS/tts/models/vits.py | 2 +- TTS/tts/utils/speakers.py | 7 ++++--- requirements.txt | 2 +- 4 files changed, 21 insertions(+), 20 deletions(-) diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index bcbbb369..51ad892a 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -12,7 +12,7 @@ from tqdm import tqdm ######################## -def tweb(root_path, meta_file, **kwargs): +def tweb(root_path, meta_file, **kwargs): # pylint: disable=unused-argument """Normalize TWEB dataset. https://www.kaggle.com/bryanpark/the-world-english-bible-speech-dataset """ @@ -28,7 +28,7 @@ def tweb(root_path, meta_file, **kwargs): return items -def mozilla(root_path, meta_file, **kwargs): +def mozilla(root_path, meta_file, **kwargs): # pylint: disable=unused-argument """Normalizes Mozilla meta data files to TTS format""" txt_file = os.path.join(root_path, meta_file) items = [] @@ -43,7 +43,7 @@ def mozilla(root_path, meta_file, **kwargs): return items -def mozilla_de(root_path, meta_file, **kwargs): +def mozilla_de(root_path, meta_file, **kwargs): # pylint: disable=unused-argument """Normalizes Mozilla meta data files to TTS format""" txt_file = os.path.join(root_path, meta_file) items = [] @@ -102,7 +102,7 @@ def mailabs(root_path, meta_files=None): return items -def ljspeech(root_path, meta_file, **kwargs): +def ljspeech(root_path, meta_file, **kwargs): # pylint: disable=unused-argument """Normalizes the LJSpeech meta data file to TTS format https://keithito.com/LJ-Speech-Dataset/""" txt_file = os.path.join(root_path, meta_file) @@ -117,7 +117,7 @@ def ljspeech(root_path, meta_file, **kwargs): return items -def ljspeech_test(root_path, meta_file, **kwargs): +def ljspeech_test(root_path, meta_file, **kwargs): # pylint: disable=unused-argument """Normalizes the LJSpeech meta data file for TTS testing https://keithito.com/LJ-Speech-Dataset/""" txt_file = os.path.join(root_path, meta_file) @@ -131,7 +131,7 @@ def ljspeech_test(root_path, meta_file, **kwargs): return items -def sam_accenture(root_path, meta_file, **kwargs): +def sam_accenture(root_path, meta_file, **kwargs): # pylint: disable=unused-argument """Normalizes the sam-accenture meta data file to TTS format https://github.com/Sam-Accenture-Non-Binary-Voice/non-binary-voice-files""" xml_file = os.path.join(root_path, "voice_over_recordings", meta_file) @@ -148,7 +148,7 @@ def sam_accenture(root_path, meta_file, **kwargs): return items -def ruslan(root_path, meta_file, **kwargs): +def ruslan(root_path, meta_file, **kwargs): # pylint: disable=unused-argument """Normalizes the RUSLAN meta data file to TTS format https://ruslan-corpus.github.io/""" txt_file = os.path.join(root_path, meta_file) @@ -163,7 +163,7 @@ def ruslan(root_path, meta_file, **kwargs): return items -def css10(root_path, meta_file, **kwargs): +def css10(root_path, meta_file, **kwargs): # pylint: disable=unused-argument """Normalizes the CSS10 dataset file to TTS format""" txt_file = os.path.join(root_path, meta_file) items = [] @@ -177,7 +177,7 @@ def css10(root_path, meta_file, **kwargs): return items -def nancy(root_path, meta_file, **kwargs): +def nancy(root_path, meta_file, **kwargs): # pylint: disable=unused-argument 
"""Normalizes the Nancy meta data file to TTS format""" txt_file = os.path.join(root_path, meta_file) items = [] @@ -240,7 +240,7 @@ def libri_tts(root_path, meta_files=None, ununsed_speakers=None): return items -def custom_turkish(root_path, meta_file, **kwargs): +def custom_turkish(root_path, meta_file, **kwargs): # pylint: disable=unused-argument txt_file = os.path.join(root_path, meta_file) items = [] speaker_name = "turkish-female" @@ -298,7 +298,7 @@ def vctk(root_path, meta_files=None, wavs_path="wav48", ununsed_speakers=None): return items -def vctk_slim(root_path, meta_files=None, wavs_path="wav48", ununsed_speakers=None): +def vctk_slim(root_path, meta_files=None, wavs_path="wav48", ununsed_speakers=None): # pylint: disable=unused-argument """homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz""" items = [] txt_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) @@ -333,14 +333,14 @@ def mls(root_path, meta_files=None, ununsed_speakers=None): # ======================================== VOX CELEB =========================================== -def voxceleb2(root_path, meta_file=None, **kwargs): +def voxceleb2(root_path, meta_file=None, **kwargs): # pylint: disable=unused-argument """ :param meta_file Used only for consistency with load_tts_samples api """ return _voxcel_x(root_path, meta_file, voxcel_idx="2") -def voxceleb1(root_path, meta_file=None, **kwargs): +def voxceleb1(root_path, meta_file=None, **kwargs): # pylint: disable=unused-argument """ :param meta_file Used only for consistency with load_tts_samples api """ @@ -382,7 +382,7 @@ def _voxcel_x(root_path, meta_file, voxcel_idx): return [x.strip().split("|") for x in f.readlines()] -def baker(root_path: str, meta_file: str, **kwargs) -> List[List[str]]: +def baker(root_path: str, meta_file: str, **kwargs) -> List[List[str]]: # pylint: disable=unused-argument """Normalizes the Baker meta data file to TTS format Args: @@ -402,7 +402,7 @@ def baker(root_path: str, meta_file: str, **kwargs) -> List[List[str]]: return items -def kokoro(root_path, meta_file, **kwargs): +def kokoro(root_path, meta_file, **kwargs): # pylint: disable=unused-argument """Japanese single-speaker dataset from https://github.com/kaiidams/Kokoro-Speech-Dataset""" txt_file = os.path.join(root_path, meta_file) items = [] diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 73dfbb2e..417b6386 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -644,7 +644,7 @@ class Vits(BaseTTS): return self._log(ap, batch, outputs, "eval") @torch.no_grad() - def test_run(self, ap, eval_loader=None) -> Tuple[Dict, Dict]: + def test_run(self, ap) -> Tuple[Dict, Dict]: """Generic test run for `tts` models used by `Trainer`. You can override this for a different behaviour. 
diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 3d8590cc..5d883fd0 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -218,10 +218,11 @@ class SpeakerManager: def get_random_speaker_aux_input(self) -> Dict: if self.d_vectors: return {"speaker_id": None, "style_wav": None, "d_vector": self.d_vectors[random.choices(list(self.d_vectors.keys()))[0]]["embedding"]} - elif self.speaker_ids: + + if self.speaker_ids: return {"speaker_id": self.speaker_ids[random.choices(list(self.speaker_ids.keys()))[0]], "style_wav": None, "d_vector": None} - else: - return {"speaker_id": None, "style_wav": None, "d_vector": None} + + return {"speaker_id": None, "style_wav": None, "d_vector": None} def get_speakers(self) -> List: return self.speaker_ids diff --git a/requirements.txt b/requirements.txt index d21a972f..3ec33ceb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,7 +13,7 @@ pyyaml scipy>=0.19.0 soundfile tensorboardX -torch>=1.9.0 +torch>=1.7 tqdm numba==0.53 umap-learn==0.5.1 From f996afedb0b0be325359bbe1ab758cdc05ef44b5 Mon Sep 17 00:00:00 2001 From: Edresson Date: Fri, 13 Aug 2021 19:58:56 -0300 Subject: [PATCH 107/220] Implement multilingual dataloader support --- TTS/config/shared_configs.py | 3 +++ TTS/trainer.py | 14 ++++++++++++++ TTS/tts/datasets/__init__.py | 6 ++++++ TTS/tts/datasets/dataset.py | 23 +++++++++++++++++++---- TTS/tts/models/base_tts.py | 31 ++++++++++++++++++++++++++----- TTS/tts/utils/text/cleaners.py | 8 ++++++++ 6 files changed, 76 insertions(+), 9 deletions(-) diff --git a/TTS/config/shared_configs.py b/TTS/config/shared_configs.py index d91bf2b6..f1ea2e0f 100644 --- a/TTS/config/shared_configs.py +++ b/TTS/config/shared_configs.py @@ -199,6 +199,7 @@ class BaseDatasetConfig(Coqpit): path: str = "" meta_file_train: str = "" ununsed_speakers: List[str] = None + language: str = "" meta_file_val: str = "" meta_file_attn_mask: str = "" @@ -335,6 +336,8 @@ class BaseTrainingConfig(Coqpit): num_loader_workers: int = 0 num_eval_loader_workers: int = 0 use_noise_augment: bool = False + use_language_weighted_sampler: bool = False + # paths output_path: str = None # distributed diff --git a/TTS/trainer.py b/TTS/trainer.py index 2a2cfc46..2175875c 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -260,6 +260,20 @@ class Trainer: else: self.run_get_model(self.config, get_model) + if hasattr(self.model, "init_multilingual"): + self.model.init_multilingual(self.config, self.data_train + self.data_eval) + config = self.config.model_args if hasattr(self.config, "model_args") else self.config + # save speakers json + if config.use_language_embedding and self.model.language_manager.num_languages > 1: + self.model.language_manager.save_language_ids_to_file(os.path.join(self.output_path, "language_ids.json")) + if hasattr(self.config, "model_args"): + self.config.model_args["num_languages"] = self.model.language_manager.num_languages + else: + self.config.num_languages = self.model.language_manager.num_languages + + # update config file + copy_model_files(self.config, self.output_path, None) + # setup criterion self.criterion = self.get_criterion(self.model) diff --git a/TTS/tts/datasets/__init__.py b/TTS/tts/datasets/__init__.py index 741f92fd..3673e188 100644 --- a/TTS/tts/datasets/__init__.py +++ b/TTS/tts/datasets/__init__.py @@ -68,16 +68,22 @@ def load_tts_samples( meta_file_train = dataset["meta_file_train"] meta_file_val = dataset["meta_file_val"] ununsed_speakers = dataset["ununsed_speakers"] + language = dataset["language"] 
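# Illustration only, not part of this patch: a minimal sketch of a dataset entry carrying the
# new `language` field read above. The import mirrors the unit test added later in this series;
# the path and language code are hypothetical.
from TTS.tts.configs import BaseDatasetConfig

pt_dataset = BaseDatasetConfig(
    name="ljspeech",                  # formatter to use
    path="/data/portuguese_corpus",   # hypothetical dataset root
    meta_file_train="metadata.csv",
    meta_file_val="metadata.csv",
    language="pt-br",                 # appended to every sample as a 4th column: [text, wav, speaker, language]
)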
+ # setup the right data processor if formatter is None: formatter = _get_formatter_by_name(name) # load train set meta_data_train = formatter(root_path, meta_file_train, ununsed_speakers=ununsed_speakers) + # TODO: remove the loops and pass language as a parameter to preprocessor for faster load + meta_data_train = [[*item, language] for item in meta_data_train] + print(f" | > Found {len(meta_data_train)} files in {Path(root_path).resolve()}") # load evaluation split if set if eval_split: if meta_file_val: meta_data_eval = formatter(root_path, meta_file_val, ununsed_speakers=ununsed_speakers) + meta_data_eval = [[*item, language] for item in meta_data_eval] else: meta_data_eval, meta_data_train = split_dataset(meta_data_train) meta_data_eval_all += meta_data_eval diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index 04314bab..7ba97eba 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -37,6 +37,7 @@ class TTSDataset(Dataset): enable_eos_bos: bool = False, speaker_id_mapping: Dict = None, d_vector_mapping: Dict = None, + language_id_mapping: Dict = None, use_noise_augment: bool = False, verbose: bool = False, ): @@ -122,6 +123,7 @@ class TTSDataset(Dataset): self.enable_eos_bos = enable_eos_bos self.speaker_id_mapping = speaker_id_mapping self.d_vector_mapping = d_vector_mapping + self.language_id_mapping = language_id_mapping self.use_noise_augment = use_noise_augment self.verbose = verbose self.input_seq_computed = False @@ -197,10 +199,10 @@ class TTSDataset(Dataset): def load_data(self, idx): item = self.items[idx] - if len(item) == 4: - text, wav_file, speaker_name, attn_file = item + if len(item) == 5: + text, wav_file, speaker_name, language_name, attn_file = item else: - text, wav_file, speaker_name = item + text, wav_file, speaker_name, language_name = item attn = None raw_text = text @@ -218,7 +220,7 @@ class TTSDataset(Dataset): self.phoneme_cache_path, self.enable_eos_bos, self.cleaners, - self.phoneme_language, + language_name if language_name else self.phoneme_language, self.custom_symbols, self.characters, self.add_blank, @@ -260,6 +262,7 @@ class TTSDataset(Dataset): "attn": attn, "item_idx": self.items[idx][1], "speaker_name": speaker_name, + "language_name": language_name, "wav_file_name": os.path.basename(wav_file), } return sample @@ -413,6 +416,14 @@ class TTSDataset(Dataset): # convert list of dicts to dict of lists batch = {k: [dic[k] for dic in batch] for k in batch[0]} + speaker_names = [batch[idx]["speaker_name"] for idx in ids_sorted_decreasing] + + # get language ids from language names + if self.language_id_mapping is not None: + language_names = [batch[idx]["language_name"] for idx in ids_sorted_decreasing] + language_ids = [self.language_id_mapping[ln] for ln in language_names] + else: + language_ids = None # get pre-computed d-vectors if self.d_vector_mapping is not None: wav_files_names = [batch["wav_file_name"][idx] for idx in ids_sorted_decreasing] @@ -466,6 +477,9 @@ class TTSDataset(Dataset): if speaker_ids is not None: speaker_ids = torch.LongTensor(speaker_ids) + if language_ids is not None: + language_ids = torch.LongTensor(language_ids) + # compute linear spectrogram if self.compute_linear_spec: linear = [self.ap.spectrogram(w).astype("float32") for w in batch["wav"]] @@ -528,6 +542,7 @@ class TTSDataset(Dataset): "waveform": wav_padded, "raw_text": batch["raw_text"], "pitch": pitch, + "language_ids": language_ids } raise TypeError( diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py 
index 854526de..c55936a8 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -13,6 +13,7 @@ from TTS.model import BaseModel from TTS.tts.configs.shared_configs import CharactersConfig from TTS.tts.datasets.dataset import TTSDataset from TTS.tts.utils.speakers import SpeakerManager, get_speaker_manager +from TTS.tts.utils.languages import LanguageManager, get_language_weighted_sampler from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.text import make_symbols from TTS.tts.utils.visual import plot_alignment, plot_spectrogram @@ -73,9 +74,18 @@ class BaseTTS(BaseModel): def get_speaker_manager(config: Coqpit, restore_path: str, data: List, out_path: str = None) -> SpeakerManager: return get_speaker_manager(config, restore_path, data, out_path) - def init_multispeaker(self, config: Coqpit): - """Init speaker embedding layer if `use_speaker_embedding` is True and set the expected speaker embedding - vector dimension in the network. If model uses d-vectors, then it only sets the expected dimension. + def init_multispeaker(self, config: Coqpit, data: List = None): + """Initialize a speaker embedding layer if needen and define expected embedding channel size for defining + `in_channels` size of the connected layers. + + This implementation yields 3 possible outcomes: + + 1. If `config.use_speaker_embedding` and `config.use_d_vector_file are False, do nothing. + 2. If `config.use_d_vector_file` is True, set expected embedding channel size to `config.d_vector_dim` or 512. + 3. If `config.use_speaker_embedding`, initialize a speaker embedding layer with channel size of + `config.d_vector_dim` or 512. + + You can override this function for new models. Args: config (Coqpit): Model configuration. @@ -122,6 +132,7 @@ class BaseTTS(BaseModel): attn_mask = batch["attns"] waveform = batch["waveform"] pitch = batch["pitch"] + language_ids = batch["language_ids"] max_text_length = torch.max(text_lengths.float()) max_spec_length = torch.max(mel_lengths.float()) @@ -169,6 +180,7 @@ class BaseTTS(BaseModel): "item_idx": item_idx, "waveform": waveform, "pitch": pitch, + "language_ids": language_ids, } def get_data_loader( @@ -199,7 +211,12 @@ class BaseTTS(BaseModel): if hasattr(self, "make_symbols"): custom_symbols = self.make_symbols(self.config) - # init dataset + if hasattr(self, "language_manager"): + language_id_mapping = self.language_manager.language_id_mapping if self.args.use_language_embedding else None + else: + language_id_mapping = None + + # init dataloader dataset = TTSDataset( outputs_per_step=config.r if "r" in config else 1, text_cleaner=config.text_cleaner, @@ -223,6 +240,7 @@ class BaseTTS(BaseModel): verbose=verbose, speaker_id_mapping=speaker_id_mapping, d_vector_mapping=d_vector_mapping if config.use_d_vector_file else None, + language_id_mapping=language_id_mapping, ) # pre-compute phonemes @@ -267,8 +285,11 @@ class BaseTTS(BaseModel): # sampler for DDP sampler = DistributedSampler(dataset) if num_gpus > 1 else None + if sampler is None: + if getattr(config, "use_language_weighted_sampler", False): + sampler = get_language_weighted_sampler(dataset.items) + print(" > Using Language weighted sampler") - # init dataloader loader = DataLoader( dataset, batch_size=config.eval_batch_size if is_eval else config.batch_size, diff --git a/TTS/tts/utils/text/cleaners.py b/TTS/tts/utils/text/cleaners.py index 4b041ed8..71155ebc 100644 --- a/TTS/tts/utils/text/cleaners.py +++ b/TTS/tts/utils/text/cleaners.py @@ -135,3 +135,11 @@ def phoneme_cleaners(text): text = 
remove_aux_symbols(text) text = collapse_whitespace(text) return text + +def multilingual_cleaners(text): + '''Pipeline for multilingual text''' + text = lowercase(text) + text = replace_symbols(text, lang=None) + text = remove_aux_symbols(text) + text = collapse_whitespace(text) + return text \ No newline at end of file From dcb2374bc99e274c6ac3e3c541bd1a776d06423a Mon Sep 17 00:00:00 2001 From: Edresson Date: Fri, 13 Aug 2021 21:40:34 -0300 Subject: [PATCH 108/220] Add multilingual training support to the VITS model --- TTS/tts/layers/glow_tts/duration_predictor.py | 11 +- TTS/tts/layers/vits/networks.py | 12 +- .../vits/stochastic_duration_predictor.py | 11 +- TTS/tts/models/base_tts.py | 3 +- TTS/tts/models/vits.py | 87 +++++++++-- TTS/tts/utils/languages.py | 138 ++++++++++++++++++ TTS/tts/utils/speakers.py | 11 +- 7 files changed, 248 insertions(+), 25 deletions(-) create mode 100644 TTS/tts/utils/languages.py diff --git a/TTS/tts/layers/glow_tts/duration_predictor.py b/TTS/tts/layers/glow_tts/duration_predictor.py index 2c0303be..f46c73a9 100644 --- a/TTS/tts/layers/glow_tts/duration_predictor.py +++ b/TTS/tts/layers/glow_tts/duration_predictor.py @@ -18,7 +18,7 @@ class DurationPredictor(nn.Module): dropout_p (float): Dropout rate used after each conv layer. """ - def __init__(self, in_channels, hidden_channels, kernel_size, dropout_p, cond_channels=None): + def __init__(self, in_channels, hidden_channels, kernel_size, dropout_p, cond_channels=None, language_emb_dim=None): super().__init__() # class arguments self.in_channels = in_channels @@ -36,7 +36,10 @@ class DurationPredictor(nn.Module): if cond_channels is not None and cond_channels != 0: self.cond = nn.Conv1d(cond_channels, in_channels, 1) - def forward(self, x, x_mask, g=None): + if language_emb_dim != 0 and language_emb_dim is not None: + self.cond_lang = nn.Conv1d(language_emb_dim, in_channels, 1) + + def forward(self, x, x_mask, g=None, lang_emb=None): """ Shapes: - x: :math:`[B, C, T]` @@ -45,6 +48,10 @@ class DurationPredictor(nn.Module): """ if g is not None: x = x + self.cond(g) + + if lang_emb is not None: + x = x + self.cond_lang(lang_emb) + x = self.conv_1(x * x_mask) x = torch.relu(x) x = self.norm_1(x) diff --git a/TTS/tts/layers/vits/networks.py b/TTS/tts/layers/vits/networks.py index cfc8b6ac..ef426ace 100644 --- a/TTS/tts/layers/vits/networks.py +++ b/TTS/tts/layers/vits/networks.py @@ -37,6 +37,7 @@ class TextEncoder(nn.Module): num_layers: int, kernel_size: int, dropout_p: float, + language_emb_dim: int = None, ): """Text Encoder for VITS model. 
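# Illustration only, not part of this patch: the shape logic implemented by the surrounding
# hunks. A per-utterance language embedding is broadcast along the time axis and concatenated
# to the character embeddings before the relative-position transformer runs. Sizes are made up.
import torch

B, T, H, L = 2, 13, 192, 4           # batch, text length, hidden channels, language emb dim
char_emb = torch.randn(B, T, H)      # output of nn.Embedding(n_vocab, hidden_channels)
lang_emb = torch.randn(B, L, 1)      # emb_l(lid).unsqueeze(-1) -> [B, L, 1]
x = torch.cat((char_emb, lang_emb.transpose(2, 1).expand(B, T, -1)), dim=-1)
assert x.shape == (B, T, H + L)      # encoder width becomes hidden_channels + language_emb_dim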
@@ -55,8 +56,12 @@ class TextEncoder(nn.Module): self.hidden_channels = hidden_channels self.emb = nn.Embedding(n_vocab, hidden_channels) + nn.init.normal_(self.emb.weight, 0.0, hidden_channels ** -0.5) + if language_emb_dim: + hidden_channels += language_emb_dim + self.encoder = RelativePositionTransformer( in_channels=hidden_channels, out_channels=hidden_channels, @@ -72,13 +77,18 @@ class TextEncoder(nn.Module): self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) - def forward(self, x, x_lengths): + def forward(self, x, x_lengths, lang_emb=None): """ Shapes: - x: :math:`[B, T]` - x_length: :math:`[B]` """ x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h] + + # concat the lang emb in embedding chars + if lang_emb is not None: + x = torch.cat((x, lang_emb.transpose(2, 1).expand(x.size(0), x.size(1), -1)), dim=-1) + x = torch.transpose(x, 1, -1) # [b, h, t] x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) diff --git a/TTS/tts/layers/vits/stochastic_duration_predictor.py b/TTS/tts/layers/vits/stochastic_duration_predictor.py index 91e53da3..8ec7c866 100644 --- a/TTS/tts/layers/vits/stochastic_duration_predictor.py +++ b/TTS/tts/layers/vits/stochastic_duration_predictor.py @@ -178,7 +178,7 @@ class StochasticDurationPredictor(nn.Module): """ def __init__( - self, in_channels: int, hidden_channels: int, kernel_size: int, dropout_p: float, num_flows=4, cond_channels=0 + self, in_channels: int, hidden_channels: int, kernel_size: int, dropout_p: float, num_flows=4, cond_channels=0, language_emb_dim=None ): super().__init__() @@ -205,7 +205,10 @@ class StochasticDurationPredictor(nn.Module): if cond_channels != 0 and cond_channels is not None: self.cond = nn.Conv1d(cond_channels, hidden_channels, 1) - def forward(self, x, x_mask, dr=None, g=None, reverse=False, noise_scale=1.0): + if language_emb_dim != 0 and language_emb_dim is not None: + self.cond_lang = nn.Conv1d(language_emb_dim, hidden_channels, 1) + + def forward(self, x, x_mask, dr=None, g=None, lang_emb=None, reverse=False, noise_scale=1.0): """ Shapes: - x: :math:`[B, C, T]` @@ -217,6 +220,10 @@ class StochasticDurationPredictor(nn.Module): x = self.pre(x) if g is not None: x = x + self.cond(g) + + if lang_emb is not None: + x = x + self.cond_lang(lang_emb) + x = self.convs(x, x_mask) x = self.proj(x) * x_mask diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index c55936a8..c0d2bd78 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -287,8 +287,9 @@ class BaseTTS(BaseModel): sampler = DistributedSampler(dataset) if num_gpus > 1 else None if sampler is None: if getattr(config, "use_language_weighted_sampler", False): - sampler = get_language_weighted_sampler(dataset.items) print(" > Using Language weighted sampler") + sampler = get_language_weighted_sampler(dataset.items) + loader = DataLoader( dataset, diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 417b6386..3a682ce5 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -17,6 +17,7 @@ from TTS.tts.layers.vits.stochastic_duration_predictor import StochasticDuration from TTS.tts.models.base_tts import BaseTTS from TTS.tts.utils.helpers import generate_path, maximum_path, rand_segments, segment, sequence_mask from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.languages import LanguageManager from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.visual import plot_alignment from TTS.utils.trainer_utils import get_optimizer, 
get_scheduler @@ -189,6 +190,9 @@ class VitsArgs(Coqpit): d_vector_file: str = None d_vector_dim: int = 0 detach_dp_input: bool = True + use_language_embedding: bool = False + embedded_language_dim: int = 4 + num_languages: int = 0 class Vits(BaseTTS): @@ -247,6 +251,7 @@ class Vits(BaseTTS): self.args = args self.init_multispeaker(config) + self.init_multilingual(config) self.length_scale = args.length_scale self.noise_scale = args.noise_scale @@ -265,6 +270,7 @@ class Vits(BaseTTS): args.num_layers_text_encoder, args.kernel_size_text_encoder, args.dropout_p_text_encoder, + language_emb_dim=self.embedded_language_dim ) self.posterior_encoder = PosteriorEncoder( @@ -288,16 +294,22 @@ class Vits(BaseTTS): if args.use_sdp: self.duration_predictor = StochasticDurationPredictor( - args.hidden_channels, + args.hidden_channels + self.embedded_language_dim, 192, 3, args.dropout_p_duration_predictor, 4, cond_channels=self.embedded_speaker_dim, + language_emb_dim=self.embedded_language_dim, ) else: self.duration_predictor = DurationPredictor( - args.hidden_channels, 256, 3, args.dropout_p_duration_predictor, cond_channels=self.embedded_speaker_dim + args.hidden_channels + self.embedded_language_dim, + 256, + 3, + args.dropout_p_duration_predictor, + cond_channels=self.embedded_speaker_dim, + language_emb_dim=self.embedded_language_dim, ) self.waveform_decoder = HifiganGenerator( @@ -356,17 +368,40 @@ class Vits(BaseTTS): self.speaker_manager = SpeakerManager(d_vectors_file_path=config.d_vector_file) self.embedded_speaker_dim = config.d_vector_dim + def init_multilingual(self, config: Coqpit, data: List = None): + """Initialize multilingual modules of a model. + + Args: + config (Coqpit): Model configuration. + data (List, optional): Dataset items to infer number of speakers. Defaults to None. + """ + if hasattr(config, "model_args"): + config = config.model_args + # init language manager + self.language_manager = LanguageManager(config, data=data) + + # init language embedding layer + if config.use_language_embedding: + self.embedded_language_dim = config.embedded_language_dim + self.emb_l = nn.Embedding(self.language_manager.num_languages, self.embedded_language_dim) + torch.nn.init.xavier_uniform_(self.emb_l.weight) + else: + self.embedded_language_dim = 0 + self.emb_l = None + @staticmethod def _set_cond_input(aux_input: Dict): """Set the speaker conditioning input based on the multi-speaker mode.""" - sid, g = None, None + sid, g, lid = None, None, None if "speaker_ids" in aux_input and aux_input["speaker_ids"] is not None: sid = aux_input["speaker_ids"] if sid.ndim == 0: sid = sid.unsqueeze_(0) if "d_vectors" in aux_input and aux_input["d_vectors"] is not None: g = F.normalize(aux_input["d_vectors"]).unsqueeze(-1) - return sid, g + if "language_ids" in aux_input and aux_input["language_ids"] is not None: + lid = aux_input["language_ids"] + return sid, g, lid def get_aux_input(self, aux_input: Dict): sid, g = self._set_cond_input(aux_input) @@ -378,7 +413,7 @@ class Vits(BaseTTS): x_lengths: torch.tensor, y: torch.tensor, y_lengths: torch.tensor, - aux_input={"d_vectors": None, "speaker_ids": None}, + aux_input={"d_vectors": None, "speaker_ids": None, "language_ids": None}, ) -> Dict: """Forward pass of the model. 
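# Illustration only, not part of this patch: how the LanguageManager consumed by
# init_multilingual() above turns the language column of the data samples into integer ids
# (see the new TTS/tts/utils/languages.py later in this patch). The sample rows are made up.
items = [
    ["ola mundo", "wavs/a.wav", "spk1", "pt-br"],
    ["hello world", "wavs/b.wav", "spk2", "en"],
    ["hallo welt", "wavs/c.wav", "spk3", "de"],
]
languages = sorted({item[3] for item in items})
language_id_mapping = {name: i for i, name in enumerate(languages)}
# -> {"de": 0, "en": 1, "pt-br": 2}; emb_l is then nn.Embedding(num_languages, embedded_language_dim)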
@@ -401,13 +436,19 @@ class Vits(BaseTTS): - speaker_ids: :math:`[B]` """ outputs = {} - sid, g = self._set_cond_input(aux_input) - x, m_p, logs_p, x_mask = self.text_encoder(x, x_lengths) + sid, g, lid = self._set_cond_input(aux_input) # speaker embedding if self.num_speakers > 1 and sid is not None and not self.use_d_vector: g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] + # language embedding + if self.args.use_language_embedding: + lang_emb = self.emb_l(lid).unsqueeze(-1) + + x, m_p, logs_p, x_mask = self.text_encoder(x, x_lengths, lang_emb=lang_emb) + + # posterior encoder z, m_q, logs_q, y_mask = self.posterior_encoder(y, y_lengths, g=g) @@ -433,6 +474,7 @@ class Vits(BaseTTS): x_mask, attn_durations, g=g.detach() if self.args.detach_dp_input and g is not None else g, + lang_emb=lang_emb.detach() if self.args.detach_dp_input and lang_emb is not None else lang_emb, ) loss_duration = loss_duration / torch.sum(x_mask) else: @@ -441,6 +483,7 @@ class Vits(BaseTTS): x.detach() if self.args.detach_dp_input else x, x_mask, g=g.detach() if self.args.detach_dp_input and g is not None else g, + lang_emb=lang_emb.detach() if self.args.detach_dp_input and lang_emb is not None else lang_emb, ) loss_duration = torch.sum((log_durations - attn_log_durations) ** 2, [1, 2]) / torch.sum(x_mask) outputs["loss_duration"] = loss_duration @@ -467,25 +510,30 @@ class Vits(BaseTTS): ) return outputs - def inference(self, x, aux_input={"d_vectors": None, "speaker_ids": None}): + def inference(self, x, aux_input={"d_vectors": None, "speaker_ids": None, "language_ids": None}): """ Shapes: - x: :math:`[B, T_seq]` - d_vectors: :math:`[B, C, 1]` - speaker_ids: :math:`[B]` """ - sid, g = self._set_cond_input(aux_input) + sid, g, lid = self._set_cond_input(aux_input) x_lengths = torch.tensor(x.shape[1:2]).to(x.device) - x, m_p, logs_p, x_mask = self.text_encoder(x, x_lengths) - - if self.num_speakers > 0 and sid is not None: + # speaker embedding + if self.num_speakers > 0 and sid: g = self.emb_g(sid).unsqueeze(-1) + # language embedding + if self.args.use_language_embedding: + lang_emb = self.emb_l(lid).unsqueeze(-1) + + x, m_p, logs_p, x_mask = self.text_encoder(x, x_lengths, lang_emb=lang_emb) + if self.args.use_sdp: - logw = self.duration_predictor(x, x_mask, g=g, reverse=True, noise_scale=self.inference_noise_scale_dp) + logw = self.duration_predictor(x, x_mask, g=g, reverse=True, noise_scale=self.inference_noise_scale_dp, lang_emb=lang_emb) else: - logw = self.duration_predictor(x, x_mask, g=g) + logw = self.duration_predictor(x, x_mask, g=g, lang_emb=lang_emb) w = torch.exp(logw) * x_mask * self.length_scale w_ceil = torch.ceil(w) @@ -537,6 +585,7 @@ class Vits(BaseTTS): linear_input = batch["linear_input"] d_vectors = batch["d_vectors"] speaker_ids = batch["speaker_ids"] + language_ids = batch["language_ids"] waveform = batch["waveform"] # generator pass @@ -545,7 +594,7 @@ class Vits(BaseTTS): text_lengths, linear_input.transpose(1, 2), mel_lengths, - aux_input={"d_vectors": d_vectors, "speaker_ids": speaker_ids}, + aux_input={"d_vectors": d_vectors, "speaker_ids": speaker_ids, "language_ids": language_ids}, ) # cache tensors for the discriminator @@ -581,6 +630,14 @@ class Vits(BaseTTS): loss_duration=outputs["loss_duration"], ) + # handle the duration loss + if self.args.use_sdp: + loss_dict["nll_duration"] = outputs["nll_duration"] + loss_dict["loss"] += outputs["nll_duration"] + else: + loss_dict["loss_duration"] = outputs["loss_duration"] + loss_dict["loss"] += outputs["loss_duration"] + elif 
optimizer_idx == 1: # discriminator pass outputs = {} diff --git a/TTS/tts/utils/languages.py b/TTS/tts/utils/languages.py new file mode 100644 index 00000000..b87b9936 --- /dev/null +++ b/TTS/tts/utils/languages.py @@ -0,0 +1,138 @@ +import os +import json +import torch +import fsspec +import numpy as np +from typing import Dict, Tuple, List +from coqpit import Coqpit + +from torch.utils.data.sampler import WeightedRandomSampler + +class LanguageManager: + """Manage the languages for multi-lingual 🐸TTS models. Load a datafile and parse the information + in a way that can be queried by language. + + Args: + language_id_file_path (str, optional): Path to the metafile that maps language names to ids used by + TTS models. Defaults to "". + + Examples: + >>> manager = LanguageManager(language_id_file_path=language_id_file_path) + >>> language_id_mapper = manager.language_ids + """ + num_languages: int = 0 + language_id_mapping: Dict = {} + def __init__( + self, + language_id_file_path: str = "", + ): + if language_id_file_path: + self.set_language_ids_from_file(language_id_file_path) + + @staticmethod + def _load_json(json_file_path: str) -> Dict: + with fsspec.open(json_file_path, "r") as f: + return json.load(f) + + @staticmethod + def _save_json(json_file_path: str, data: dict) -> None: + with fsspec.open(json_file_path, "w") as f: + json.dump(data, f, indent=4) + + @property + def num_languages(self) -> int: + return len(list(self.language_id_mapping.keys())) + + @property + def language_names(self) -> List: + return list(self.language_id_mapping.keys()) + + @staticmethod + def parse_languages_from_data(items: list) -> Tuple[Dict, int]: + """Parse language IDs from data samples retured by `load_meta_data()`. + + Args: + items (list): Data sampled returned by `load_meta_data()`. + + Returns: + Tuple[Dict, int]: language IDs and number of languages. + """ + languages = sorted({item[3] for item in items}) + language_ids = {name: i for i, name in enumerate(languages)} + num_languages = len(language_ids) + return language_ids, num_languages + + def set_language_ids_from_data(self, items: List) -> None: + """Set language IDs from data samples. + + Args: + items (List): Data sampled returned by `load_meta_data()`. + """ + self.language_id_mapping, _ = self.parse_languages_from_data(items) + + def set_language_ids_from_file(self, file_path: str) -> None: + """Load language ids from a json file. + + Args: + file_path (str): Path to the target json file. + """ + self.language_id_mapping = self._load_json(file_path) + self.num_languages = len(self.language_id_mapping) + + def save_language_ids_to_file(self, file_path: str) -> None: + """Save language IDs to a json file. + + Args: + file_path (str): Path to the output file. + """ + self._save_json(file_path, self.language_id_mapping) + +def _set_file_path(path): + """Find the language_ids.json under the given path or the above it. + Intended to band aid the different paths returned in restored and continued training.""" + path_restore = os.path.join(os.path.dirname(path), "language_ids.json") + path_continue = os.path.join(path, "language_ids.json") + fs = fsspec.get_mapper(path).fs + if fs.exists(path_restore): + return path_restore + if fs.exists(path_continue): + return path_continue + return None + +def get_language_manager(c: Coqpit, data: List = None, restore_path: str = None, out_path: str = None) -> LanguageManager: + """Initiate a `LanguageManager` instance by the provided config. + + Args: + c (Coqpit): Model configuration. 
+ restore_path (str): Path to a previous training folder. + data (List): Data sampled returned by `load_meta_data()`. Defaults to None. + out_path (str, optional): Save the generated language IDs to a output path. Defaults to None. + + Returns: + SpeakerManager: initialized and ready to use instance. + """ + language_manager = LanguageManager() + if c.use_language_embedding: + if data is not None: + language_manager.set_language_ids_from_data(data) + if restore_path: + language_file = _set_file_path(restore_path) + # restoring language manager from a previous run. + if language_file: + language_manager.set_language_ids_from_file(language_file) + if language_manager.num_languages > 0: + print( + " > Language manager is loaded with {} languages: {}".format( + language_manager.num_languages, ", ".join(language_manager.language_names) + ) + ) + return language_manager + +def get_language_weighted_sampler(items: list): + language_names = np.array([item[3] for item in items]) + unique_language_names = np.unique(language_names).tolist() + language_ids = [unique_language_names.index(l) for l in language_names] + language_count = np.array([len(np.where(language_names == l)[0]) for l in unique_language_names]) + weight_language = 1. / language_count + dataset_samples_weight = torch.from_numpy(np.array([weight_language[l] for l in language_ids])).double() + return WeightedRandomSampler(dataset_samples_weight, len(dataset_samples_weight)) \ No newline at end of file diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 5d883fd0..b7dd5251 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -379,11 +379,14 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, elif c.use_speaker_embedding and "speakers_file" in c and c.speakers_file: # new speaker manager with speaker IDs file. speaker_manager.set_speaker_ids_from_file(c.speakers_file) - print( - " > Speaker manager is loaded with {} speakers: {}".format( - speaker_manager.num_speakers, ", ".join(speaker_manager.speaker_ids) + + if speaker_manager.num_speakers > 0: + print( + " > Speaker manager is loaded with {} speakers: {}".format( + speaker_manager.num_speakers, ", ".join(speaker_manager.speaker_ids) + ) ) - ) + # save file if path is defined if out_path: out_file_path = os.path.join(out_path, "speakers.json") From ac9416fb86b568f89a198fd7a2060051b3e198c5 Mon Sep 17 00:00:00 2001 From: Edresson Date: Sat, 14 Aug 2021 17:52:00 -0300 Subject: [PATCH 109/220] Add multilingual inference support --- TTS/tts/configs/vits_config.py | 12 +++---- TTS/tts/models/base_tts.py | 45 +++++++++++++++++++++++++++ TTS/tts/models/vits.py | 57 +++++++++++++++++++--------------- TTS/tts/utils/speakers.py | 41 +++++++++++++++++++++--- TTS/tts/utils/synthesis.py | 22 ++++++++----- 5 files changed, 133 insertions(+), 44 deletions(-) diff --git a/TTS/tts/configs/vits_config.py b/TTS/tts/configs/vits_config.py index d490e6e6..3e031f02 100644 --- a/TTS/tts/configs/vits_config.py +++ b/TTS/tts/configs/vits_config.py @@ -130,13 +130,13 @@ class VitsConfig(BaseTTSConfig): add_blank: bool = True # testing - test_sentences: List[str] = field( + test_sentences: List[List] = field( default_factory=lambda: [ - "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", - "Be a voice, not an echo.", - "I'm sorry Dave. I'm afraid I can't do that.", - "This cake is great. 
It's so delicious and moist.", - "Prior to November 22, 1963.", + ["It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."], + ["Be a voice, not an echo."], + ["I'm sorry Dave. I'm afraid I can't do that."], + ["This cake is great. It's so delicious and moist."], + ["Prior to November 22, 1963."], ] ) diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index c0d2bd78..bfa6df14 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -107,6 +107,51 @@ class BaseTTS(BaseModel): self.speaker_embedding = nn.Embedding(self.num_speakers, self.embedded_speaker_dim) self.speaker_embedding.weight.data.normal_(0, 0.3) + def get_aux_input(self, **kwargs) -> Dict: + """Prepare and return `aux_input` used by `forward()`""" + return {"speaker_id": None, "style_wav": None, "d_vector": None, "language_id": None} + + def get_aux_input_from_test_setences(self, sentence_info): + if hasattr(self.config, "model_args"): + config = self.config.model_args + else: + config = self.config + + # extract speaker and language info + text, speaker_name, style_wav, language_name = None, None, None, None + + if isinstance(sentence_info, list): + if len(sentence_info) == 1: + text = sentence_info[0] + elif len(sentence_info) == 2: + text, speaker_name = sentence_info + elif len(sentence_info) == 3: + text, speaker_name, style_wav = sentence_info + elif len(sentence_info) == 4: + text, speaker_name, style_wav, language_name = sentence_info + else: + text = sentence_info + + # get speaker id/d_vector + speaker_id, d_vector, language_id = None, None, None + if hasattr(self, "speaker_manager") and config.use_speaker_embedding: + if config.use_d_vector_file: + if speaker_name is None: + d_vector = self.speaker_manager.get_random_d_vector() + else: + d_vector = self.speaker_manager.get_d_vector_by_speaker(speaker_name) + else: + if speaker_name is None: + speaker_id = self.speaker_manager.get_random_speaker_id() + else: + speaker_id = self.speaker_manager.speaker_ids[speaker_name] + + # get language id + if hasattr(self, "language_manager") and config.use_language_embedding and language_name is not None: + language_id = self.language_manager.language_id_mapping[language_name] + + return {"text": text, "speaker_id": speaker_id, "style_wav": style_wav, "d_vector": d_vector, "language_id": language_id} + def format_batch(self, batch: Dict) -> Dict: """Generic batch formatting for `TTSDataset`. 
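# Illustration only, not part of this patch: the richer test-sentence format parsed by
# get_aux_input_from_test_setences() above. Speaker and language names are hypothetical and
# must exist in the trained model's speaker/language managers.
test_sentences = [
    ["Be a voice, not an echo."],                           # text only
    ["Be a voice, not an echo.", "VCTK_p225"],              # text + speaker name
    ["Be a voice, not an echo.", "VCTK_p225", None, "en"],  # text, speaker, style_wav, language
]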
diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 3a682ce5..11f1fab0 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -399,8 +399,14 @@ class Vits(BaseTTS): sid = sid.unsqueeze_(0) if "d_vectors" in aux_input and aux_input["d_vectors"] is not None: g = F.normalize(aux_input["d_vectors"]).unsqueeze(-1) + if g.ndim == 2: + g = g.unsqueeze_(0) + if "language_ids" in aux_input and aux_input["language_ids"] is not None: lid = aux_input["language_ids"] + if lid.ndim == 0: + lid = lid.unsqueeze_(0) + return sid, g, lid def get_aux_input(self, aux_input: Dict): @@ -437,9 +443,8 @@ class Vits(BaseTTS): """ outputs = {} sid, g, lid = self._set_cond_input(aux_input) - # speaker embedding - if self.num_speakers > 1 and sid is not None and not self.use_d_vector: + if self.args.use_speaker_embedding and sid is not None and not self.use_d_vector: g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] # language embedding @@ -521,11 +526,11 @@ class Vits(BaseTTS): x_lengths = torch.tensor(x.shape[1:2]).to(x.device) # speaker embedding - if self.num_speakers > 0 and sid: + if self.args.use_speaker_embedding and sid is not None and not self.use_d_vector: g = self.emb_g(sid).unsqueeze(-1) # language embedding - if self.args.use_language_embedding: + if self.args.use_language_embedding and lid is not None: lang_emb = self.emb_l(lid).unsqueeze(-1) x, m_p, logs_p, x_mask = self.text_encoder(x, x_lengths, lang_emb=lang_emb) @@ -713,29 +718,29 @@ class Vits(BaseTTS): test_audios = {} test_figures = {} test_sentences = self.config.test_sentences - if hasattr(self, "speaker_manager"): - aux_inputs = self.speaker_manager.get_random_speaker_aux_input() - else: - aux_inputs = self.get_aux_input() - for idx, sen in enumerate(test_sentences): + for idx, s_info in enumerate(test_sentences): + try: + aux_inputs = self.get_aux_input_from_test_setences(s_info) + wav, alignment, _, _ = synthesis( + self, + aux_inputs["text"], + self.config, + "cuda" in str(next(self.parameters()).device), + ap, + speaker_id=aux_inputs["speaker_id"], + d_vector=aux_inputs["d_vector"], + style_wav=aux_inputs["style_wav"], + language_id=aux_inputs["language_id"], + enable_eos_bos_chars=self.config.enable_eos_bos_chars, + use_griffin_lim=True, + do_trim_silence=False, + ).values() - wav, alignment, _, _ = synthesis( - self, - sen, - self.config, - "cuda" in str(next(self.parameters()).device), - ap, - speaker_id=aux_inputs["speaker_id"], - d_vector=aux_inputs["d_vector"], - style_wav=aux_inputs["style_wav"], - enable_eos_bos_chars=self.config.enable_eos_bos_chars, - use_griffin_lim=True, - do_trim_silence=False, - ).values() - - test_audios["{}-audio".format(idx)] = wav - test_figures["{}-alignment".format(idx)] = plot_alignment(alignment.T, output_fig=False) + test_audios["{}-audio".format(idx)] = wav + test_figures["{}-alignment".format(idx)] = plot_alignment(alignment.T, output_fig=False) + except: # pylint: disable=bare-except + print(" !! 
Error creating Test Sentence -", idx) return test_figures, test_audios def get_optimizer(self) -> List: @@ -832,3 +837,5 @@ class Vits(BaseTTS): if eval: self.eval() assert not self.training + + diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index b7dd5251..1497ca74 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -193,6 +193,20 @@ class SpeakerManager: """ return [x["embedding"] for x in self.d_vectors.values() if x["name"] == speaker_idx] + def get_d_vector_by_speaker(self, speaker_idx: str) -> np.ndarray: + """Get a d_vector of a speaker. + + Args: + speaker_idx (str): Target speaker ID. + + Returns: + np.ndarray: d_vector. + """ + for x in self.d_vectors.values(): + if x["name"] == speaker_idx: + return x["embedding"] + return None + def get_mean_d_vector(self, speaker_idx: str, num_samples: int = None, randomize: bool = False) -> np.ndarray: """Get mean d_vector of a speaker ID. @@ -215,14 +229,31 @@ class SpeakerManager: d_vectors = np.stack(d_vectors[:num_samples]).mean(0) return d_vectors - def get_random_speaker_aux_input(self) -> Dict: - if self.d_vectors: - return {"speaker_id": None, "style_wav": None, "d_vector": self.d_vectors[random.choices(list(self.d_vectors.keys()))[0]]["embedding"]} + def get_random_speaker_id(self) -> Any: + """Get a random d_vector. + Args: + + Returns: + np.ndarray: d_vector. + """ if self.speaker_ids: - return {"speaker_id": self.speaker_ids[random.choices(list(self.speaker_ids.keys()))[0]], "style_wav": None, "d_vector": None} + return self.speaker_ids[random.choices(list(self.speaker_ids.keys()))[0]] - return {"speaker_id": None, "style_wav": None, "d_vector": None} + return None + + def get_random_d_vector(self) -> Any: + """Get a random D ID. + + Args: + + Returns: + np.ndarray: d_vector. + """ + if self.d_vectors: + return self.d_vectors[random.choices(list(self.d_vectors.keys()))[0]]["embedding"] + + return None def get_speakers(self) -> List: return self.speaker_ids diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 578c26c0..63fe92c3 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -71,6 +71,7 @@ def run_model_torch( speaker_id: int = None, style_mel: torch.Tensor = None, d_vector: torch.Tensor = None, + language_id: torch.Tensor = None, ) -> Dict: """Run a torch model for inference. It does not support batch inference. 
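# Illustration only, not part of this patch: a hedged usage sketch mirroring how the reworked
# Vits.test_run() above drives synthesis() with the new language_id argument. The model and
# audio processor are assumed to be a trained multilingual Vits and its audio processor.
from TTS.tts.utils.synthesis import synthesis

def synthesize_in_language(model, ap, text, speaker_id=0, language_id=1):
    """Run Griffin-Lim synthesis for one sentence with an explicit language id."""
    wav, alignment, _, _ = synthesis(
        model,
        text,
        model.config,
        False,                     # use_cuda
        ap,
        speaker_id=speaker_id,
        d_vector=None,
        style_wav=None,
        language_id=language_id,   # index from LanguageManager.language_id_mapping
        use_griffin_lim=True,
        do_trim_silence=False,
    ).values()
    return wav, alignment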
@@ -96,6 +97,7 @@ def run_model_torch( "speaker_ids": speaker_id, "d_vectors": d_vector, "style_mel": style_mel, + "language_ids": language_id, }, ) return outputs @@ -160,13 +162,13 @@ def inv_spectrogram(postnet_output, ap, CONFIG): return wav -def speaker_id_to_torch(speaker_id, cuda=False): - if speaker_id is not None: - speaker_id = np.asarray(speaker_id) - speaker_id = torch.from_numpy(speaker_id) +def id_to_torch(aux_id, cuda=False): + if aux_id is not None: + aux_id = np.asarray(aux_id) + aux_id = torch.from_numpy(aux_id) if cuda: - return speaker_id.cuda() - return speaker_id + return aux_id.cuda() + return aux_id def embedding_to_torch(d_vector, cuda=False): @@ -208,6 +210,7 @@ def synthesis( use_griffin_lim=False, do_trim_silence=False, d_vector=None, + language_id=None, backend="torch", ): """Synthesize voice for the given text using Griffin-Lim vocoder or just compute output features to be passed to @@ -262,11 +265,14 @@ def synthesis( # pass tensors to backend if backend == "torch": if speaker_id is not None: - speaker_id = speaker_id_to_torch(speaker_id, cuda=use_cuda) + speaker_id = id_to_torch(speaker_id, cuda=use_cuda) if d_vector is not None: d_vector = embedding_to_torch(d_vector, cuda=use_cuda) + if language_id is not None: + language_id = id_to_torch(language_id, cuda=use_cuda) + if not isinstance(style_mel, dict): style_mel = numpy_to_torch(style_mel, torch.float, cuda=use_cuda) text_inputs = numpy_to_torch(text_inputs, torch.long, cuda=use_cuda) @@ -278,7 +284,7 @@ def synthesis( text_inputs = tf.expand_dims(text_inputs, 0) # synthesize voice if backend == "torch": - outputs = run_model_torch(model, text_inputs, speaker_id, style_mel, d_vector=d_vector) + outputs = run_model_torch(model, text_inputs, speaker_id, style_mel, d_vector=d_vector, language_id=language_id) model_outputs = outputs["model_outputs"] model_outputs = model_outputs[0].data.cpu().numpy() alignments = outputs["alignments"] From 3fbbebd74d1d750e56c170407fe5c3b3601a0636 Mon Sep 17 00:00:00 2001 From: Edresson Date: Sun, 15 Aug 2021 13:44:58 -0300 Subject: [PATCH 110/220] Fix pylint issues --- TTS/tts/configs/vits_config.py | 4 ++-- TTS/tts/models/vits.py | 4 +--- TTS/tts/utils/languages.py | 6 ++---- TTS/tts/utils/text/cleaners.py | 2 +- tests/data_tests/test_loader.py | 5 +++++ 5 files changed, 11 insertions(+), 10 deletions(-) diff --git a/TTS/tts/configs/vits_config.py b/TTS/tts/configs/vits_config.py index 3e031f02..cc3e4940 100644 --- a/TTS/tts/configs/vits_config.py +++ b/TTS/tts/configs/vits_config.py @@ -82,8 +82,8 @@ class VitsConfig(BaseTTSConfig): add_blank (bool): If true, a blank token is added in between every character. Defaults to `True`. - test_sentences (List[str]): - List of sentences to be used for testing. + test_sentences (List[List]): + List of sentences with speaker and language information to be used for testing. Note: Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the inherited parameters. diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 11f1fab0..6fe60fa0 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -740,7 +740,7 @@ class Vits(BaseTTS): test_audios["{}-audio".format(idx)] = wav test_figures["{}-alignment".format(idx)] = plot_alignment(alignment.T, output_fig=False) except: # pylint: disable=bare-except - print(" !! Error creating Test Sentence -", idx) + print(" !! 
Error creating Test Sentence -", idx) return test_figures, test_audios def get_optimizer(self) -> List: @@ -837,5 +837,3 @@ class Vits(BaseTTS): if eval: self.eval() assert not self.training - - diff --git a/TTS/tts/utils/languages.py b/TTS/tts/utils/languages.py index b87b9936..94be914c 100644 --- a/TTS/tts/utils/languages.py +++ b/TTS/tts/utils/languages.py @@ -20,7 +20,6 @@ class LanguageManager: >>> manager = LanguageManager(language_id_file_path=language_id_file_path) >>> language_id_mapper = manager.language_ids """ - num_languages: int = 0 language_id_mapping: Dict = {} def __init__( self, @@ -77,7 +76,6 @@ class LanguageManager: file_path (str): Path to the target json file. """ self.language_id_mapping = self._load_json(file_path) - self.num_languages = len(self.language_id_mapping) def save_language_ids_to_file(self, file_path: str) -> None: """Save language IDs to a json file. @@ -99,7 +97,7 @@ def _set_file_path(path): return path_continue return None -def get_language_manager(c: Coqpit, data: List = None, restore_path: str = None, out_path: str = None) -> LanguageManager: +def get_language_manager(c: Coqpit, data: List = None, restore_path: str = None) -> LanguageManager: """Initiate a `LanguageManager` instance by the provided config. Args: @@ -135,4 +133,4 @@ def get_language_weighted_sampler(items: list): language_count = np.array([len(np.where(language_names == l)[0]) for l in unique_language_names]) weight_language = 1. / language_count dataset_samples_weight = torch.from_numpy(np.array([weight_language[l] for l in language_ids])).double() - return WeightedRandomSampler(dataset_samples_weight, len(dataset_samples_weight)) \ No newline at end of file + return WeightedRandomSampler(dataset_samples_weight, len(dataset_samples_weight)) diff --git a/TTS/tts/utils/text/cleaners.py b/TTS/tts/utils/text/cleaners.py index 71155ebc..826919c2 100644 --- a/TTS/tts/utils/text/cleaners.py +++ b/TTS/tts/utils/text/cleaners.py @@ -142,4 +142,4 @@ def multilingual_cleaners(text): text = replace_symbols(text, lang=None) text = remove_aux_symbols(text) text = collapse_whitespace(text) - return text \ No newline at end of file + return text diff --git a/tests/data_tests/test_loader.py b/tests/data_tests/test_loader.py index 8a20c261..19c2e8f7 100644 --- a/tests/data_tests/test_loader.py +++ b/tests/data_tests/test_loader.py @@ -38,6 +38,11 @@ class TestTTSDataset(unittest.TestCase): def _create_dataloader(self, batch_size, r, bgs): items = ljspeech(c.data_path, "metadata.csv") + + # add a default language because now the TTSDataset expect a language + language = "" + items = [[*item, language] for item in items] + dataset = TTSDataset( r, c.text_cleaner, From 7c0b8ec572c9690be8b68cad19a2a96d260b2791 Mon Sep 17 00:00:00 2001 From: Edresson Date: Tue, 17 Aug 2021 07:41:21 -0300 Subject: [PATCH 111/220] Fix bugs in the non-multilingual VITS inference --- TTS/tts/models/vits.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 6fe60fa0..f6442800 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -448,7 +448,8 @@ class Vits(BaseTTS): g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] # language embedding - if self.args.use_language_embedding: + lang_emb=None + if self.args.use_language_embedding and lid is not None: lang_emb = self.emb_l(lid).unsqueeze(-1) x, m_p, logs_p, x_mask = self.text_encoder(x, x_lengths, lang_emb=lang_emb) @@ -530,6 +531,7 @@ class Vits(BaseTTS): g = self.emb_g(sid).unsqueeze(-1) # 
language embedding + lang_emb=None if self.args.use_language_embedding and lid is not None: lang_emb = self.emb_l(lid).unsqueeze(-1) From e997889ba8ec4606f3977dedfd2b3a4360e99964 Mon Sep 17 00:00:00 2001 From: Edresson Date: Thu, 19 Aug 2021 09:59:41 -0300 Subject: [PATCH 112/220] Fix bug in VITS multilingual inference --- TTS/tts/models/vits.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index f6442800..01eb1874 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -382,8 +382,13 @@ class Vits(BaseTTS): # init language embedding layer if config.use_language_embedding: + if config.num_languages > 0 and self.language_manager.num_languages == 0: + self.num_languages = config.num_languages + else: + self.num_languages = self.language_manager.num_languages + self.embedded_language_dim = config.embedded_language_dim - self.emb_l = nn.Embedding(self.language_manager.num_languages, self.embedded_language_dim) + self.emb_l = nn.Embedding(self.num_languages, self.embedded_language_dim) torch.nn.init.xavier_uniform_(self.emb_l.weight) else: self.embedded_language_dim = 0 From c334d39acc809ef7884ae18ed9839561eb4c35c0 Mon Sep 17 00:00:00 2001 From: Edresson Date: Thu, 19 Aug 2021 14:05:42 -0300 Subject: [PATCH 113/220] Add voice conversion support for the model VITS trained with external speaker embedding --- TTS/tts/models/vits.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 01eb1874..e7305fb8 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -564,12 +564,21 @@ class Vits(BaseTTS): outputs = {"model_outputs": o, "alignments": attn.squeeze(1), "z": z, "z_p": z_p, "m_p": m_p, "logs_p": logs_p} return outputs - def voice_conversion(self, y, y_lengths, sid_src, sid_tgt): + def voice_conversion(self, y, y_lengths, speaker_cond_src, speaker_cond_tgt): """TODO: create an end-point for voice conversion""" assert self.num_speakers > 0, "num_speakers have to be larger than 0." - g_src = self.emb_g(sid_src).unsqueeze(-1) - g_tgt = self.emb_g(sid_tgt).unsqueeze(-1) - z, _, _, y_mask = self.enc_q(y, y_lengths, g=g_src) + + # speaker embedding + if self.args.use_speaker_embedding and not self.use_d_vector: + g_src = self.emb_g(speaker_cond_src).unsqueeze(-1) + g_tgt = self.emb_g(speaker_cond_tgt).unsqueeze(-1) + elif self.args.use_speaker_embedding and self.use_d_vector: + g_src = F.normalize(speaker_cond_src).unsqueeze(-1) + g_tgt = F.normalize(speaker_cond_tgt).unsqueeze(-1) + else: + raise RuntimeError(" [!] 
Voice conversion is only supported on multi-speaker models.") + + z, _, _, y_mask = self.posterior_encoder(y, y_lengths, g=g_src) z_p = self.flow(z, y_mask, g=g_src) z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True) o_hat = self.waveform_decoder(z_hat * y_mask, g=g_tgt) From f1f016314eb92d38f1b778f0036ee7701ad48a18 Mon Sep 17 00:00:00 2001 From: Edresson Date: Mon, 23 Aug 2021 16:12:31 -0300 Subject: [PATCH 114/220] Fix the bug in M-AILABS formatter --- TTS/tts/datasets/dataset.py | 1 + TTS/tts/datasets/formatters.py | 16 +++++++++++----- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index 7ba97eba..78c6c33d 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -125,6 +125,7 @@ class TTSDataset(Dataset): self.d_vector_mapping = d_vector_mapping self.language_id_mapping = language_id_mapping self.use_noise_augment = use_noise_augment + self.verbose = verbose self.input_seq_computed = False self.rescue_item_idx = 1 diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index 51ad892a..651b3197 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -68,14 +68,19 @@ def mailabs(root_path, meta_files=None): recursively. Defaults to None """ speaker_regex = re.compile("by_book/(male|female)/(?P[^/]+)/") - if meta_files is None: + if not meta_files: csv_files = glob(root_path + "/**/metadata.csv", recursive=True) else: csv_files = meta_files + # meta_files = [f.strip() for f in meta_files.split(",")] items = [] for csv_file in csv_files: - txt_file = os.path.join(root_path, csv_file) + if os.path.isfile(csv_file): + txt_file = csv_file + else: + txt_file = os.path.join(root_path, csv_file) + folder = os.path.dirname(txt_file) # determine speaker based on folder structure... speaker_name_match = speaker_regex.search(txt_file) @@ -90,7 +95,7 @@ def mailabs(root_path, meta_files=None): with open(txt_file, "r", encoding="utf-8") as ttf: for line in ttf: cols = line.split("|") - if meta_files is None: + if not meta_files: wav_file = os.path.join(folder, "wavs", cols[0] + ".wav") else: wav_file = os.path.join(root_path, folder.replace("metadata.csv", ""), "wavs", cols[0] + ".wav") @@ -98,7 +103,8 @@ def mailabs(root_path, meta_files=None): text = cols[1].strip() items.append([text, wav_file, speaker_name]) else: - raise RuntimeError("> File %s does not exist!" % (wav_file)) + # M-AI-Labs have some missing samples, so just print the warning + print("> File %s does not exist!" 
% (wav_file)) return items @@ -214,7 +220,7 @@ def common_voice(root_path, meta_file, ununsed_speakers=None): def libri_tts(root_path, meta_files=None, ununsed_speakers=None): """https://ai.google/tools/datasets/libri-tts/""" items = [] - if meta_files is None: + if not meta_files: meta_files = glob(f"{root_path}/**/*trans.tsv", recursive=True) else: if isinstance(meta_files, str): From c53693c155a0d3ef6a1b982758bf604d3915251d Mon Sep 17 00:00:00 2001 From: Edresson Date: Wed, 25 Aug 2021 16:52:02 -0300 Subject: [PATCH 115/220] Implement vocoder Fine Tuning like SC-GlowTTS paper --- TTS/tts/layers/losses.py | 9 ++- TTS/tts/models/vits.py | 140 ++++++++++++++++++++++++++++++++++----- 2 files changed, 133 insertions(+), 16 deletions(-) diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index 0ea342e8..145cd1a0 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -598,6 +598,7 @@ class VitsGeneratorLoss(nn.Module): feats_disc_fake, feats_disc_real, loss_duration, + fine_tuning_mode=False, ): """ Shapes: @@ -619,9 +620,15 @@ class VitsGeneratorLoss(nn.Module): mel = self.stft(waveform) mel_hat = self.stft(waveform_hat) # compute losses + + # ignore tts model loss if fine tunning mode is on + if fine_tuning_mode: + loss_kl = 0.0 + else: + loss_kl = self.kl_loss(z_p, logs_q, m_p, logs_p, z_mask.unsqueeze(1)) * self.kl_loss_alpha + loss_feat = self.feature_loss(feats_disc_fake, feats_disc_real) * self.feat_loss_alpha loss_gen = self.generator_loss(scores_disc_fake)[0] * self.gen_loss_alpha - loss_kl = self.kl_loss(z_p, logs_q, m_p, logs_p, z_mask.unsqueeze(1)) * self.kl_loss_alpha loss_mel = torch.nn.functional.l1_loss(mel, mel_hat) * self.mel_loss_alpha loss_duration = torch.sum(loss_duration.float()) * self.dur_loss_alpha loss = loss_kl + loss_feat + loss_mel + loss_gen + loss_duration diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index e7305fb8..ce75d6dd 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -193,6 +193,7 @@ class VitsArgs(Coqpit): use_language_embedding: bool = False embedded_language_dim: int = 4 num_languages: int = 0 + fine_tuning_mode: bool = False class Vits(BaseTTS): @@ -330,6 +331,7 @@ class Vits(BaseTTS): if args.init_discriminator: self.disc = VitsDiscriminator(use_spectral_norm=args.use_spectral_norm_disriminator) + print("FINE TUNING:", self.args.fine_tuning_mode) def init_multispeaker(self, config: Coqpit): """Initialize multi-speaker modules of a model. A model can be trained either with a speaker embedding layer @@ -521,6 +523,90 @@ class Vits(BaseTTS): ) return outputs + def forward_fine_tuning( + self, + x: torch.tensor, + x_lengths: torch.tensor, + y: torch.tensor, + y_lengths: torch.tensor, + aux_input={"d_vectors": None, "speaker_ids": None, "language_ids": None}, + ) -> Dict: + """Forward pass of the model. + + Args: + x (torch.tensor): Batch of input character sequence IDs. + x_lengths (torch.tensor): Batch of input character sequence lengths. + y (torch.tensor): Batch of input spectrograms. + y_lengths (torch.tensor): Batch of input spectrogram lengths. + aux_input (dict, optional): Auxiliary inputs for multi-speaker training. Defaults to {"d_vectors": None, "speaker_ids": None}. + + Returns: + Dict: model outputs keyed by the output name. 
+ + Shapes: + - x: :math:`[B, T_seq]` + - x_lengths: :math:`[B]` + - y: :math:`[B, C, T_spec]` + - y_lengths: :math:`[B]` + - d_vectors: :math:`[B, C, 1]` + - speaker_ids: :math:`[B]` + """ + with torch.no_grad(): + outputs = {} + sid, g, lid = self._set_cond_input(aux_input) + # speaker embedding + if self.args.use_speaker_embedding and sid is not None and not self.use_d_vector: + g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] + + # language embedding + lang_emb=None + if self.args.use_language_embedding and lid is not None: + lang_emb = self.emb_l(lid).unsqueeze(-1) + + x, m_p, logs_p, x_mask = self.text_encoder(x, x_lengths, lang_emb=lang_emb) + + # posterior encoder + z, m_q, logs_q, y_mask = self.posterior_encoder(y, y_lengths, g=g) + + # flow layers + z_p = self.flow(z, y_mask, g=g) + + # find the alignment path + attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2) + with torch.no_grad(): + o_scale = torch.exp(-2 * logs_p) + # logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - logs_p, [1]).unsqueeze(-1) # [b, t, 1] + logp2 = torch.einsum("klm, kln -> kmn", [o_scale, -0.5 * (z_p ** 2)]) + logp3 = torch.einsum("klm, kln -> kmn", [m_p * o_scale, z_p]) + # logp4 = torch.sum(-0.5 * (m_p ** 2) * o_scale, [1]).unsqueeze(-1) # [b, t, 1] + logp = logp2 + logp3 + attn = maximum_path(logp, attn_mask.squeeze(1)).unsqueeze(1).detach() + + # expand prior + m_p = torch.einsum("klmn, kjm -> kjn", [attn, m_p]) + logs_p = torch.einsum("klmn, kjm -> kjn", [attn, logs_p]) + + # get the z after inverse decoder + # ToDo: test if using m_p the result is better (In the SC-GlowTTS paper we used mp instead z_p) + z_f_pred = self.flow(z_p, y_mask, g=g, reverse=True) + z_slice, slice_ids = rand_segment(z_f_pred, y_lengths, self.spec_segment_size) + + o = self.waveform_decoder(z_slice, g=g) + outputs.update( + { + "model_outputs": o, + "alignments": attn.squeeze(1), + "slice_ids": slice_ids, + "z": z, + "z_p": z_p, + "m_p": m_p, + "logs_p": logs_p, + "m_q": m_q, + "logs_q": logs_q, + } + ) + return outputs + def inference(self, x, aux_input={"d_vectors": None, "speaker_ids": None, "language_ids": None}): """ Shapes: @@ -599,6 +685,15 @@ class Vits(BaseTTS): if optimizer_idx not in [0, 1]: raise ValueError(" [!] 
Unexpected `optimizer_idx`.") + # generator pass + if self.args.fine_tuning_mode: + # ToDo: find better place fot it + # force eval mode + self.eval() + # restore train mode for the vocoder part + self.waveform_decoder.train() + self.disc.train() + if optimizer_idx == 0: text_input = batch["text_input"] text_lengths = batch["text_lengths"] @@ -610,13 +705,24 @@ class Vits(BaseTTS): waveform = batch["waveform"] # generator pass - outputs = self.forward( - text_input, - text_lengths, - linear_input.transpose(1, 2), - mel_lengths, - aux_input={"d_vectors": d_vectors, "speaker_ids": speaker_ids, "language_ids": language_ids}, - ) + if self.args.fine_tuning_mode: + + # model forward + outputs = self.forward_fine_tuning( + text_input, + text_lengths, + linear_input.transpose(1, 2), + mel_lengths, + aux_input={"d_vectors": d_vectors, "speaker_ids": speaker_ids, "language_ids": language_ids}, + ) + else: + outputs = self.forward( + text_input, + text_lengths, + linear_input.transpose(1, 2), + mel_lengths, + aux_input={"d_vectors": d_vectors, "speaker_ids": speaker_ids, "language_ids": language_ids}, + ) # cache tensors for the discriminator self.y_disc_cache = None @@ -649,15 +755,17 @@ class Vits(BaseTTS): feats_disc_fake=outputs["feats_disc_fake"], feats_disc_real=outputs["feats_disc_real"], loss_duration=outputs["loss_duration"], + fine_tuning_mode=self.args.fine_tuning_mode, ) - - # handle the duration loss - if self.args.use_sdp: - loss_dict["nll_duration"] = outputs["nll_duration"] - loss_dict["loss"] += outputs["nll_duration"] - else: - loss_dict["loss_duration"] = outputs["loss_duration"] - loss_dict["loss"] += outputs["loss_duration"] + # ignore duration loss if fine tuning mode is on + if not self.args.fine_tuning_mode: + # handle the duration loss + if self.args.use_sdp: + loss_dict["nll_duration"] = outputs["nll_duration"] + loss_dict["loss"] += outputs["nll_duration"] + else: + loss_dict["loss_duration"] = outputs["loss_duration"] + loss_dict["loss"] += outputs["loss_duration"] elif optimizer_idx == 1: # discriminator pass @@ -853,3 +961,5 @@ class Vits(BaseTTS): if eval: self.eval() assert not self.training + + From 36dcd1145370607302f2c95bca8a8903e9c894d5 Mon Sep 17 00:00:00 2001 From: Edresson Date: Thu, 26 Aug 2021 07:55:41 -0300 Subject: [PATCH 116/220] Fix pylint issues --- TTS/tts/models/vits.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index ce75d6dd..72c4c892 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -706,7 +706,6 @@ class Vits(BaseTTS): # generator pass if self.args.fine_tuning_mode: - # model forward outputs = self.forward_fine_tuning( text_input, @@ -961,5 +960,3 @@ class Vits(BaseTTS): if eval: self.eval() assert not self.training - - From 7ef3ddc6ffc96eb09f0399e8db29ed970f83f3f9 Mon Sep 17 00:00:00 2001 From: Edresson Date: Thu, 26 Aug 2021 11:36:11 -0300 Subject: [PATCH 117/220] Fix unit tests --- TTS/tts/models/vits.py | 1 - 1 file changed, 1 deletion(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 72c4c892..bc4bf235 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -331,7 +331,6 @@ class Vits(BaseTTS): if args.init_discriminator: self.disc = VitsDiscriminator(use_spectral_norm=args.use_spectral_norm_disriminator) - print("FINE TUNING:", self.args.fine_tuning_mode) def init_multispeaker(self, config: Coqpit): """Initialize multi-speaker modules of a model. 
A model can be trained either with a speaker embedding layer From 959cc8f03c8d0f40f88bf9f5472364812b3c58a2 Mon Sep 17 00:00:00 2001 From: Edresson Date: Thu, 26 Aug 2021 12:18:05 -0300 Subject: [PATCH 118/220] Add VITS multilingual unit test --- .../tts_tests/test_vits_multilingual_train.py | 66 +++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 tests/tts_tests/test_vits_multilingual_train.py diff --git a/tests/tts_tests/test_vits_multilingual_train.py b/tests/tts_tests/test_vits_multilingual_train.py new file mode 100644 index 00000000..5fc4787d --- /dev/null +++ b/tests/tts_tests/test_vits_multilingual_train.py @@ -0,0 +1,66 @@ +import glob +import os +import shutil + +from tests import get_device_id, get_tests_output_path, run_cli +from TTS.tts.configs import BaseDatasetConfig, VitsConfig + +config_path = os.path.join(get_tests_output_path(), "test_model_config.json") +output_path = os.path.join(get_tests_output_path(), "train_outputs") + + +dataset_config1 = BaseDatasetConfig( + name="ljspeech", meta_file_train="metadata.csv", meta_file_val="metadata.csv", path="tests/data/ljspeech", language="en" +) + +dataset_config2 = BaseDatasetConfig( + name="ljspeech", meta_file_train="metadata.csv", meta_file_val="metadata.csv", path="tests/data/ljspeech", language="en2" +) + +config = VitsConfig( + batch_size=2, + eval_batch_size=2, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + use_espeak_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + ["Be a voice, not an echo.", "ljspeech", None, "en"], + ["Be a voice, not an echo.", "ljspeech", None, "en2"], + ], + datasets=[dataset_config1, dataset_config2], +) +# set audio config +config.audio.do_trim_silence = True +config.audio.trim_db = 60 + +# active multilingual mode +config.model_args.use_language_embedding = True +# active language sampler +config.use_language_weighted_sampler = True + +config.save_json(config_path) + +# train the model for one epoch +command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.test_delay_epochs 0" +) +run_cli(command_train) + +# Find latest folder +continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + +# restore the model and continue training for one more epoch +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " +run_cli(command_train) +shutil.rmtree(continue_path) From e68b0424930f70d188941407ba6d0b886674a8b0 Mon Sep 17 00:00:00 2001 From: Edresson Date: Thu, 26 Aug 2021 12:19:01 -0300 Subject: [PATCH 119/220] Add VITS d-vector unit test --- tests/tts_tests/test_vits_d-vectors_train.py | 63 ++++++++++++++++++++ tests/tts_tests/test_vits_train.py | 2 +- 2 files changed, 64 insertions(+), 1 deletion(-) create mode 100644 tests/tts_tests/test_vits_d-vectors_train.py diff --git a/tests/tts_tests/test_vits_d-vectors_train.py b/tests/tts_tests/test_vits_d-vectors_train.py new file mode 100644 index 00000000..af0e0eba --- /dev/null +++ b/tests/tts_tests/test_vits_d-vectors_train.py @@ -0,0 +1,63 @@ +import glob +import os +import shutil + +from tests import get_device_id, get_tests_output_path, run_cli +from TTS.tts.configs import VitsConfig + +config_path = 
os.path.join(get_tests_output_path(), "test_model_config.json") +output_path = os.path.join(get_tests_output_path(), "train_outputs") + + +config = VitsConfig( + batch_size=2, + eval_batch_size=2, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + use_espeak_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + ["Be a voice, not an echo.", "ljspeech-0"], + ], +) +# set audio config +config.audio.do_trim_silence = True +config.audio.trim_db = 60 + +# active multispeaker d-vec mode +config.model_args.use_speaker_embedding = True +config.model_args.use_d_vector_file = True +config.model_args.d_vector_file = "tests/data/ljspeech/speakers.json" +config.model_args.d_vector_dim = 256 + + +config.save_json(config_path) + +# train the model for one epoch +command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.name ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" +) +run_cli(command_train) + +# Find latest folder +continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + +# restore the model and continue training for one more epoch +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " +run_cli(command_train) +shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_vits_train.py b/tests/tts_tests/test_vits_train.py index 6398955e..607f7b29 100644 --- a/tests/tts_tests/test_vits_train.py +++ b/tests/tts_tests/test_vits_train.py @@ -25,7 +25,7 @@ config = VitsConfig( print_step=1, print_eval=True, test_sentences=[ - "Be a voice, not an echo.", + ["Be a voice, not an echo."], ], ) config.audio.do_trim_silence = True From 76251b619a14635184ea4983436231e0139ef11a Mon Sep 17 00:00:00 2001 From: Edresson Date: Thu, 26 Aug 2021 14:47:53 -0300 Subject: [PATCH 120/220] Fix d-vector multispeaker training bug --- TTS/tts/models/base_tts.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index bfa6df14..c03a7df5 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -134,13 +134,13 @@ class BaseTTS(BaseModel): # get speaker id/d_vector speaker_id, d_vector, language_id = None, None, None - if hasattr(self, "speaker_manager") and config.use_speaker_embedding: + if hasattr(self, "speaker_manager"): if config.use_d_vector_file: if speaker_name is None: d_vector = self.speaker_manager.get_random_d_vector() else: d_vector = self.speaker_manager.get_d_vector_by_speaker(speaker_name) - else: + elif config.use_speaker_embedding: if speaker_name is None: speaker_id = self.speaker_manager.get_random_speaker_id() else: @@ -284,7 +284,7 @@ class BaseTTS(BaseModel): use_noise_augment=False if is_eval else config.use_noise_augment, verbose=verbose, speaker_id_mapping=speaker_id_mapping, - d_vector_mapping=d_vector_mapping if config.use_d_vector_file else None, + d_vector_mapping=d_vector_mapping, language_id_mapping=language_id_mapping, ) From 
9be5b75da3e3ab6e6df36d5565d188aa90931290 Mon Sep 17 00:00:00 2001 From: Edresson Date: Thu, 26 Aug 2021 16:01:07 -0300 Subject: [PATCH 121/220] Fix bug after merge --- TTS/tts/models/vits.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index bc4bf235..600a9551 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -5,6 +5,7 @@ from itertools import chain from typing import Dict, List, Tuple import torch +import math from coqpit import Coqpit from torch import nn from torch.cuda.amp.autocast_mode import autocast @@ -574,11 +575,11 @@ class Vits(BaseTTS): attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2) with torch.no_grad(): o_scale = torch.exp(-2 * logs_p) - # logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - logs_p, [1]).unsqueeze(-1) # [b, t, 1] + logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - logs_p, [1]).unsqueeze(-1) # [b, t, 1] logp2 = torch.einsum("klm, kln -> kmn", [o_scale, -0.5 * (z_p ** 2)]) logp3 = torch.einsum("klm, kln -> kmn", [m_p * o_scale, z_p]) - # logp4 = torch.sum(-0.5 * (m_p ** 2) * o_scale, [1]).unsqueeze(-1) # [b, t, 1] - logp = logp2 + logp3 + logp4 = torch.sum(-0.5 * (m_p ** 2) * o_scale, [1]).unsqueeze(-1) # [b, t, 1] + logp = logp2 + logp3 + logp1 + logp4 attn = maximum_path(logp, attn_mask.squeeze(1)).unsqueeze(1).detach() # expand prior From de7855665507bb792df91dab439bc9d3cd18e3d9 Mon Sep 17 00:00:00 2001 From: Edresson Date: Sat, 28 Aug 2021 09:57:52 -0300 Subject: [PATCH 122/220] Fix the optimizer parameters bug in multilingual and multispeaker training --- TTS/tts/models/vits.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 600a9551..d355d5c1 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -882,8 +882,12 @@ class Vits(BaseTTS): self.waveform_decoder.parameters(), ) # add the speaker embedding layer - if hasattr(self, "emb_g"): + if hasattr(self, "emb_g") and self.args.use_speaker_embedding and not self.args.use_d_vector_file: gen_parameters = chain(gen_parameters, self.emb_g.parameters()) + # add the language embedding layer + if hasattr(self, "emb_l") and self.args.use_language_embedding: + gen_parameters = chain(gen_parameters, self.emb_l.parameters()) + optimizer0 = get_optimizer( self.config.optimizer, self.config.optimizer_params, self.config.lr_gen, parameters=gen_parameters ) From 92f7f4f40033cebf04f67e2327240ff6770f0fe9 Mon Sep 17 00:00:00 2001 From: Edresson Date: Sat, 28 Aug 2021 09:59:09 -0300 Subject: [PATCH 123/220] Active the multispeaker mode in multilingual training --- tests/tts_tests/test_vits_multilingual_train.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/tts_tests/test_vits_multilingual_train.py b/tests/tts_tests/test_vits_multilingual_train.py index 5fc4787d..10e66b81 100644 --- a/tests/tts_tests/test_vits_multilingual_train.py +++ b/tests/tts_tests/test_vits_multilingual_train.py @@ -44,6 +44,9 @@ config.audio.trim_db = 60 # active multilingual mode config.model_args.use_language_embedding = True +# active multispeaker mode +config.model_args.use_speaker_embedding = True +config.model_args.use_d_vector_file = False # active language sampler config.use_language_weighted_sampler = True From bbdb5c38e6ca582dd63c76078541a80f935bbec1 Mon Sep 17 00:00:00 2001 From: Edresson Date: Sat, 28 Aug 2021 09:59:48 -0300 Subject: [PATCH 124/220] Add VITS multispeaker train unit test --- 
.../tts_tests/test_vits_speaker_emb_train.py | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 tests/tts_tests/test_vits_speaker_emb_train.py diff --git a/tests/tts_tests/test_vits_speaker_emb_train.py b/tests/tts_tests/test_vits_speaker_emb_train.py new file mode 100644 index 00000000..7028a983 --- /dev/null +++ b/tests/tts_tests/test_vits_speaker_emb_train.py @@ -0,0 +1,63 @@ +import glob +import os +import shutil + +from tests import get_device_id, get_tests_output_path, run_cli +from TTS.tts.configs import VitsConfig + +config_path = os.path.join(get_tests_output_path(), "test_model_config.json") +output_path = os.path.join(get_tests_output_path(), "train_outputs") + + +config = VitsConfig( + batch_size=2, + eval_batch_size=2, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + use_espeak_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + ["Be a voice, not an echo.", "ljspeech"], + ], +) +# set audio config +config.audio.do_trim_silence = True +config.audio.trim_db = 60 + +# active multispeaker d-vec mode +config.model_args.use_speaker_embedding = True +config.model_args.use_d_vector_file = False +config.model_args.d_vector_file = None +config.model_args.d_vector_dim = 256 + + +config.save_json(config_path) + +# train the model for one epoch +command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.name ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" +) +run_cli(command_train) + +# Find latest folder +continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + +# restore the model and continue training for one more epoch +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " +run_cli(command_train) +shutil.rmtree(continue_path) From 0bdfd3cb5076ec967e27ed58ed191232e0a1773f Mon Sep 17 00:00:00 2001 From: Edresson Date: Sat, 28 Aug 2021 10:11:33 -0300 Subject: [PATCH 125/220] Add the ValueError in the restore checkpoint exception to avoid problems with the optimizer restauration when new keys are addition --- TTS/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/trainer.py b/TTS/trainer.py index 2175875c..e8911ba3 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -453,7 +453,7 @@ class Trainer: if "scaler" in checkpoint and self.use_amp_scaler and checkpoint["scaler"]: print(" > Restoring Scaler...") scaler = _restore_list_objs(checkpoint["scaler"], scaler) - except (KeyError, RuntimeError): + except (KeyError, RuntimeError, ValueError): print(" > Partial model initialization...") model_dict = model.state_dict() model_dict = set_init_dict(model_dict, checkpoint["model"], config) From 9b011b1cb3849150d4cdb2ee06d022771ad7aee6 Mon Sep 17 00:00:00 2001 From: Edresson Date: Wed, 1 Sep 2021 09:23:45 -0300 Subject: [PATCH 126/220] Add H/ASP original checkpoint support --- TTS/speaker_encoder/models/resnet.py | 39 ++++++++++++++++++++-- 
TTS/speaker_encoder/utils/generic_utils.py | 6 +++- TTS/tts/utils/speakers.py | 14 +++++--- 3 files changed, 51 insertions(+), 8 deletions(-) diff --git a/TTS/speaker_encoder/models/resnet.py b/TTS/speaker_encoder/models/resnet.py index fcc850d7..beeb5ae1 100644 --- a/TTS/speaker_encoder/models/resnet.py +++ b/TTS/speaker_encoder/models/resnet.py @@ -1,9 +1,23 @@ import numpy as np import torch -from torch import nn +import torchaudio +import torch.nn as nn from TTS.utils.io import load_fsspec +class PreEmphasis(torch.nn.Module): + def __init__(self, coefficient=0.97): + super().__init__() + self.coefficient = coefficient + self.register_buffer( + 'filter', torch.FloatTensor([-self.coefficient, 1.]).unsqueeze(0).unsqueeze(0) + ) + + def forward(self, x): + assert len(x.size()) == 2 + + x = torch.nn.functional.pad(x.unsqueeze(1), (1, 0), 'reflect') + return torch.nn.functional.conv1d(x, self.filter).squeeze(1) class SELayer(nn.Module): def __init__(self, channel, reduction=8): @@ -70,12 +84,17 @@ class ResNetSpeakerEncoder(nn.Module): num_filters=[32, 64, 128, 256], encoder_type="ASP", log_input=False, + use_torch_spec=False, + audio_config=None, ): super(ResNetSpeakerEncoder, self).__init__() self.encoder_type = encoder_type self.input_dim = input_dim self.log_input = log_input + self.use_torch_spec = use_torch_spec + self.audio_config = audio_config + self.conv1 = nn.Conv2d(1, num_filters[0], kernel_size=3, stride=1, padding=1) self.relu = nn.ReLU(inplace=True) self.bn1 = nn.BatchNorm2d(num_filters[0]) @@ -88,6 +107,14 @@ class ResNetSpeakerEncoder(nn.Module): self.instancenorm = nn.InstanceNorm1d(input_dim) + if self.use_torch_spec: + self.torch_spec = torch.nn.Sequential( + PreEmphasis(audio_config["preemphasis"]), + torchaudio.transforms.MelSpectrogram(sample_rate=audio_config["sample_rate"], n_fft=audio_config["fft_size"], win_length=audio_config["win_length"], hop_length=audio_config["hop_length"], window_fn=torch.hamming_window, n_mels=audio_config["num_mels"]) + ) + else: + self.torch_spec = None + outmap_size = int(self.input_dim / 8) self.attention = nn.Sequential( @@ -140,9 +167,13 @@ class ResNetSpeakerEncoder(nn.Module): return out def forward(self, x, l2_norm=False): - x = x.transpose(1, 2) with torch.no_grad(): with torch.cuda.amp.autocast(enabled=False): + if self.use_torch_spec: + x = self.torch_spec(x) + else: + x = x.transpose(1, 2) + if self.log_input: x = (x + 1e-6).log() x = self.instancenorm(x).unsqueeze(1) @@ -180,6 +211,10 @@ class ResNetSpeakerEncoder(nn.Module): Generate embeddings for a batch of utterances x: 1xTxD """ + # map to the waveform size + if self.use_torch_spec: + num_frames = num_frames * self.audio_config['hop_length'] + max_len = x.shape[1] if max_len < num_frames: diff --git a/TTS/speaker_encoder/utils/generic_utils.py b/TTS/speaker_encoder/utils/generic_utils.py index 1981fbe9..3714e3c4 100644 --- a/TTS/speaker_encoder/utils/generic_utils.py +++ b/TTS/speaker_encoder/utils/generic_utils.py @@ -179,7 +179,11 @@ def setup_model(c): c.model_params["num_lstm_layers"], ) elif c.model_params["model_name"].lower() == "resnet": - model = ResNetSpeakerEncoder(input_dim=c.model_params["input_dim"], proj_dim=c.model_params["proj_dim"]) + model = ResNetSpeakerEncoder(input_dim=c.model_params["input_dim"], proj_dim=c.model_params["proj_dim"], + log_input=c.model_params.get("log_input", False), + use_torch_spec=c.model_params.get("use_torch_spec", False), + audio_config=c.audio + ) return model diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py 
index 1497ca74..282875af 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -288,12 +288,16 @@ class SpeakerManager: def _compute(wav_file: str): waveform = self.speaker_encoder_ap.load_wav(wav_file, sr=self.speaker_encoder_ap.sample_rate) - spec = self.speaker_encoder_ap.melspectrogram(waveform) - spec = torch.from_numpy(spec.T) + if not self.speaker_encoder_config.model_params.get("use_torch_spec", False): + m_input = self.speaker_encoder_ap.melspectrogram(waveform) + m_input = torch.from_numpy(m_input.T) + else: + m_input = torch.from_numpy(waveform) + if self.use_cuda: - spec = spec.cuda() - spec = spec.unsqueeze(0) - d_vector = self.speaker_encoder.compute_embedding(spec) + m_input = m_input.cuda() + m_input = m_input.unsqueeze(0) + d_vector = self.speaker_encoder.compute_embedding(m_input) return d_vector if isinstance(wav_file, list): From 690b37d0abbe8e225c48494618eb1e96625ac17a Mon Sep 17 00:00:00 2001 From: Edresson Date: Fri, 3 Sep 2021 07:37:43 -0300 Subject: [PATCH 127/220] Add support to use the speaker encoder as loss function in VITS model --- TTS/tts/configs/vits_config.py | 1 + TTS/tts/layers/losses.py | 10 ++++++ TTS/tts/models/vits.py | 57 +++++++++++++++++++++++++++------- 3 files changed, 56 insertions(+), 12 deletions(-) diff --git a/TTS/tts/configs/vits_config.py b/TTS/tts/configs/vits_config.py index cc3e4940..ece414a6 100644 --- a/TTS/tts/configs/vits_config.py +++ b/TTS/tts/configs/vits_config.py @@ -117,6 +117,7 @@ class VitsConfig(BaseTTSConfig): feat_loss_alpha: float = 1.0 mel_loss_alpha: float = 45.0 dur_loss_alpha: float = 1.0 + speaker_encoder_loss_alpha: float = 1.0 # data loader params return_wav: bool = True diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index 145cd1a0..fdee9c10 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -532,6 +532,7 @@ class VitsGeneratorLoss(nn.Module): self.feat_loss_alpha = c.feat_loss_alpha self.dur_loss_alpha = c.dur_loss_alpha self.mel_loss_alpha = c.mel_loss_alpha + self.spk_encoder_loss_alpha = c.speaker_encoder_loss_alpha self.stft = TorchSTFT( c.audio.fft_size, c.audio.hop_length, @@ -599,6 +600,9 @@ class VitsGeneratorLoss(nn.Module): feats_disc_real, loss_duration, fine_tuning_mode=False, + use_speaker_encoder_as_loss=False, + gt_spk_emb=None, + syn_spk_emb=None ): """ Shapes: @@ -632,6 +636,12 @@ class VitsGeneratorLoss(nn.Module): loss_mel = torch.nn.functional.l1_loss(mel, mel_hat) * self.mel_loss_alpha loss_duration = torch.sum(loss_duration.float()) * self.dur_loss_alpha loss = loss_kl + loss_feat + loss_mel + loss_gen + loss_duration + + if use_speaker_encoder_as_loss: + loss_se = - torch.nn.functional.cosine_similarity(gt_spk_emb, syn_spk_emb).mean() * self.spk_encoder_loss_alpha + loss += loss_se + return_dict["loss_spk_encoder"] = loss_se + # pass losses to the dict return_dict["loss_gen"] = loss_gen return_dict["loss_kl"] = loss_kl diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index d355d5c1..71cc4634 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -195,6 +195,10 @@ class VitsArgs(Coqpit): embedded_language_dim: int = 4 num_languages: int = 0 fine_tuning_mode: bool = False + use_speaker_encoder_as_loss: bool = False + speaker_encoder_config_path: str = "" + speaker_encoder_model_path: str = "" + class Vits(BaseTTS): @@ -370,6 +374,18 @@ class Vits(BaseTTS): self.speaker_manager = SpeakerManager(d_vectors_file_path=config.d_vector_file) self.embedded_speaker_dim = config.d_vector_dim + if 
config.use_speaker_encoder_as_loss: + if not config.speaker_encoder_model_path or not config.speaker_encoder_config_path: + raise RuntimeError(" [!] To use the speaker encoder loss you need to specify speaker_encoder_model_path and speaker_encoder_config_path !!") + self.speaker_manager.init_speaker_encoder(config.speaker_encoder_model_path, config.speaker_encoder_config_path) + self.speaker_encoder = self.speaker_manager.speaker_encoder.train() + for param in self.speaker_encoder.parameters(): + param.requires_grad = False + + print(" > External Speaker Encoder Loaded !!") + else: + self.speaker_encoder = None + def init_multilingual(self, config: Coqpit, data: List = None): """Initialize multilingual modules of a model. @@ -427,6 +443,7 @@ class Vits(BaseTTS): y: torch.tensor, y_lengths: torch.tensor, aux_input={"d_vectors": None, "speaker_ids": None, "language_ids": None}, + waveform=None, ) -> Dict: """Forward pass of the model. @@ -461,7 +478,6 @@ class Vits(BaseTTS): x, m_p, logs_p, x_mask = self.text_encoder(x, x_lengths, lang_emb=lang_emb) - # posterior encoder z, m_q, logs_q, y_mask = self.posterior_encoder(y, y_lengths, g=g) @@ -508,17 +524,36 @@ class Vits(BaseTTS): # select a random feature segment for the waveform decoder z_slice, slice_ids = rand_segments(z, y_lengths, self.spec_segment_size) o = self.waveform_decoder(z_slice, g=g) + + wav_seg = segment( + waveform.transpose(1, 2), + slice_ids * self.config.audio.hop_length, + self.args.spec_segment_size * self.config.audio.hop_length, + ) + + if self.args.use_speaker_encoder_as_loss: + # concate generated and GT waveforms + wavs_batch = torch.cat((wav_seg, o), dim=0).squeeze(1) + pred_embs = self.speaker_encoder.forward(wavs_batch, l2_norm=True) + + # split generated and GT speaker embeddings + gt_spk_emb, syn_spk_emb = torch.chunk(pred_embs, 2, dim=0) + else: + gt_spk_emb, syn_spk_emb = None, None + outputs.update( { "model_outputs": o, "alignments": attn.squeeze(1), - "slice_ids": slice_ids, "z": z, "z_p": z_p, "m_p": m_p, "logs_p": logs_p, "m_q": m_q, "logs_q": logs_q, + "waveform_seg": wav_seg, + "gt_spk_emb": gt_spk_emb, + "syn_spk_emb": syn_spk_emb } ) return outputs @@ -596,7 +631,6 @@ class Vits(BaseTTS): { "model_outputs": o, "alignments": attn.squeeze(1), - "slice_ids": slice_ids, "z": z, "z_p": z_p, "m_p": m_p, @@ -713,6 +747,7 @@ class Vits(BaseTTS): linear_input.transpose(1, 2), mel_lengths, aux_input={"d_vectors": d_vectors, "speaker_ids": speaker_ids, "language_ids": language_ids}, + waveform=waveform, ) else: outputs = self.forward( @@ -721,30 +756,25 @@ class Vits(BaseTTS): linear_input.transpose(1, 2), mel_lengths, aux_input={"d_vectors": d_vectors, "speaker_ids": speaker_ids, "language_ids": language_ids}, + waveform=waveform, ) # cache tensors for the discriminator self.y_disc_cache = None self.wav_seg_disc_cache = None self.y_disc_cache = outputs["model_outputs"] - wav_seg = segment( - waveform.transpose(1, 2), - outputs["slice_ids"] * self.config.audio.hop_length, - self.args.spec_segment_size * self.config.audio.hop_length, - ) - self.wav_seg_disc_cache = wav_seg - outputs["waveform_seg"] = wav_seg + self.wav_seg_disc_cache = outputs["waveform_seg"] # compute discriminator scores and features outputs["scores_disc_fake"], outputs["feats_disc_fake"], _, outputs["feats_disc_real"] = self.disc( - outputs["model_outputs"], wav_seg + outputs["model_outputs"], outputs["waveform_seg"] ) # compute losses with autocast(enabled=False): # use float32 for the criterion loss_dict = criterion[optimizer_idx]( 
waveform_hat=outputs["model_outputs"].float(), - waveform=wav_seg.float(), + waveform= outputs["waveform_seg"].float(), z_p=outputs["z_p"].float(), logs_q=outputs["logs_q"].float(), m_p=outputs["m_p"].float(), @@ -755,6 +785,9 @@ class Vits(BaseTTS): feats_disc_real=outputs["feats_disc_real"], loss_duration=outputs["loss_duration"], fine_tuning_mode=self.args.fine_tuning_mode, + use_speaker_encoder_as_loss=self.args.use_speaker_encoder_as_loss, + gt_spk_emb=outputs["gt_spk_emb"], + syn_spk_emb=outputs["syn_spk_emb"] ) # ignore duration loss if fine tuning mode is on if not self.args.fine_tuning_mode: From 6ac31e4152a6e51fdb0200770fbca241a7eda3f3 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Wed, 21 Jul 2021 16:49:12 +0200 Subject: [PATCH 128/220] Added a notbook for d-vector multilingual VITS --- .../VITS_d-vector_multilingual_exemple.ipynb | 223 ++++++++++++++++++ 1 file changed, 223 insertions(+) create mode 100644 notebooks/VITS_d-vector_multilingual_exemple.ipynb diff --git a/notebooks/VITS_d-vector_multilingual_exemple.ipynb b/notebooks/VITS_d-vector_multilingual_exemple.ipynb new file mode 100644 index 00000000..41713295 --- /dev/null +++ b/notebooks/VITS_d-vector_multilingual_exemple.ipynb @@ -0,0 +1,223 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "source": [ + "import IPython\n", + "import torch\n", + "\n", + "from IPython.display import Audio\n", + "\n", + "from TTS.config import load_config\n", + "from TTS.tts.models import setup_model\n", + "from TTS.tts.utils.synthesis import synthesis\n", + "from TTS.utils.audio import AudioProcessor" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 2, + "source": [ + "GENERAL_PATH = '/home/julian/workspace/train/VITS-pt-en-fr-lr/vits-August-29-2021_01+20PM-c68d7fa25/'\n", + "MODEL_PATH = GENERAL_PATH + 'best_model.pth.tar'\n", + "CONFIG_PATH = GENERAL_PATH + 'config.json'\n", + "TTS_LANGUAGES = GENERAL_PATH + \"language_ids.json\"\n", + "TTS_SPEAKERS = GENERAL_PATH + \"speakers.json\"\n", + "USE_CUDA = torch.cuda.is_available()" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 3, + "source": [ + "# load the config\n", + "C = load_config(CONFIG_PATH)\n", + "\n", + "# load the audio processor\n", + "ap = AudioProcessor(**C.audio)\n", + "\n", + "speaker_embedding = None\n", + "\n", + "C.model_args['d_vector_file'] = TTS_SPEAKERS\n", + "\n", + "model = setup_model(C)\n", + "model.language_manager.set_language_ids_from_file(TTS_LANGUAGES)\n", + "cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))\n", + "model.load_state_dict(cp['model'])\n", + "\n", + "\n", + "model.eval()\n", + "\n", + "if USE_CUDA:\n", + " model = model.cuda()\n", + "\n", + "use_griffin_lim = True" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " > Setting up Audio Processor...\n", + " | > sample_rate:16000\n", + " | > resample:False\n", + " | > num_mels:80\n", + " | > min_level_db:-100\n", + " | > frame_shift_ms:None\n", + " | > frame_length_ms:None\n", + " | > ref_level_db:20\n", + " | > fft_size:1024\n", + " | > power:1.5\n", + " | > preemphasis:0.0\n", + " | > griffin_lim_iters:60\n", + " | > signal_norm:False\n", + " | > symmetric_norm:True\n", + " | > mel_fmin:0\n", + " | > mel_fmax:None\n", + " | > spec_gain:1.0\n", + " | > stft_pad_mode:reflect\n", + " | > max_norm:4.0\n", + " | > clip_norm:True\n", + " | > do_trim_silence:True\n", + " | > trim_db:45\n", + " | > do_sound_norm:False\n", + " | > 
do_amp_to_db_linear:False\n", + " | > do_amp_to_db_mel:True\n", + " | > stats_path:None\n", + " | > base:2.718281828459045\n", + " | > hop_length:256\n", + " | > win_length:1024\n", + " > Using model: vits\n", + " > Speaker manager is loaded with 421 speakers: ED, MLS_10032, MLS_10058, MLS_10065, MLS_10082, MLS_10087, MLS_10177, MLS_103, MLS_10620, MLS_10827, MLS_10957, MLS_112, MLS_11247, MLS_1127, MLS_115, MLS_11743, MLS_11772, MLS_11795, MLS_11822, MLS_11875, MLS_11954, MLS_12205, MLS_123, MLS_1243, MLS_125, MLS_12501, MLS_12512, MLS_12541, MLS_12709, MLS_12713, MLS_12823, MLS_12899, MLS_12968, MLS_12981, MLS_13142, MLS_13177, MLS_1329, MLS_13611, MLS_13634, MLS_13655, MLS_13658, MLS_14, MLS_1474, MLS_1579, MLS_1590, MLS_1591, MLS_1624, MLS_1649, MLS_1664, MLS_1745, MLS_177, MLS_1770, MLS_1798, MLS_1805, MLS_1817, MLS_1840, MLS_1844, MLS_1869, MLS_1887, MLS_1977, MLS_1989, MLS_2033, MLS_204, MLS_2155, MLS_2284, MLS_2297, MLS_2316, MLS_2506, MLS_2544, MLS_2587, MLS_2596, MLS_2607, MLS_27, MLS_2771, MLS_2776, MLS_28, MLS_2825, MLS_2904, MLS_2926, MLS_2946, MLS_30, MLS_3060, MLS_3182, MLS_3190, MLS_3204, MLS_3267, MLS_3270, MLS_3319, MLS_3344, MLS_3370, MLS_3464, MLS_3503, MLS_3595, MLS_3698, MLS_4018, MLS_4174, MLS_4193, MLS_4336, MLS_4396, MLS_4512, MLS_4609, MLS_4650, MLS_4699, MLS_4724, MLS_4744, MLS_4937, MLS_5021, MLS_5077, MLS_52, MLS_5232, MLS_5295, MLS_5525, MLS_5526, MLS_5553, MLS_5595, MLS_5612, MLS_5764, MLS_577, MLS_579, MLS_5830, MLS_5840, MLS_5968, MLS_6070, MLS_6128, MLS_62, MLS_6249, MLS_6318, MLS_6348, MLS_6362, MLS_6381, MLS_66, MLS_6856, MLS_694, MLS_7032, MLS_707, MLS_7142, MLS_7150, MLS_7193, MLS_7200, MLS_7239, MLS_7377, MLS_7423, MLS_7438, MLS_7439, MLS_753, MLS_7591, MLS_7601, MLS_7614, MLS_7679, MLS_78, MLS_7848, MLS_8102, MLS_8128, MLS_8582, MLS_8778, MLS_9121, MLS_9242, MLS_928, MLS_94, MLS_9804, MLS_9854, VCTK_p225, VCTK_p226, VCTK_p227, VCTK_p228, VCTK_p229, VCTK_p230, VCTK_p231, VCTK_p232, VCTK_p233, VCTK_p234, VCTK_p236, VCTK_p237, VCTK_p238, VCTK_p239, VCTK_p240, VCTK_p241, VCTK_p243, VCTK_p244, VCTK_p245, VCTK_p246, VCTK_p247, VCTK_p248, VCTK_p249, VCTK_p250, VCTK_p251, VCTK_p252, VCTK_p253, VCTK_p254, VCTK_p255, VCTK_p256, VCTK_p257, VCTK_p258, VCTK_p259, VCTK_p260, VCTK_p261, VCTK_p262, VCTK_p263, VCTK_p264, VCTK_p265, VCTK_p266, VCTK_p267, VCTK_p268, VCTK_p269, VCTK_p270, VCTK_p271, VCTK_p272, VCTK_p273, VCTK_p274, VCTK_p275, VCTK_p276, VCTK_p277, VCTK_p278, VCTK_p279, VCTK_p280, VCTK_p281, VCTK_p282, VCTK_p283, VCTK_p284, VCTK_p285, VCTK_p286, VCTK_p287, VCTK_p288, VCTK_p292, VCTK_p293, VCTK_p294, VCTK_p295, VCTK_p297, VCTK_p298, VCTK_p299, VCTK_p300, VCTK_p301, VCTK_p302, VCTK_p303, VCTK_p304, VCTK_p305, VCTK_p306, VCTK_p307, VCTK_p308, VCTK_p310, VCTK_p311, VCTK_p312, VCTK_p313, VCTK_p314, VCTK_p316, VCTK_p317, VCTK_p318, VCTK_p323, VCTK_p326, VCTK_p329, VCTK_p330, VCTK_p333, VCTK_p334, VCTK_p335, VCTK_p336, VCTK_p339, VCTK_p340, VCTK_p341, VCTK_p343, VCTK_p345, VCTK_p347, VCTK_p351, VCTK_p360, VCTK_p361, VCTK_p362, VCTK_p363, VCTK_p364, VCTK_p374, VCTK_p376, bernard, elodie, ezwa, gilles_g_le_blanc, nadine_eckert_boulet, openSLR_afr0184, openSLR_afr1919, openSLR_afr2418, openSLR_afr6590, openSLR_afr7130, openSLR_afr7214, openSLR_afr8148, openSLR_afr8924, openSLR_afr8963, openSLR_jvf00264, openSLR_jvf00658, openSLR_jvf01392, openSLR_jvf02059, openSLR_jvf02884, openSLR_jvf03187, openSLR_jvf04679, openSLR_jvf04715, openSLR_jvf04982, openSLR_jvf05540, openSLR_jvf06207, openSLR_jvf06510, openSLR_jvf06941, openSLR_jvf07335, openSLR_jvf07638, 
openSLR_jvf08002, openSLR_jvf08305, openSLR_jvf08736, openSLR_jvf09039, openSLR_jvm00027, openSLR_jvm01519, openSLR_jvm01932, openSLR_jvm02326, openSLR_jvm03314, openSLR_jvm03424, openSLR_jvm03727, openSLR_jvm04175, openSLR_jvm04285, openSLR_jvm04588, openSLR_jvm05219, openSLR_jvm05522, openSLR_jvm05667, openSLR_jvm05970, openSLR_jvm06080, openSLR_jvm06383, openSLR_jvm07765, openSLR_jvm07875, openSLR_jvm08178, openSLR_jvm09724, openSLR_sso0145, openSLR_sso0493, openSLR_sso0806, openSLR_sso1266, openSLR_sso1367, openSLR_sso1801, openSLR_sso2388, openSLR_sso2910, openSLR_sso4592, openSLR_sso5945, openSLR_sso6499, openSLR_sso7801, openSLR_sso7821, openSLR_sso7876, openSLR_sso7912, openSLR_sso7934, openSLR_sso8596, openSLR_sso8777, openSLR_sso9892, openSLR_suf00297, openSLR_suf00600, openSLR_suf00691, openSLR_suf00994, openSLR_suf01056, openSLR_suf01359, openSLR_suf02092, openSLR_suf02395, openSLR_suf02953, openSLR_suf03712, openSLR_suf03887, openSLR_suf04190, openSLR_suf04646, openSLR_suf04748, openSLR_suf05051, openSLR_suf05507, openSLR_suf06543, openSLR_suf07302, openSLR_suf08338, openSLR_suf08703, openSLR_sum00060, openSLR_sum00454, openSLR_sum01038, openSLR_sum01552, openSLR_sum01596, openSLR_sum01855, openSLR_sum01899, openSLR_sum02716, openSLR_sum03391, openSLR_sum03650, openSLR_sum03694, openSLR_sum04208, openSLR_sum04511, openSLR_sum05186, openSLR_sum06003, openSLR_sum06047, openSLR_sum07842, openSLR_sum08659, openSLR_sum09243, openSLR_sum09637, openSLR_sum09757, openSLR_tsn0045, openSLR_tsn0378, openSLR_tsn0441, openSLR_tsn1483, openSLR_tsn1498, openSLR_tsn1932, openSLR_tsn2839, openSLR_tsn3342, openSLR_tsn3629, openSLR_tsn4506, openSLR_tsn4850, openSLR_tsn5628, openSLR_tsn6116, openSLR_tsn6206, openSLR_tsn6234, openSLR_tsn6459, openSLR_tsn7674, openSLR_tsn7693, openSLR_tsn7866, openSLR_tsn7896, openSLR_tsn8333, openSLR_tsn8512, openSLR_tsn8532, openSLR_tsn8914, openSLR_tsn9061, openSLR_tsn9365, openSLR_xho0050, openSLR_xho0120, openSLR_xho1547, openSLR_xho3616, openSLR_xho4280, openSLR_xho4291, openSLR_xho5378, openSLR_xho5680, openSLR_xho6975, openSLR_xho7590, openSLR_xho7599, openSLR_xho9446, zeckou\n" + ] + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 4, + "source": [ + "#set speaker\n", + "d_vector = model.speaker_manager.get_mean_d_vector('VCTK_p260')" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 5, + "source": [ + "model.language_manager.language_id_mapping" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "{'af': 0,\n", + " 'en': 1,\n", + " 'fr-fr': 2,\n", + " 'jv': 3,\n", + " 'pt-br': 4,\n", + " 'st': 5,\n", + " 'su': 6,\n", + " 'tn': 7,\n", + " 'xh': 8}" + ] + }, + "metadata": {}, + "execution_count": 5 + } + ], + "metadata": { + "scrolled": true + } + }, + { + "cell_type": "code", + "execution_count": 6, + "source": [ + "# set scales \n", + "model.noise_scale = 0.0 # defines the noise variance applied to the random z vector at inference.\n", + "model.length_scale = 1.0 # scaler for the duration predictor. The larger it is, the slower the speech.\n", + "model.noise_scale_w = 0.0 # defines the noise variance applied to the duration predictor z vector at inference.\n", + "model.inference_noise_scale = 0.5 # defines the noise variance applied to the random z vector at inference.\n", + "model.inference_noise_scale_dp = 0.6 # defines the noise variance applied to the duration predictor z vector at inference." 
+ ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 7, + "source": [ + "text = \"Il m'a fallu beaucoup de temps pour développer une voix, et maintenant que je l'ai, je ne vais pas me taire.\"\n", + "language_id = 2\n", + "wav, alignment, _, _ = synthesis(\n", + " model,\n", + " text,\n", + " C,\n", + " \"cuda\" in str(next(model.parameters()).device),\n", + " ap,\n", + " speaker_id=None,\n", + " d_vector=d_vector,\n", + " style_wav=None,\n", + " language_id=language_id,\n", + " enable_eos_bos_chars=C.enable_eos_bos_chars,\n", + " use_griffin_lim=True,\n", + " do_trim_silence=False,\n", + " ).values()\n", + "IPython.display.display(Audio(wav, rate=ap.sample_rate))" + ], + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {} + } + ], + "metadata": {} + } + ], + "metadata": { + "interpreter": { + "hash": "b925b73899c1545aa2d9bbcf4e8e1df4138a367d2daefc2707570579325ca4c0" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3.8.10 64-bit ('TTS': conda)" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file From eeb8ac07d964db769fd6eea01c8669739a60e3de Mon Sep 17 00:00:00 2001 From: Edresson Date: Tue, 14 Sep 2021 17:27:00 -0300 Subject: [PATCH 129/220] Add voice conversion fine tuning mode --- TTS/bin/find_unique_phonemes.py | 63 +++++++++++++++++++++++++++++++++ TTS/tts/layers/losses.py | 2 +- TTS/tts/models/vits.py | 59 +++++++++++++++++++++++++++--- 3 files changed, 119 insertions(+), 5 deletions(-) create mode 100644 TTS/bin/find_unique_phonemes.py diff --git a/TTS/bin/find_unique_phonemes.py b/TTS/bin/find_unique_phonemes.py new file mode 100644 index 00000000..7ed79b36 --- /dev/null +++ b/TTS/bin/find_unique_phonemes.py @@ -0,0 +1,63 @@ +"""Find all the unique characters in a dataset""" +import argparse +from argparse import RawTextHelpFormatter + +from TTS.config import load_config +from TTS.tts.datasets import load_meta_data + +import numpy +import multiprocessing +from TTS.tts.utils.text import text2phone +from tqdm.contrib.concurrent import process_map + +def compute_phonemes(item): + try: + text = item[0] + language = item[-1] + ph = text2phone(text, language, use_espeak_phonemes=c.use_espeak_phonemes).split("|") + except: + return [] + return list(set(ph)) + +def main(): + global c + # pylint: disable=bad-option-value + parser = argparse.ArgumentParser( + description="""Find all the unique characters or phonemes in a dataset.\n\n""" + """ + Example runs: + + python TTS/bin/find_unique_chars.py --config_path config.json + """, + formatter_class=RawTextHelpFormatter, + ) + parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True) + args = parser.parse_args() + + c = load_config(args.config_path) + + # load all datasets + train_items, eval_items = load_meta_data(c.datasets, eval_split=True) + items = train_items + eval_items + print("Num items:", len(items)) + # items = items[:1000] + + phonemes = process_map(compute_phonemes, items, max_workers=multiprocessing.cpu_count(), chunksize=15) + phones = [] + for ph in phonemes: + phones.extend(ph) + phones = set(phones) + lower_phones = filter(lambda c: c.islower(), 
phones) + phones_force_lower = [c.lower() for c in phones] + phones_force_lower = set(phones_force_lower) + + + + print(f" > Number of unique phonemes: {len(phones)}") + print(f" > Unique phonemes: {''.join(sorted(phones))}") + print(f" > Unique lower phonemes: {''.join(sorted(lower_phones))}") + print(f" > Unique all forced to lower phonemes: {''.join(sorted(phones_force_lower))}") + + +if __name__ == "__main__": + main() diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index fdee9c10..cd2903b0 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -599,7 +599,7 @@ class VitsGeneratorLoss(nn.Module): feats_disc_fake, feats_disc_real, loss_duration, - fine_tuning_mode=False, + fine_tuning_mode=0, use_speaker_encoder_as_loss=False, gt_spk_emb=None, syn_spk_emb=None diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 71cc4634..a9078b26 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -149,6 +149,28 @@ class VitsArgs(Coqpit): detach_dp_input (bool): Detach duration predictor's input from the network for stopping the gradients. Defaults to True. + + use_language_embedding (bool): + Enable/Disable language embedding for multilingual models. Defaults to False. + + embedded_language_dim (int): + Number of language embedding channels. Defaults to 4. + + num_languages (int): + Number of languages for the language embedding layer. Defaults to 0. + + use_speaker_encoder_as_loss (bool): + + + use_speaker_encoder_as_loss: bool = False + speaker_encoder_config_path: str = "" + speaker_encoder_model_path: str = "" + + fine_tuning_mode (int): + Fine tuning only the vocoder part of the model, while the rest will be frozen. Defaults to 0. + Mode 0: disabled; + Mode 1: uses the distribution predicted by the encoder and It's recommended for TTS; + Mode 2: uses the distribution predicted by the encoder and It's recommended for voice conversion. """ num_chars: int = 100 @@ -194,10 +216,10 @@ class VitsArgs(Coqpit): use_language_embedding: bool = False embedded_language_dim: int = 4 num_languages: int = 0 - fine_tuning_mode: bool = False use_speaker_encoder_as_loss: bool = False speaker_encoder_config_path: str = "" speaker_encoder_model_path: str = "" + fine_tuning_mode: int = 0 @@ -565,6 +587,7 @@ class Vits(BaseTTS): y: torch.tensor, y_lengths: torch.tensor, aux_input={"d_vectors": None, "speaker_ids": None, "language_ids": None}, + waveform=None, ) -> Dict: """Forward pass of the model. @@ -621,22 +644,50 @@ class Vits(BaseTTS): m_p = torch.einsum("klmn, kjm -> kjn", [attn, m_p]) logs_p = torch.einsum("klmn, kjm -> kjn", [attn, logs_p]) - # get the z after inverse decoder - # ToDo: test if using m_p the result is better (In the SC-GlowTTS paper we used mp instead z_p) - z_f_pred = self.flow(z_p, y_mask, g=g, reverse=True) + # mode 1: like SC-GlowTTS paper; mode 2: recommended for voice conversion + if self.args.fine_tuning_mode == 1: + z_ft = m_p + elif self.args.fine_tuning_mode == 2: + z_ft = z_p + else: + raise RuntimeError(" [!] 
Invalid Fine Tunning Mode !") + + # inverse decoder and get the output + z_f_pred = self.flow(z_ft, y_mask, g=g, reverse=True) z_slice, slice_ids = rand_segment(z_f_pred, y_lengths, self.spec_segment_size) o = self.waveform_decoder(z_slice, g=g) + + wav_seg = segment( + waveform.transpose(1, 2), + slice_ids * self.config.audio.hop_length, + self.args.spec_segment_size * self.config.audio.hop_length, + ) + + if self.args.use_speaker_encoder_as_loss: + # concate generated and GT waveforms + wavs_batch = torch.cat((wav_seg, o), dim=0).squeeze(1) + pred_embs = self.speaker_encoder.forward(wavs_batch, l2_norm=True) + + # split generated and GT speaker embeddings + gt_spk_emb, syn_spk_emb = torch.chunk(pred_embs, 2, dim=0) + else: + gt_spk_emb, syn_spk_emb = None, None + outputs.update( { "model_outputs": o, "alignments": attn.squeeze(1), + "loss_duration": 0.0, "z": z, "z_p": z_p, "m_p": m_p, "logs_p": logs_p, "m_q": m_q, "logs_q": logs_q, + "waveform_seg": wav_seg, + "gt_spk_emb": gt_spk_emb, + "syn_spk_emb": syn_spk_emb } ) return outputs From 9de45394228b34a537da7c4a1c3f74881036ba43 Mon Sep 17 00:00:00 2001 From: Edresson Date: Sun, 19 Sep 2021 13:29:09 -0300 Subject: [PATCH 130/220] Update the VITS model docs --- TTS/tts/models/vits.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index a9078b26..334e4526 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -159,16 +159,18 @@ class VitsArgs(Coqpit): num_languages (int): Number of languages for the language embedding layer. Defaults to 0. - use_speaker_encoder_as_loss (bool): - + use_speaker_encoder_as_loss (bool): + Enable/Disable Speaker Consistency Loss (SCL). Defaults to False. - use_speaker_encoder_as_loss: bool = False - speaker_encoder_config_path: str = "" - speaker_encoder_model_path: str = "" + speaker_encoder_config_path (str): + Path to the file speaker encoder config file, to use for SCL. Defaults to "". + + speaker_encoder_model_path (str): + Path to the file speaker encoder checkpoint file, to use for SCL. Defaults to "". fine_tuning_mode (int): Fine tuning only the vocoder part of the model, while the rest will be frozen. Defaults to 0. - Mode 0: disabled; + Mode 0: Disabled; Mode 1: uses the distribution predicted by the encoder and It's recommended for TTS; Mode 2: uses the distribution predicted by the encoder and It's recommended for voice conversion. 
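            In both modes the rest of the model runs in eval mode and only the vocoder part is trained;
            mode 1 decodes the aligned prior means `m_p` through the inverted flow, mode 2 decodes `z_p`.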
""" From 005bba60b018a804a0752ec77973b11aba70ab4b Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Sun, 19 Sep 2021 23:34:38 +0200 Subject: [PATCH 131/220] get_speaker_weighted_sampler --- TTS/tts/models/base_tts.py | 5 ++++- TTS/tts/utils/speakers.py | 9 +++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index c03a7df5..9d722222 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -12,7 +12,7 @@ from torch.utils.data.distributed import DistributedSampler from TTS.model import BaseModel from TTS.tts.configs.shared_configs import CharactersConfig from TTS.tts.datasets.dataset import TTSDataset -from TTS.tts.utils.speakers import SpeakerManager, get_speaker_manager +from TTS.tts.utils.speakers import SpeakerManager, get_speaker_manager, get_speaker_weighted_sampler from TTS.tts.utils.languages import LanguageManager, get_language_weighted_sampler from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.text import make_symbols @@ -334,6 +334,9 @@ class BaseTTS(BaseModel): if getattr(config, "use_language_weighted_sampler", False): print(" > Using Language weighted sampler") sampler = get_language_weighted_sampler(dataset.items) + elif getattr(config, "use_speaker_weighted_sampler", False): + print(" > Using Language weighted sampler") + sampler = get_speaker_weighted_sampler(dataset.items) loader = DataLoader( diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 282875af..8ccbdafc 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -431,3 +431,12 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, else: speaker_manager.save_speaker_ids_to_file(out_file_path) return speaker_manager + +def get_speaker_weighted_sampler(items: list): + speaker_names = np.array([item[2] for item in items]) + unique_speaker_names = np.unique(speaker_names).tolist() + speaker_ids = [unique_speaker_names.index(l) for l in speaker_names] + speaker_count = np.array([len(np.where(speaker_names == l)[0]) for l in unique_speaker_names]) + weight_speaker = 1. 
/ speaker_count + dataset_samples_weight = torch.from_numpy(np.array([weight_speaker[l] for l in speaker_ids])).double() + return WeightedRandomSampler(dataset_samples_weight, len(dataset_samples_weight)) \ No newline at end of file From 2b952d8b97d1dca58a92ef8c7bd066d9f66c7205 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Sun, 19 Sep 2021 23:35:31 +0200 Subject: [PATCH 132/220] freeze vits parts --- TTS/tts/models/vits.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 334e4526..c24fec68 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -222,6 +222,9 @@ class VitsArgs(Coqpit): speaker_encoder_config_path: str = "" speaker_encoder_model_path: str = "" fine_tuning_mode: int = 0 + freeze_encoder: bool = False + freeze_DP: bool = False + freeze_PE: bool = False @@ -781,6 +784,20 @@ class Vits(BaseTTS): self.waveform_decoder.train() self.disc.train() + if self.args.freeze_encoder: + for param in self.text_encoder.parameters(): + param.requires_grad = False + for param in self.emb_l.parameters(): + param.requires_grad = False + + if self.args.freeze_PE: + for param in self.posterior_encoder.parameters(): + param.requires_grad = False + + if self.args.freeze_DP: + for param in self.duration_predictor.parameters(): + param.requires_grad = False + if optimizer_idx == 0: text_input = batch["text_input"] text_lengths = batch["text_lengths"] From 1c6bcda950f33de8b09a21436469d00d7e313b7b Mon Sep 17 00:00:00 2001 From: Edresson Date: Sun, 19 Sep 2021 21:06:58 -0300 Subject: [PATCH 133/220] Add freeze vocoder generator and flow-based decoder option --- TTS/tts/models/vits.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index c24fec68..212e7779 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -225,6 +225,8 @@ class VitsArgs(Coqpit): freeze_encoder: bool = False freeze_DP: bool = False freeze_PE: bool = False + freeze_flow_decoder: bool = False + freeze_waveform_decoder: bool = False @@ -787,9 +789,11 @@ class Vits(BaseTTS): if self.args.freeze_encoder: for param in self.text_encoder.parameters(): param.requires_grad = False - for param in self.emb_l.parameters(): - param.requires_grad = False - + + if hasattr(self, 'emb_l'): + for param in self.emb_l.parameters(): + param.requires_grad = False + if self.args.freeze_PE: for param in self.posterior_encoder.parameters(): param.requires_grad = False @@ -798,6 +802,14 @@ class Vits(BaseTTS): for param in self.duration_predictor.parameters(): param.requires_grad = False + if self.args.freeze_flow_decoder: + for param in self.flow.parameters(): + param.requires_grad = False + + if self.args.freeze_waveform_decoder: + for param in self.waveform_decoder.parameters(): + param.requires_grad = False + if optimizer_idx == 0: text_input = batch["text_input"] text_lengths = batch["text_lengths"] From 1bd1a0546b755fedf2afe3bf396b77e51f8d9b74 Mon Sep 17 00:00:00 2001 From: Edresson Date: Tue, 19 Oct 2021 08:07:48 -0300 Subject: [PATCH 134/220] Add audio resample in the speaker consistency loss --- TTS/tts/models/vits.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 212e7779..f72918a5 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -5,7 +5,7 @@ from itertools import chain from typing import Dict, List, Tuple import torch -import math +import 
torchaudio from coqpit import Coqpit from torch import nn from torch.cuda.amp.autocast_mode import autocast @@ -159,12 +159,12 @@ class VitsArgs(Coqpit): num_languages (int): Number of languages for the language embedding layer. Defaults to 0. - use_speaker_encoder_as_loss (bool): + use_speaker_encoder_as_loss (bool): Enable/Disable Speaker Consistency Loss (SCL). Defaults to False. speaker_encoder_config_path (str): Path to the file speaker encoder config file, to use for SCL. Defaults to "". - + speaker_encoder_model_path (str): Path to the file speaker encoder checkpoint file, to use for SCL. Defaults to "". @@ -267,6 +267,7 @@ class Vits(BaseTTS): self.END2END = True self.speaker_manager = speaker_manager + self.audio_config = config["audio"] if config.__class__.__name__ == "VitsConfig": # loading from VitsConfig if "num_chars" not in config: @@ -412,7 +413,13 @@ class Vits(BaseTTS): param.requires_grad = False print(" > External Speaker Encoder Loaded !!") + + if hasattr(self.speaker_encoder, "audio_config") and self.audio_config["sample_rate"] != self.speaker_encoder.audio_config["sample_rate"]: + self.audio_transform = torchaudio.transforms.Resample(orig_freq=self.audio_config["sample_rate"], new_freq=self.speaker_encoder.audio_config["sample_rate"]) + else: + self.audio_transform = None else: + self.audio_transform = None self.speaker_encoder = None def init_multilingual(self, config: Coqpit, data: List = None): @@ -560,9 +567,14 @@ class Vits(BaseTTS): self.args.spec_segment_size * self.config.audio.hop_length, ) - if self.args.use_speaker_encoder_as_loss: + if self.args.use_speaker_encoder_as_loss and self.speaker_encoder is not None: # concate generated and GT waveforms wavs_batch = torch.cat((wav_seg, o), dim=0).squeeze(1) + + # resample audio to speaker encoder sample_rate + if self.audio_transform is not None: + wavs_batch = self.audio_transform(wavs_batch) + pred_embs = self.speaker_encoder.forward(wavs_batch, l2_norm=True) # split generated and GT speaker embeddings @@ -671,9 +683,14 @@ class Vits(BaseTTS): self.args.spec_segment_size * self.config.audio.hop_length, ) - if self.args.use_speaker_encoder_as_loss: + if self.args.use_speaker_encoder_as_loss and self.speaker_encoder is not None: # concate generated and GT waveforms wavs_batch = torch.cat((wav_seg, o), dim=0).squeeze(1) + + # resample audio to speaker encoder sample_rate + if self.audio_transform is not None: + wavs_batch = self.audio_transform(wavs_batch) + pred_embs = self.speaker_encoder.forward(wavs_batch, l2_norm=True) # split generated and GT speaker embeddings From 10ff90d6d27c0161e1abc3c70dc5861b17131d47 Mon Sep 17 00:00:00 2001 From: Edresson Date: Tue, 26 Oct 2021 11:35:18 -0300 Subject: [PATCH 135/220] Add remove silence VAD script --- TTS/bin/remove_silence_using_vad.py | 213 ++++++++++++++++++++++++++++ requirements.txt | 1 + 2 files changed, 214 insertions(+) create mode 100755 TTS/bin/remove_silence_using_vad.py diff --git a/TTS/bin/remove_silence_using_vad.py b/TTS/bin/remove_silence_using_vad.py new file mode 100755 index 00000000..c7541cc8 --- /dev/null +++ b/TTS/bin/remove_silence_using_vad.py @@ -0,0 +1,213 @@ +# This code is adpated from: https://github.com/wiseman/py-webrtcvad/blob/master/example.py +import os +import tqdm +import glob +import argparse +import pathlib + +import collections +import contextlib +import sys +import wave +import numpy as np +import webrtcvad +from tqdm.contrib.concurrent import process_map +import multiprocessing +from itertools import chain + +def 
read_wave(path): + """Reads a .wav file. + + Takes the path, and returns (PCM audio data, sample rate). + """ + with contextlib.closing(wave.open(path, 'rb')) as wf: + num_channels = wf.getnchannels() + assert num_channels == 1 + sample_width = wf.getsampwidth() + assert sample_width == 2 + sample_rate = wf.getframerate() + assert sample_rate in (8000, 16000, 32000, 48000) + pcm_data = wf.readframes(wf.getnframes()) + return pcm_data, sample_rate + + +def write_wave(path, audio, sample_rate): + """Writes a .wav file. + + Takes path, PCM audio data, and sample rate. + """ + with contextlib.closing(wave.open(path, 'wb')) as wf: + wf.setnchannels(1) + wf.setsampwidth(2) + wf.setframerate(sample_rate) + wf.writeframes(audio) + + +class Frame(object): + """Represents a "frame" of audio data.""" + def __init__(self, bytes, timestamp, duration): + self.bytes = bytes + self.timestamp = timestamp + self.duration = duration + + +def frame_generator(frame_duration_ms, audio, sample_rate): + """Generates audio frames from PCM audio data. + + Takes the desired frame duration in milliseconds, the PCM data, and + the sample rate. + + Yields Frames of the requested duration. + """ + n = int(sample_rate * (frame_duration_ms / 1000.0) * 2) + offset = 0 + timestamp = 0.0 + duration = (float(n) / sample_rate) / 2.0 + while offset + n < len(audio): + yield Frame(audio[offset:offset + n], timestamp, duration) + timestamp += duration + offset += n + + +def vad_collector(sample_rate, frame_duration_ms, + padding_duration_ms, vad, frames): + """Filters out non-voiced audio frames. + + Given a webrtcvad.Vad and a source of audio frames, yields only + the voiced audio. + + Uses a padded, sliding window algorithm over the audio frames. + When more than 90% of the frames in the window are voiced (as + reported by the VAD), the collector triggers and begins yielding + audio frames. Then the collector waits until 90% of the frames in + the window are unvoiced to detrigger. + + The window is padded at the front and back to provide a small + amount of silence or the beginnings/endings of speech around the + voiced frames. + + Arguments: + + sample_rate - The audio sample rate, in Hz. + frame_duration_ms - The frame duration in milliseconds. + padding_duration_ms - The amount to pad the window, in milliseconds. + vad - An instance of webrtcvad.Vad. + frames - a source of audio frames (sequence or generator). + + Returns: A generator that yields PCM audio data. + """ + num_padding_frames = int(padding_duration_ms / frame_duration_ms) + # We use a deque for our sliding window/ring buffer. + ring_buffer = collections.deque(maxlen=num_padding_frames) + # We have two states: TRIGGERED and NOTTRIGGERED. We start in the + # NOTTRIGGERED state. + triggered = False + + voiced_frames = [] + for frame in frames: + is_speech = vad.is_speech(frame.bytes, sample_rate) + + # sys.stdout.write('1' if is_speech else '0') + if not triggered: + ring_buffer.append((frame, is_speech)) + num_voiced = len([f for f, speech in ring_buffer if speech]) + # If we're NOTTRIGGERED and more than 90% of the frames in + # the ring buffer are voiced frames, then enter the + # TRIGGERED state. + if num_voiced > 0.9 * ring_buffer.maxlen: + triggered = True + # sys.stdout.write('+(%s)' % (ring_buffer[0][0].timestamp,)) + # We want to yield all the audio we see from now until + # we are NOTTRIGGERED, but we have to start with the + # audio that's already in the ring buffer. 
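+                # (At this point the ring buffer holds roughly the last
+                # `padding_duration_ms` worth of frames, so prepending them keeps a
+                # short stretch of leading context before the detected speech.)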
+ for f, s in ring_buffer: + voiced_frames.append(f) + ring_buffer.clear() + else: + # We're in the TRIGGERED state, so collect the audio data + # and add it to the ring buffer. + voiced_frames.append(frame) + ring_buffer.append((frame, is_speech)) + num_unvoiced = len([f for f, speech in ring_buffer if not speech]) + # If more than 90% of the frames in the ring buffer are + # unvoiced, then enter NOTTRIGGERED and yield whatever + # audio we've collected. + if num_unvoiced > 0.9 * ring_buffer.maxlen: + #sys.stdout.write('-(%s)' % (frame.timestamp + frame.duration)) + triggered = False + yield b''.join([f.bytes for f in voiced_frames]) + ring_buffer.clear() + voiced_frames = [] + # If we have any leftover voiced audio when we run out of input, + # yield it. + if voiced_frames: + yield b''.join([f.bytes for f in voiced_frames]) + +def remove_silence(filepath): + filename = os.path.basename(filepath) + output_path = filepath.replace(os.path.join(args.input_dir, ''),os.path.join(args.output_dir, '')) + # ignore if the file exists + if os.path.exists(output_path) and not args.force: + return False + # create all directory structure + pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True) + padding_duration_ms = 300 # default 300 + audio, sample_rate = read_wave(filepath) + vad = webrtcvad.Vad(int(args.aggressiveness)) + frames = frame_generator(30, audio, sample_rate) + frames = list(frames) + segments = vad_collector(sample_rate, 30, padding_duration_ms, vad, frames) + flag = False + segments = list(segments) + num_segments = len(segments) + + if num_segments != 0: + for i, segment in reversed(list(enumerate(segments))): + if i >= 1: + if flag == False: + concat_segment = segment + flag = True + else: + concat_segment = segment + concat_segment + else: + if flag: + segment = segment + concat_segment + write_wave(output_path, segment, sample_rate) + print(output_path) + return True + else: + print("> Just Copying the file to:", output_path) + # if fail to remove silence just write the file + write_wave(output_path, audio, sample_rate) + +def preprocess_audios(): + files = sorted(glob.glob(os.path.join(args.input_dir, args.glob), recursive=True)) + print("> Number of files: ", len(files)) + if not args.force: + print("> Ignoring files that already exist in the output directory.") + + if files: + # create threads + num_threads = multiprocessing.cpu_count() + process_map(remove_silence, files, max_workers=num_threads, chunksize=15) + else: + print("> No files Found !") + +if __name__ == "__main__": + """ + usage + python remove_silence.py -i=VCTK-Corpus-bk/ -o=../VCTK-Corpus-removed-silence -g=wav48/*/*.wav -a=2 + """ + parser = argparse.ArgumentParser() + parser.add_argument('-i', '--input_dir', type=str, default='../VCTK-Corpus', + help='Dataset root dir') + parser.add_argument('-o', '--output_dir', type=str, default='../VCTK-Corpus-removed-silence', + help='Output Dataset dir') + parser.add_argument('-f', '--force', type=bool, default=True, + help='Force the replace of exists files') + parser.add_argument('-g', '--glob', type=str, default='**/*.wav', + help='path in glob format for acess wavs from input_dir. ex: wav48/*/*.wav') + parser.add_argument('-a', '--aggressiveness', type=int, default=2, + help='set its aggressiveness mode, which is an integer between 0 and 3. 
0 is the least aggressive about filtering out non-speech, 3 is the most aggressive.') + args = parser.parse_args() + preprocess_audios() diff --git a/requirements.txt b/requirements.txt index 3ec33ceb..140cf743 100644 --- a/requirements.txt +++ b/requirements.txt @@ -26,3 +26,4 @@ unidic-lite==1.0.8 gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=2.0.0 fsspec>=2021.04.0 pyworld +webrtcvad \ No newline at end of file From b3abd01793f3370ac47a585190f4b4f8ace8cc3c Mon Sep 17 00:00:00 2001 From: Julian WEBER Date: Wed, 27 Oct 2021 11:54:05 +0200 Subject: [PATCH 136/220] Merge dataset --- TTS/tts/datasets/dataset.py | 123 ------------------------------------ 1 file changed, 123 deletions(-) diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index 78c6c33d..ccfa70f1 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -56,10 +56,6 @@ class TTSDataset(Dataset): meta_data (list): List of dataset instances. - compute_f0 (bool): compute f0 if True. Defaults to False. - - f0_cache_path (str): Path to store f0 cache. Defaults to None. - characters (dict): `dict` of custom text characters used for converting texts to sequences. custom_symbols (list): List of custom symbols used for converting texts to sequences. Models using its own @@ -109,8 +105,6 @@ class TTSDataset(Dataset): self.cleaners = text_cleaner self.compute_linear_spec = compute_linear_spec self.return_wav = return_wav - self.compute_f0 = compute_f0 - self.f0_cache_path = f0_cache_path self.min_seq_len = min_seq_len self.max_seq_len = max_seq_len self.ap = ap @@ -339,7 +333,6 @@ class TTSDataset(Dataset): else: lengths = np.array([len(ins[0]) for ins in self.items]) - # sort items based on the sequence length in ascending order idxs = np.argsort(lengths) new_items = [] ignored = [] @@ -349,10 +342,7 @@ class TTSDataset(Dataset): ignored.append(idx) else: new_items.append(self.items[idx]) - # shuffle batch groups - # create batches with similar length items - # the larger the `batch_group_size`, the higher the length variety in a batch. if self.batch_group_size > 0: for i in range(len(new_items) // self.batch_group_size): offset = i * self.batch_group_size @@ -360,14 +350,8 @@ class TTSDataset(Dataset): temp_items = new_items[offset:end_offset] random.shuffle(temp_items) new_items[offset:end_offset] = temp_items - - if len(new_items) == 0: - raise RuntimeError(" [!] No items left after filtering.") - - # update items to the new sorted items self.items = new_items - # logging if self.verbose: print(" | > Max length sequence: {}".format(np.max(lengths))) print(" | > Min length sequence: {}".format(np.min(lengths))) @@ -554,110 +538,3 @@ class TTSDataset(Dataset): ) ) ) - - -class PitchExtractor: - """Pitch Extractor for computing F0 from wav files. - - Args: - items (List[List]): Dataset samples. - verbose (bool): Whether to print the progress. 
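Taken together, the helpers in the script above reduce to a short single-file pipeline. A minimal sketch, assuming webrtcvad is installed and "speech.wav" is a mono, 16-bit, 16 kHz WAV; the path, output name and aggressiveness value are only placeholders:

import webrtcvad

audio, sample_rate = read_wave("speech.wav")
vad = webrtcvad.Vad(2)                                    # aggressiveness 0-3
frames = list(frame_generator(30, audio, sample_rate))    # 30 ms frames
segments = vad_collector(sample_rate, 30, 300, vad, frames)
# concatenate the voiced chunks, dropping the silence in between
write_wave("speech_trimmed.wav", b"".join(segments), sample_rate)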
- """ - - def __init__( - self, - items: List[List], - verbose=False, - ): - self.items = items - self.verbose = verbose - self.mean = None - self.std = None - - @staticmethod - def create_pitch_file_path(wav_file, cache_path): - file_name = os.path.splitext(os.path.basename(wav_file))[0] - pitch_file = os.path.join(cache_path, file_name + "_pitch.npy") - return pitch_file - - @staticmethod - def _compute_and_save_pitch(ap, wav_file, pitch_file=None): - wav = ap.load_wav(wav_file) - pitch = ap.compute_f0(wav) - if pitch_file: - np.save(pitch_file, pitch) - return pitch - - @staticmethod - def compute_pitch_stats(pitch_vecs): - nonzeros = np.concatenate([v[np.where(v != 0.0)[0]] for v in pitch_vecs]) - mean, std = np.mean(nonzeros), np.std(nonzeros) - return mean, std - - def normalize_pitch(self, pitch): - zero_idxs = np.where(pitch == 0.0)[0] - pitch = pitch - self.mean - pitch = pitch / self.std - pitch[zero_idxs] = 0.0 - return pitch - - def denormalize_pitch(self, pitch): - zero_idxs = np.where(pitch == 0.0)[0] - pitch *= self.std - pitch += self.mean - pitch[zero_idxs] = 0.0 - return pitch - - @staticmethod - def load_or_compute_pitch(ap, wav_file, cache_path): - """ - compute pitch and return a numpy array of pitch values - """ - pitch_file = PitchExtractor.create_pitch_file_path(wav_file, cache_path) - if not os.path.exists(pitch_file): - pitch = PitchExtractor._compute_and_save_pitch(ap, wav_file, pitch_file) - else: - pitch = np.load(pitch_file) - return pitch.astype(np.float32) - - @staticmethod - def _pitch_worker(args): - item = args[0] - ap = args[1] - cache_path = args[2] - _, wav_file, *_ = item - pitch_file = PitchExtractor.create_pitch_file_path(wav_file, cache_path) - if not os.path.exists(pitch_file): - pitch = PitchExtractor._compute_and_save_pitch(ap, wav_file, pitch_file) - return pitch - return None - - def compute_pitch(self, ap, cache_path, num_workers=0): - """Compute the input sequences with multi-processing. 
- Call it before passing dataset to the data loader to cache the input sequences for faster data loading.""" - if not os.path.exists(cache_path): - os.makedirs(cache_path, exist_ok=True) - - if self.verbose: - print(" | > Computing pitch features ...") - if num_workers == 0: - pitch_vecs = [] - for _, item in enumerate(tqdm.tqdm(self.items)): - pitch_vecs += [self._pitch_worker([item, ap, cache_path])] - else: - with Pool(num_workers) as p: - pitch_vecs = list( - tqdm.tqdm( - p.imap(PitchExtractor._pitch_worker, [[item, ap, cache_path] for item in self.items]), - total=len(self.items), - ) - ) - pitch_mean, pitch_std = self.compute_pitch_stats(pitch_vecs) - pitch_stats = {"mean": pitch_mean, "std": pitch_std} - np.save(os.path.join(cache_path, "pitch_stats"), pitch_stats, allow_pickle=True) - - def load_pitch_stats(self, cache_path): - stats_path = os.path.join(cache_path, "pitch_stats.npy") - stats = np.load(stats_path, allow_pickle=True).item() - self.mean = stats["mean"].astype(np.float32) - self.std = stats["std"].astype(np.float32) From 9a2f91327c22aa33b4a976cbb8ef6993eb81e70c Mon Sep 17 00:00:00 2001 From: Julian WEBER Date: Wed, 27 Oct 2021 12:02:02 +0200 Subject: [PATCH 137/220] get_aux_input --- TTS/tts/models/vits.py | 64 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 62 insertions(+), 2 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index f72918a5..078d4973 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -387,6 +387,25 @@ class Vits(BaseTTS): if config.use_d_vector_file: self._init_d_vector(config) + # TODO: make this a function + if config.use_speaker_encoder_as_loss: + if not config.speaker_encoder_model_path or not config.speaker_encoder_config_path: + raise RuntimeError(" [!] To use the speaker encoder loss you need to specify speaker_encoder_model_path and speaker_encoder_config_path !!") + self.speaker_manager.init_speaker_encoder(config.speaker_encoder_model_path, config.speaker_encoder_config_path) + self.speaker_encoder = self.speaker_manager.speaker_encoder.train() + for param in self.speaker_encoder.parameters(): + param.requires_grad = False + + print(" > External Speaker Encoder Loaded !!") + + if hasattr(self.speaker_encoder, "audio_config") and self.audio_config["sample_rate"] != self.speaker_encoder.audio_config["sample_rate"]: + self.audio_transform = torchaudio.transforms.Resample(orig_freq=self.audio_config["sample_rate"], new_freq=self.speaker_encoder.audio_config["sample_rate"]) + else: + self.audio_transform = None + else: + self.audio_transform = None + self.speaker_encoder = None + def _init_speaker_embedding(self, config): # pylint: disable=attribute-defined-outside-init if config.speakers_file is not None: @@ -469,8 +488,49 @@ class Vits(BaseTTS): return sid, g, lid def get_aux_input(self, aux_input: Dict): - sid, g = self._set_cond_input(aux_input) - return {"speaker_id": sid, "style_wav": None, "d_vector": g} + sid, g, lid = self._set_cond_input(aux_input) + return {"speaker_id": sid, "style_wav": None, "d_vector": g, "language_id": lid} + + def get_aux_input_from_test_setences(self, sentence_info): + if hasattr(self.config, "model_args"): + config = self.config.model_args + else: + config = self.config + + # extract speaker and language info + text, speaker_name, style_wav, language_name = None, None, None, None + + if isinstance(sentence_info, list): + if len(sentence_info) == 1: + text = sentence_info[0] + elif len(sentence_info) == 2: + text, speaker_name = sentence_info + elif 
len(sentence_info) == 3: + text, speaker_name, style_wav = sentence_info + elif len(sentence_info) == 4: + text, speaker_name, style_wav, language_name = sentence_info + else: + text = sentence_info + + # get speaker id/d_vector + speaker_id, d_vector, language_id = None, None, None + if hasattr(self, "speaker_manager"): + if config.use_d_vector_file: + if speaker_name is None: + d_vector = self.speaker_manager.get_random_d_vector() + else: + d_vector = self.speaker_manager.get_d_vector_by_speaker(speaker_name) + elif config.use_speaker_embedding: + if speaker_name is None: + speaker_id = self.speaker_manager.get_random_speaker_id() + else: + speaker_id = self.speaker_manager.speaker_ids[speaker_name] + + # get language id + if hasattr(self, "language_manager") and config.use_language_embedding and language_name is not None: + language_id = self.language_manager.language_id_mapping[language_name] + + return {"text": text, "speaker_id": speaker_id, "style_wav": style_wav, "d_vector": d_vector, "language_id": language_id} def forward( self, From 78c2d12a91704cb6bcb4746755df0097d395c06c Mon Sep 17 00:00:00 2001 From: Julian WEBER Date: Wed, 27 Oct 2021 13:40:11 +0200 Subject: [PATCH 138/220] PitchExtractor --- TTS/tts/datasets/dataset.py | 106 ++++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index ccfa70f1..635ffb38 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -105,6 +105,7 @@ class TTSDataset(Dataset): self.cleaners = text_cleaner self.compute_linear_spec = compute_linear_spec self.return_wav = return_wav + self.compute_f0 = compute_f0 self.min_seq_len = min_seq_len self.max_seq_len = max_seq_len self.ap = ap @@ -538,3 +539,108 @@ class TTSDataset(Dataset): ) ) ) + +class PitchExtractor: + """Pitch Extractor for computing F0 from wav files. + Args: + items (List[List]): Dataset samples. + verbose (bool): Whether to print the progress. 
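As a side note, the sentence_info layouts parsed by get_aux_input_from_test_setences above map one-to-one onto how test_sentences entries can be written in the config. A minimal sketch, where the speaker name, reference wav and language code are only placeholders:

test_sentences = [
    "It took me quite a long time to develop a voice.",        # text only
    ["Be a voice, not an echo.", "VCTK_p225"],                  # + speaker name
    ["Prior to November twenty-two.", "VCTK_p225", "ref.wav"],  # + style wav
    ["This cake is great.", "VCTK_p225", None, "en"],           # + language name
]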
+ """ + + def __init__( + self, + items: List[List], + verbose=False, + ): + self.items = items + self.verbose = verbose + self.mean = None + self.std = None + + @staticmethod + def create_pitch_file_path(wav_file, cache_path): + file_name = os.path.splitext(os.path.basename(wav_file))[0] + pitch_file = os.path.join(cache_path, file_name + "_pitch.npy") + return pitch_file + + @staticmethod + def _compute_and_save_pitch(ap, wav_file, pitch_file=None): + wav = ap.load_wav(wav_file) + pitch = ap.compute_f0(wav) + if pitch_file: + np.save(pitch_file, pitch) + return pitch + + @staticmethod + def compute_pitch_stats(pitch_vecs): + nonzeros = np.concatenate([v[np.where(v != 0.0)[0]] for v in pitch_vecs]) + mean, std = np.mean(nonzeros), np.std(nonzeros) + return mean, std + + def normalize_pitch(self, pitch): + zero_idxs = np.where(pitch == 0.0)[0] + pitch = pitch - self.mean + pitch = pitch / self.std + pitch[zero_idxs] = 0.0 + return pitch + + def denormalize_pitch(self, pitch): + zero_idxs = np.where(pitch == 0.0)[0] + pitch *= self.std + pitch += self.mean + pitch[zero_idxs] = 0.0 + return pitch + + @staticmethod + def load_or_compute_pitch(ap, wav_file, cache_path): + """ + compute pitch and return a numpy array of pitch values + """ + pitch_file = PitchExtractor.create_pitch_file_path(wav_file, cache_path) + if not os.path.exists(pitch_file): + pitch = PitchExtractor._compute_and_save_pitch(ap, wav_file, pitch_file) + else: + pitch = np.load(pitch_file) + return pitch.astype(np.float32) + + @staticmethod + def _pitch_worker(args): + item = args[0] + ap = args[1] + cache_path = args[2] + _, wav_file, *_ = item + pitch_file = PitchExtractor.create_pitch_file_path(wav_file, cache_path) + if not os.path.exists(pitch_file): + pitch = PitchExtractor._compute_and_save_pitch(ap, wav_file, pitch_file) + return pitch + return None + + def compute_pitch(self, ap, cache_path, num_workers=0): + """Compute the input sequences with multi-processing. 
+ Call it before passing dataset to the data loader to cache the input sequences for faster data loading.""" + if not os.path.exists(cache_path): + os.makedirs(cache_path, exist_ok=True) + + if self.verbose: + print(" | > Computing pitch features ...") + if num_workers == 0: + pitch_vecs = [] + for _, item in enumerate(tqdm.tqdm(self.items)): + pitch_vecs += [self._pitch_worker([item, ap, cache_path])] + else: + with Pool(num_workers) as p: + pitch_vecs = list( + tqdm.tqdm( + p.imap(PitchExtractor._pitch_worker, [[item, ap, cache_path] for item in self.items]), + total=len(self.items), + ) + ) + pitch_mean, pitch_std = self.compute_pitch_stats(pitch_vecs) + pitch_stats = {"mean": pitch_mean, "std": pitch_std} + np.save(os.path.join(cache_path, "pitch_stats"), pitch_stats, allow_pickle=True) + + def load_pitch_stats(self, cache_path): + stats_path = os.path.join(cache_path, "pitch_stats.npy") + stats = np.load(stats_path, allow_pickle=True).item() + self.mean = stats["mean"].astype(np.float32) + self.std = stats["std"].astype(np.float32) \ No newline at end of file From 2a2b5767c2671b625108a5dc60b7bb815e3c1135 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Wed, 27 Oct 2021 13:45:49 +0200 Subject: [PATCH 139/220] fix collate_fn --- TTS/tts/datasets/dataset.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index 635ffb38..38af1469 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -402,8 +402,6 @@ class TTSDataset(Dataset): # convert list of dicts to dict of lists batch = {k: [dic[k] for dic in batch] for k in batch[0]} - speaker_names = [batch[idx]["speaker_name"] for idx in ids_sorted_decreasing] - # get language ids from language names if self.language_id_mapping is not None: language_names = [batch[idx]["language_name"] for idx in ids_sorted_decreasing] From 3b5592abcf41fffcb0c17858167bbd9228fbd970 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Fri, 29 Oct 2021 17:09:10 +0200 Subject: [PATCH 140/220] fix test vits --- TTS/trainer.py | 2 +- TTS/tts/configs/vits_config.py | 22 +++---------------- TTS/tts/datasets/dataset.py | 3 +-- TTS/tts/models/base_tts.py | 9 ++++++-- TTS/tts/models/vits.py | 5 +---- tests/tts_tests/test_vits_d-vectors_train.py | 3 +-- .../tts_tests/test_vits_multilingual_train.py | 3 ++- .../tts_tests/test_vits_speaker_emb_train.py | 2 +- 8 files changed, 17 insertions(+), 32 deletions(-) diff --git a/TTS/trainer.py b/TTS/trainer.py index e8911ba3..665f2589 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -261,7 +261,7 @@ class Trainer: self.run_get_model(self.config, get_model) if hasattr(self.model, "init_multilingual"): - self.model.init_multilingual(self.config, self.data_train + self.data_eval) + self.model.init_multilingual(self.config, self.train_samples + self.eval_samples) config = self.config.model_args if hasattr(self.config, "model_args") else self.config # save speakers json if config.use_language_embedding and self.model.language_manager.num_languages > 1: diff --git a/TTS/tts/configs/vits_config.py b/TTS/tts/configs/vits_config.py index ece414a6..a6f2210d 100644 --- a/TTS/tts/configs/vits_config.py +++ b/TTS/tts/configs/vits_config.py @@ -154,22 +154,6 @@ class VitsConfig(BaseTTSConfig): d_vector_dim: int = None def __post_init__(self): - # Pass multi-speaker parameters to the model args as `model.init_multispeaker()` looks for it there. 
- if self.num_speakers > 0: - self.model_args.num_speakers = self.num_speakers - - # speaker embedding settings - if self.use_speaker_embedding: - self.model_args.use_speaker_embedding = True - if self.speakers_file: - self.model_args.speakers_file = self.speakers_file - if self.speaker_embedding_channels: - self.model_args.speaker_embedding_channels = self.speaker_embedding_channels - - # d-vector settings - if self.use_d_vector_file: - self.model_args.use_d_vector_file = True - if self.d_vector_dim is not None and self.d_vector_dim > 0: - self.model_args.d_vector_dim = self.d_vector_dim - if self.d_vector_file: - self.model_args.d_vector_file = self.d_vector_file + for key in self.model_args.keys(): + if hasattr(self, key): + self[key] = self.model_args[key] diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index 38af1469..c2818897 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -404,8 +404,7 @@ class TTSDataset(Dataset): # get language ids from language names if self.language_id_mapping is not None: - language_names = [batch[idx]["language_name"] for idx in ids_sorted_decreasing] - language_ids = [self.language_id_mapping[ln] for ln in language_names] + language_ids = [self.language_id_mapping[ln] for ln in batch["language_name"]] else: language_ids = None # get pre-computed d-vectors diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index 9d722222..df6c52f3 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -245,8 +245,13 @@ class BaseTTS(BaseModel): # setup multi-speaker attributes if hasattr(self, "speaker_manager") and self.speaker_manager is not None: - speaker_id_mapping = self.speaker_manager.speaker_ids if config.use_speaker_embedding else None - d_vector_mapping = self.speaker_manager.d_vectors if config.use_d_vector_file else None + if hasattr(config, "model_args"): + speaker_id_mapping = self.speaker_manager.speaker_ids if config.model_args.use_speaker_embedding else None + d_vector_mapping = self.speaker_manager.d_vectors if config.model_args.use_d_vector_file else None + config.use_d_vector_file = config.model_args.use_d_vector_file + else: + speaker_id_mapping = self.speaker_manager.speaker_ids if config.use_speaker_embedding else None + d_vector_mapping = self.speaker_manager.d_vectors if config.use_d_vector_file else None else: speaker_id_mapping = None d_vector_mapping = None diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 078d4973..bc503cb5 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -376,8 +376,7 @@ class Vits(BaseTTS): data (List, optional): Dataset items to infer number of speakers. Defaults to None. 
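The rewritten VitsConfig.__post_init__ above replaces the hand-written field copying with a generic sync loop: only keys that exist on both the top-level config and model_args are touched, and the model_args value wins. A minimal plain-dict analogue of the effect (the real classes are Coqpit dataclasses, and the keys here are only illustrative):

config = {"use_d_vector_file": False, "d_vector_dim": None, "run_eval": True}
model_args = {"use_d_vector_file": True, "d_vector_dim": 256, "num_chars": 120}
for key in model_args:
    if key in config:                  # hasattr(self, key) in the real code
        config[key] = model_args[key]
# config -> {"use_d_vector_file": True, "d_vector_dim": 256, "run_eval": True}
# "num_chars" is skipped because this illustrative config has no such field.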
""" self.embedded_speaker_dim = 0 - if hasattr(config, "model_args"): - config = config.model_args + config = config.model_args self.num_speakers = config.num_speakers @@ -1033,7 +1032,6 @@ class Vits(BaseTTS): test_audios = {} test_figures = {} test_sentences = self.config.test_sentences - for idx, s_info in enumerate(test_sentences): try: aux_inputs = self.get_aux_input_from_test_setences(s_info) @@ -1051,7 +1049,6 @@ class Vits(BaseTTS): use_griffin_lim=True, do_trim_silence=False, ).values() - test_audios["{}-audio".format(idx)] = wav test_figures["{}-alignment".format(idx)] = plot_alignment(alignment.T, output_fig=False) except: # pylint: disable=bare-except diff --git a/tests/tts_tests/test_vits_d-vectors_train.py b/tests/tts_tests/test_vits_d-vectors_train.py index af0e0eba..213669f5 100644 --- a/tests/tts_tests/test_vits_d-vectors_train.py +++ b/tests/tts_tests/test_vits_d-vectors_train.py @@ -3,7 +3,7 @@ import os import shutil from tests import get_device_id, get_tests_output_path, run_cli -from TTS.tts.configs import VitsConfig +from TTS.tts.configs.vits_config import VitsConfig config_path = os.path.join(get_tests_output_path(), "test_model_config.json") output_path = os.path.join(get_tests_output_path(), "train_outputs") @@ -33,7 +33,6 @@ config.audio.do_trim_silence = True config.audio.trim_db = 60 # active multispeaker d-vec mode -config.model_args.use_speaker_embedding = True config.model_args.use_d_vector_file = True config.model_args.d_vector_file = "tests/data/ljspeech/speakers.json" config.model_args.d_vector_dim = 256 diff --git a/tests/tts_tests/test_vits_multilingual_train.py b/tests/tts_tests/test_vits_multilingual_train.py index 10e66b81..664de57e 100644 --- a/tests/tts_tests/test_vits_multilingual_train.py +++ b/tests/tts_tests/test_vits_multilingual_train.py @@ -3,7 +3,8 @@ import os import shutil from tests import get_device_id, get_tests_output_path, run_cli -from TTS.tts.configs import BaseDatasetConfig, VitsConfig +from TTS.tts.configs.vits_config import VitsConfig +from TTS.config.shared_configs import BaseDatasetConfig config_path = os.path.join(get_tests_output_path(), "test_model_config.json") output_path = os.path.join(get_tests_output_path(), "train_outputs") diff --git a/tests/tts_tests/test_vits_speaker_emb_train.py b/tests/tts_tests/test_vits_speaker_emb_train.py index 7028a983..6cc1dabd 100644 --- a/tests/tts_tests/test_vits_speaker_emb_train.py +++ b/tests/tts_tests/test_vits_speaker_emb_train.py @@ -3,7 +3,7 @@ import os import shutil from tests import get_device_id, get_tests_output_path, run_cli -from TTS.tts.configs import VitsConfig +from TTS.tts.configs.vits_config import VitsConfig config_path = os.path.join(get_tests_output_path(), "test_model_config.json") output_path = os.path.join(get_tests_output_path(), "train_outputs") From 080480672760cf649416a459ae6d06b2e75c1929 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Fri, 29 Oct 2021 19:05:26 +0200 Subject: [PATCH 141/220] fix f0_cache_path in dataset --- TTS/tts/datasets/dataset.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index c2818897..fc51c766 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -56,6 +56,10 @@ class TTSDataset(Dataset): meta_data (list): List of dataset instances. + compute_f0 (bool): compute f0 if True. Defaults to False. + + f0_cache_path (str): Path to store f0 cache. Defaults to None. 
+ characters (dict): `dict` of custom text characters used for converting texts to sequences. custom_symbols (list): List of custom symbols used for converting texts to sequences. Models using its own @@ -106,6 +110,7 @@ class TTSDataset(Dataset): self.compute_linear_spec = compute_linear_spec self.return_wav = return_wav self.compute_f0 = compute_f0 + self.f0_cache_path = f0_cache_path self.min_seq_len = min_seq_len self.max_seq_len = max_seq_len self.ap = ap From 4d721bcabdef0ad56a9b3eb7fa6a35f813b7451b Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Tue, 2 Nov 2021 16:33:40 +0100 Subject: [PATCH 142/220] fix test sentence synthesis --- TTS/tts/utils/synthesis.py | 1 + 1 file changed, 1 insertion(+) diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 63fe92c3..6d998492 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -175,6 +175,7 @@ def embedding_to_torch(d_vector, cuda=False): if d_vector is not None: d_vector = np.asarray(d_vector) d_vector = torch.from_numpy(d_vector).type(torch.FloatTensor) + d_vector = d_vector.squeeze().unsqueeze(0) if cuda: return d_vector.cuda() return d_vector From e22f7a2acaf399058c2e76665646b6e5fdae9064 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Tue, 2 Nov 2021 17:30:20 +0100 Subject: [PATCH 143/220] Add torchaudio in requirements.txt --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 140cf743..cf4798b2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -26,4 +26,5 @@ unidic-lite==1.0.8 gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=2.0.0 fsspec>=2021.04.0 pyworld -webrtcvad \ No newline at end of file +webrtcvad +torchaudio>=0.7 From 1472b6df4980f40374d18a6e9f2f566e72c4f36a Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Tue, 2 Nov 2021 17:31:14 +0100 Subject: [PATCH 144/220] make style --- TTS/bin/find_unique_phonemes.py | 14 ++-- TTS/bin/remove_silence_using_vad.py | 71 +++++++++++-------- TTS/speaker_encoder/models/resnet.py | 21 ++++-- TTS/speaker_encoder/utils/generic_utils.py | 6 +- TTS/trainer.py | 4 +- TTS/tts/datasets/dataset.py | 3 +- TTS/tts/datasets/formatters.py | 2 +- TTS/tts/layers/losses.py | 6 +- .../vits/stochastic_duration_predictor.py | 9 ++- TTS/tts/models/base_tts.py | 8 ++- TTS/tts/models/vits.py | 61 ++++++++++------ TTS/tts/utils/languages.py | 18 +++-- TTS/tts/utils/speakers.py | 5 +- TTS/tts/utils/text/cleaners.py | 3 +- .../tts_tests/test_vits_multilingual_train.py | 14 +++- 15 files changed, 158 insertions(+), 87 deletions(-) diff --git a/TTS/bin/find_unique_phonemes.py b/TTS/bin/find_unique_phonemes.py index 7ed79b36..bbc88fb6 100644 --- a/TTS/bin/find_unique_phonemes.py +++ b/TTS/bin/find_unique_phonemes.py @@ -1,14 +1,15 @@ """Find all the unique characters in a dataset""" import argparse +import multiprocessing from argparse import RawTextHelpFormatter +import numpy +from tqdm.contrib.concurrent import process_map + from TTS.config import load_config from TTS.tts.datasets import load_meta_data - -import numpy -import multiprocessing from TTS.tts.utils.text import text2phone -from tqdm.contrib.concurrent import process_map + def compute_phonemes(item): try: @@ -18,7 +19,8 @@ def compute_phonemes(item): except: return [] return list(set(ph)) - + + def main(): global c # pylint: disable=bad-option-value @@ -51,8 +53,6 @@ def main(): phones_force_lower = [c.lower() for c in phones] phones_force_lower = set(phones_force_lower) - - print(f" > Number of unique phonemes: {len(phones)}") print(f" > Unique 
phonemes: {''.join(sorted(phones))}") print(f" > Unique lower phonemes: {''.join(sorted(lower_phones))}") diff --git a/TTS/bin/remove_silence_using_vad.py b/TTS/bin/remove_silence_using_vad.py index c7541cc8..25ae26ef 100755 --- a/TTS/bin/remove_silence_using_vad.py +++ b/TTS/bin/remove_silence_using_vad.py @@ -1,26 +1,27 @@ # This code is adpated from: https://github.com/wiseman/py-webrtcvad/blob/master/example.py -import os -import tqdm -import glob import argparse -import pathlib - import collections import contextlib +import glob +import multiprocessing +import os +import pathlib import sys import wave +from itertools import chain + import numpy as np +import tqdm import webrtcvad from tqdm.contrib.concurrent import process_map -import multiprocessing -from itertools import chain + def read_wave(path): """Reads a .wav file. Takes the path, and returns (PCM audio data, sample rate). """ - with contextlib.closing(wave.open(path, 'rb')) as wf: + with contextlib.closing(wave.open(path, "rb")) as wf: num_channels = wf.getnchannels() assert num_channels == 1 sample_width = wf.getsampwidth() @@ -36,7 +37,7 @@ def write_wave(path, audio, sample_rate): Takes path, PCM audio data, and sample rate. """ - with contextlib.closing(wave.open(path, 'wb')) as wf: + with contextlib.closing(wave.open(path, "wb")) as wf: wf.setnchannels(1) wf.setsampwidth(2) wf.setframerate(sample_rate) @@ -45,6 +46,7 @@ def write_wave(path, audio, sample_rate): class Frame(object): """Represents a "frame" of audio data.""" + def __init__(self, bytes, timestamp, duration): self.bytes = bytes self.timestamp = timestamp @@ -64,13 +66,12 @@ def frame_generator(frame_duration_ms, audio, sample_rate): timestamp = 0.0 duration = (float(n) / sample_rate) / 2.0 while offset + n < len(audio): - yield Frame(audio[offset:offset + n], timestamp, duration) + yield Frame(audio[offset : offset + n], timestamp, duration) timestamp += duration offset += n -def vad_collector(sample_rate, frame_duration_ms, - padding_duration_ms, vad, frames): +def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, frames): """Filters out non-voiced audio frames. Given a webrtcvad.Vad and a source of audio frames, yields only @@ -133,25 +134,26 @@ def vad_collector(sample_rate, frame_duration_ms, # unvoiced, then enter NOTTRIGGERED and yield whatever # audio we've collected. if num_unvoiced > 0.9 * ring_buffer.maxlen: - #sys.stdout.write('-(%s)' % (frame.timestamp + frame.duration)) + # sys.stdout.write('-(%s)' % (frame.timestamp + frame.duration)) triggered = False - yield b''.join([f.bytes for f in voiced_frames]) + yield b"".join([f.bytes for f in voiced_frames]) ring_buffer.clear() voiced_frames = [] # If we have any leftover voiced audio when we run out of input, # yield it. 
if voiced_frames: - yield b''.join([f.bytes for f in voiced_frames]) + yield b"".join([f.bytes for f in voiced_frames]) + def remove_silence(filepath): filename = os.path.basename(filepath) - output_path = filepath.replace(os.path.join(args.input_dir, ''),os.path.join(args.output_dir, '')) - # ignore if the file exists + output_path = filepath.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, "")) + # ignore if the file exists if os.path.exists(output_path) and not args.force: return False # create all directory structure pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True) - padding_duration_ms = 300 # default 300 + padding_duration_ms = 300 # default 300 audio, sample_rate = read_wave(filepath) vad = webrtcvad.Vad(int(args.aggressiveness)) frames = frame_generator(30, audio, sample_rate) @@ -180,6 +182,7 @@ def remove_silence(filepath): # if fail to remove silence just write the file write_wave(output_path, audio, sample_rate) + def preprocess_audios(): files = sorted(glob.glob(os.path.join(args.input_dir, args.glob), recursive=True)) print("> Number of files: ", len(files)) @@ -193,21 +196,31 @@ def preprocess_audios(): else: print("> No files Found !") + if __name__ == "__main__": """ usage - python remove_silence.py -i=VCTK-Corpus-bk/ -o=../VCTK-Corpus-removed-silence -g=wav48/*/*.wav -a=2 + python remove_silence.py -i=VCTK-Corpus-bk/ -o=../VCTK-Corpus-removed-silence -g=wav48/*/*.wav -a=2 """ parser = argparse.ArgumentParser() - parser.add_argument('-i', '--input_dir', type=str, default='../VCTK-Corpus', - help='Dataset root dir') - parser.add_argument('-o', '--output_dir', type=str, default='../VCTK-Corpus-removed-silence', - help='Output Dataset dir') - parser.add_argument('-f', '--force', type=bool, default=True, - help='Force the replace of exists files') - parser.add_argument('-g', '--glob', type=str, default='**/*.wav', - help='path in glob format for acess wavs from input_dir. ex: wav48/*/*.wav') - parser.add_argument('-a', '--aggressiveness', type=int, default=2, - help='set its aggressiveness mode, which is an integer between 0 and 3. 0 is the least aggressive about filtering out non-speech, 3 is the most aggressive.') + parser.add_argument("-i", "--input_dir", type=str, default="../VCTK-Corpus", help="Dataset root dir") + parser.add_argument( + "-o", "--output_dir", type=str, default="../VCTK-Corpus-removed-silence", help="Output Dataset dir" + ) + parser.add_argument("-f", "--force", type=bool, default=True, help="Force the replace of exists files") + parser.add_argument( + "-g", + "--glob", + type=str, + default="**/*.wav", + help="path in glob format for acess wavs from input_dir. ex: wav48/*/*.wav", + ) + parser.add_argument( + "-a", + "--aggressiveness", + type=int, + default=2, + help="set its aggressiveness mode, which is an integer between 0 and 3. 
0 is the least aggressive about filtering out non-speech, 3 is the most aggressive.", + ) args = parser.parse_args() preprocess_audios() diff --git a/TTS/speaker_encoder/models/resnet.py b/TTS/speaker_encoder/models/resnet.py index beeb5ae1..42f041b4 100644 --- a/TTS/speaker_encoder/models/resnet.py +++ b/TTS/speaker_encoder/models/resnet.py @@ -5,20 +5,20 @@ import torch.nn as nn from TTS.utils.io import load_fsspec + class PreEmphasis(torch.nn.Module): def __init__(self, coefficient=0.97): super().__init__() self.coefficient = coefficient - self.register_buffer( - 'filter', torch.FloatTensor([-self.coefficient, 1.]).unsqueeze(0).unsqueeze(0) - ) + self.register_buffer("filter", torch.FloatTensor([-self.coefficient, 1.0]).unsqueeze(0).unsqueeze(0)) def forward(self, x): assert len(x.size()) == 2 - x = torch.nn.functional.pad(x.unsqueeze(1), (1, 0), 'reflect') + x = torch.nn.functional.pad(x.unsqueeze(1), (1, 0), "reflect") return torch.nn.functional.conv1d(x, self.filter).squeeze(1) + class SELayer(nn.Module): def __init__(self, channel, reduction=8): super(SELayer, self).__init__() @@ -110,8 +110,15 @@ class ResNetSpeakerEncoder(nn.Module): if self.use_torch_spec: self.torch_spec = torch.nn.Sequential( PreEmphasis(audio_config["preemphasis"]), - torchaudio.transforms.MelSpectrogram(sample_rate=audio_config["sample_rate"], n_fft=audio_config["fft_size"], win_length=audio_config["win_length"], hop_length=audio_config["hop_length"], window_fn=torch.hamming_window, n_mels=audio_config["num_mels"]) - ) + torchaudio.transforms.MelSpectrogram( + sample_rate=audio_config["sample_rate"], + n_fft=audio_config["fft_size"], + win_length=audio_config["win_length"], + hop_length=audio_config["hop_length"], + window_fn=torch.hamming_window, + n_mels=audio_config["num_mels"], + ), + ) else: self.torch_spec = None @@ -213,7 +220,7 @@ class ResNetSpeakerEncoder(nn.Module): """ # map to the waveform size if self.use_torch_spec: - num_frames = num_frames * self.audio_config['hop_length'] + num_frames = num_frames * self.audio_config["hop_length"] max_len = x.shape[1] diff --git a/TTS/speaker_encoder/utils/generic_utils.py b/TTS/speaker_encoder/utils/generic_utils.py index 3714e3c4..c926e215 100644 --- a/TTS/speaker_encoder/utils/generic_utils.py +++ b/TTS/speaker_encoder/utils/generic_utils.py @@ -179,10 +179,12 @@ def setup_model(c): c.model_params["num_lstm_layers"], ) elif c.model_params["model_name"].lower() == "resnet": - model = ResNetSpeakerEncoder(input_dim=c.model_params["input_dim"], proj_dim=c.model_params["proj_dim"], + model = ResNetSpeakerEncoder( + input_dim=c.model_params["input_dim"], + proj_dim=c.model_params["proj_dim"], log_input=c.model_params.get("log_input", False), use_torch_spec=c.model_params.get("use_torch_spec", False), - audio_config=c.audio + audio_config=c.audio, ) return model diff --git a/TTS/trainer.py b/TTS/trainer.py index 665f2589..c151e716 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -265,7 +265,9 @@ class Trainer: config = self.config.model_args if hasattr(self.config, "model_args") else self.config # save speakers json if config.use_language_embedding and self.model.language_manager.num_languages > 1: - self.model.language_manager.save_language_ids_to_file(os.path.join(self.output_path, "language_ids.json")) + self.model.language_manager.save_language_ids_to_file( + os.path.join(self.output_path, "language_ids.json") + ) if hasattr(self.config, "model_args"): self.config.model_args["num_languages"] = self.model.language_manager.num_languages else: diff --git 
a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index fc51c766..6d177743 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -542,6 +542,7 @@ class TTSDataset(Dataset): ) ) + class PitchExtractor: """Pitch Extractor for computing F0 from wav files. Args: @@ -645,4 +646,4 @@ class PitchExtractor: stats_path = os.path.join(cache_path, "pitch_stats.npy") stats = np.load(stats_path, allow_pickle=True).item() self.mean = stats["mean"].astype(np.float32) - self.std = stats["std"].astype(np.float32) \ No newline at end of file + self.std = stats["std"].astype(np.float32) diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index 651b3197..7e65f21a 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -304,7 +304,7 @@ def vctk(root_path, meta_files=None, wavs_path="wav48", ununsed_speakers=None): return items -def vctk_slim(root_path, meta_files=None, wavs_path="wav48", ununsed_speakers=None): # pylint: disable=unused-argument +def vctk_slim(root_path, meta_files=None, wavs_path="wav48", ununsed_speakers=None): # pylint: disable=unused-argument """homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz""" items = [] txt_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index cd2903b0..93a5bad2 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -602,7 +602,7 @@ class VitsGeneratorLoss(nn.Module): fine_tuning_mode=0, use_speaker_encoder_as_loss=False, gt_spk_emb=None, - syn_spk_emb=None + syn_spk_emb=None, ): """ Shapes: @@ -638,7 +638,9 @@ class VitsGeneratorLoss(nn.Module): loss = loss_kl + loss_feat + loss_mel + loss_gen + loss_duration if use_speaker_encoder_as_loss: - loss_se = - torch.nn.functional.cosine_similarity(gt_spk_emb, syn_spk_emb).mean() * self.spk_encoder_loss_alpha + loss_se = ( + -torch.nn.functional.cosine_similarity(gt_spk_emb, syn_spk_emb).mean() * self.spk_encoder_loss_alpha + ) loss += loss_se return_dict["loss_spk_encoder"] = loss_se diff --git a/TTS/tts/layers/vits/stochastic_duration_predictor.py b/TTS/tts/layers/vits/stochastic_duration_predictor.py index 8ec7c866..7c25156a 100644 --- a/TTS/tts/layers/vits/stochastic_duration_predictor.py +++ b/TTS/tts/layers/vits/stochastic_duration_predictor.py @@ -178,7 +178,14 @@ class StochasticDurationPredictor(nn.Module): """ def __init__( - self, in_channels: int, hidden_channels: int, kernel_size: int, dropout_p: float, num_flows=4, cond_channels=0, language_emb_dim=None + self, + in_channels: int, + hidden_channels: int, + kernel_size: int, + dropout_p: float, + num_flows=4, + cond_channels=0, + language_emb_dim=None, ): super().__init__() diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index df6c52f3..de00f6c7 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -246,7 +246,9 @@ class BaseTTS(BaseModel): # setup multi-speaker attributes if hasattr(self, "speaker_manager") and self.speaker_manager is not None: if hasattr(config, "model_args"): - speaker_id_mapping = self.speaker_manager.speaker_ids if config.model_args.use_speaker_embedding else None + speaker_id_mapping = ( + self.speaker_manager.speaker_ids if config.model_args.use_speaker_embedding else None + ) d_vector_mapping = self.speaker_manager.d_vectors if config.model_args.use_d_vector_file else None config.use_d_vector_file = config.model_args.use_d_vector_file else: @@ -262,7 +264,9 @@ class BaseTTS(BaseModel): 
custom_symbols = self.make_symbols(self.config) if hasattr(self, "language_manager"): - language_id_mapping = self.language_manager.language_id_mapping if self.args.use_language_embedding else None + language_id_mapping = ( + self.language_manager.language_id_mapping if self.args.use_language_embedding else None + ) else: language_id_mapping = None diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index bc503cb5..c185150b 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -229,7 +229,6 @@ class VitsArgs(Coqpit): freeze_waveform_decoder: bool = False - class Vits(BaseTTS): """VITS TTS model @@ -306,7 +305,7 @@ class Vits(BaseTTS): args.num_layers_text_encoder, args.kernel_size_text_encoder, args.dropout_p_text_encoder, - language_emb_dim=self.embedded_language_dim + language_emb_dim=self.embedded_language_dim, ) self.posterior_encoder = PosteriorEncoder( @@ -389,16 +388,26 @@ class Vits(BaseTTS): # TODO: make this a function if config.use_speaker_encoder_as_loss: if not config.speaker_encoder_model_path or not config.speaker_encoder_config_path: - raise RuntimeError(" [!] To use the speaker encoder loss you need to specify speaker_encoder_model_path and speaker_encoder_config_path !!") - self.speaker_manager.init_speaker_encoder(config.speaker_encoder_model_path, config.speaker_encoder_config_path) + raise RuntimeError( + " [!] To use the speaker encoder loss you need to specify speaker_encoder_model_path and speaker_encoder_config_path !!" + ) + self.speaker_manager.init_speaker_encoder( + config.speaker_encoder_model_path, config.speaker_encoder_config_path + ) self.speaker_encoder = self.speaker_manager.speaker_encoder.train() for param in self.speaker_encoder.parameters(): param.requires_grad = False print(" > External Speaker Encoder Loaded !!") - if hasattr(self.speaker_encoder, "audio_config") and self.audio_config["sample_rate"] != self.speaker_encoder.audio_config["sample_rate"]: - self.audio_transform = torchaudio.transforms.Resample(orig_freq=self.audio_config["sample_rate"], new_freq=self.speaker_encoder.audio_config["sample_rate"]) + if ( + hasattr(self.speaker_encoder, "audio_config") + and self.audio_config["sample_rate"] != self.speaker_encoder.audio_config["sample_rate"] + ): + self.audio_transform = torchaudio.transforms.Resample( + orig_freq=self.audio_config["sample_rate"], + new_freq=self.speaker_encoder.audio_config["sample_rate"], + ) else: self.audio_transform = None else: @@ -529,7 +538,13 @@ class Vits(BaseTTS): if hasattr(self, "language_manager") and config.use_language_embedding and language_name is not None: language_id = self.language_manager.language_id_mapping[language_name] - return {"text": text, "speaker_id": speaker_id, "style_wav": style_wav, "d_vector": d_vector, "language_id": language_id} + return { + "text": text, + "speaker_id": speaker_id, + "style_wav": style_wav, + "d_vector": d_vector, + "language_id": language_id, + } def forward( self, @@ -567,7 +582,7 @@ class Vits(BaseTTS): g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] # language embedding - lang_emb=None + lang_emb = None if self.args.use_language_embedding and lid is not None: lang_emb = self.emb_l(lid).unsqueeze(-1) @@ -621,9 +636,9 @@ class Vits(BaseTTS): o = self.waveform_decoder(z_slice, g=g) wav_seg = segment( - waveform.transpose(1, 2), - slice_ids * self.config.audio.hop_length, - self.args.spec_segment_size * self.config.audio.hop_length, + waveform.transpose(1, 2), + slice_ids * self.config.audio.hop_length, + self.args.spec_segment_size * 
self.config.audio.hop_length, ) if self.args.use_speaker_encoder_as_loss and self.speaker_encoder is not None: @@ -653,7 +668,7 @@ class Vits(BaseTTS): "logs_q": logs_q, "waveform_seg": wav_seg, "gt_spk_emb": gt_spk_emb, - "syn_spk_emb": syn_spk_emb + "syn_spk_emb": syn_spk_emb, } ) return outputs @@ -695,7 +710,7 @@ class Vits(BaseTTS): g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] # language embedding - lang_emb=None + lang_emb = None if self.args.use_language_embedding and lid is not None: lang_emb = self.emb_l(lid).unsqueeze(-1) @@ -737,9 +752,9 @@ class Vits(BaseTTS): o = self.waveform_decoder(z_slice, g=g) wav_seg = segment( - waveform.transpose(1, 2), - slice_ids * self.config.audio.hop_length, - self.args.spec_segment_size * self.config.audio.hop_length, + waveform.transpose(1, 2), + slice_ids * self.config.audio.hop_length, + self.args.spec_segment_size * self.config.audio.hop_length, ) if self.args.use_speaker_encoder_as_loss and self.speaker_encoder is not None: @@ -770,7 +785,7 @@ class Vits(BaseTTS): "logs_q": logs_q, "waveform_seg": wav_seg, "gt_spk_emb": gt_spk_emb, - "syn_spk_emb": syn_spk_emb + "syn_spk_emb": syn_spk_emb, } ) return outputs @@ -790,14 +805,16 @@ class Vits(BaseTTS): g = self.emb_g(sid).unsqueeze(-1) # language embedding - lang_emb=None + lang_emb = None if self.args.use_language_embedding and lid is not None: lang_emb = self.emb_l(lid).unsqueeze(-1) x, m_p, logs_p, x_mask = self.text_encoder(x, x_lengths, lang_emb=lang_emb) if self.args.use_sdp: - logw = self.duration_predictor(x, x_mask, g=g, reverse=True, noise_scale=self.inference_noise_scale_dp, lang_emb=lang_emb) + logw = self.duration_predictor( + x, x_mask, g=g, reverse=True, noise_scale=self.inference_noise_scale_dp, lang_emb=lang_emb + ) else: logw = self.duration_predictor(x, x_mask, g=g, lang_emb=lang_emb) @@ -866,7 +883,7 @@ class Vits(BaseTTS): for param in self.text_encoder.parameters(): param.requires_grad = False - if hasattr(self, 'emb_l'): + if hasattr(self, "emb_l"): for param in self.emb_l.parameters(): param.requires_grad = False @@ -932,7 +949,7 @@ class Vits(BaseTTS): with autocast(enabled=False): # use float32 for the criterion loss_dict = criterion[optimizer_idx]( waveform_hat=outputs["model_outputs"].float(), - waveform= outputs["waveform_seg"].float(), + waveform=outputs["waveform_seg"].float(), z_p=outputs["z_p"].float(), logs_q=outputs["logs_q"].float(), m_p=outputs["m_p"].float(), @@ -945,7 +962,7 @@ class Vits(BaseTTS): fine_tuning_mode=self.args.fine_tuning_mode, use_speaker_encoder_as_loss=self.args.use_speaker_encoder_as_loss, gt_spk_emb=outputs["gt_spk_emb"], - syn_spk_emb=outputs["syn_spk_emb"] + syn_spk_emb=outputs["syn_spk_emb"], ) # ignore duration loss if fine tuning mode is on if not self.args.fine_tuning_mode: diff --git a/TTS/tts/utils/languages.py b/TTS/tts/utils/languages.py index 94be914c..5bacc259 100644 --- a/TTS/tts/utils/languages.py +++ b/TTS/tts/utils/languages.py @@ -1,13 +1,14 @@ -import os import json -import torch +import os +from typing import Dict, List, Tuple + import fsspec import numpy as np -from typing import Dict, Tuple, List +import torch from coqpit import Coqpit - from torch.utils.data.sampler import WeightedRandomSampler + class LanguageManager: """Manage the languages for multi-lingual 🐸TTS models. Load a datafile and parse the information in a way that can be queried by language. 
@@ -20,7 +21,9 @@ class LanguageManager: >>> manager = LanguageManager(language_id_file_path=language_id_file_path) >>> language_id_mapper = manager.language_ids """ + language_id_mapping: Dict = {} + def __init__( self, language_id_file_path: str = "", @@ -85,6 +88,7 @@ class LanguageManager: """ self._save_json(file_path, self.language_id_mapping) + def _set_file_path(path): """Find the language_ids.json under the given path or the above it. Intended to band aid the different paths returned in restored and continued training.""" @@ -97,6 +101,7 @@ def _set_file_path(path): return path_continue return None + def get_language_manager(c: Coqpit, data: List = None, restore_path: str = None) -> LanguageManager: """Initiate a `LanguageManager` instance by the provided config. @@ -118,7 +123,7 @@ def get_language_manager(c: Coqpit, data: List = None, restore_path: str = None) # restoring language manager from a previous run. if language_file: language_manager.set_language_ids_from_file(language_file) - if language_manager.num_languages > 0: + if language_manager.num_languages > 0: print( " > Language manager is loaded with {} languages: {}".format( language_manager.num_languages, ", ".join(language_manager.language_names) @@ -126,11 +131,12 @@ def get_language_manager(c: Coqpit, data: List = None, restore_path: str = None) ) return language_manager + def get_language_weighted_sampler(items: list): language_names = np.array([item[3] for item in items]) unique_language_names = np.unique(language_names).tolist() language_ids = [unique_language_names.index(l) for l in language_names] language_count = np.array([len(np.where(language_names == l)[0]) for l in unique_language_names]) - weight_language = 1. / language_count + weight_language = 1.0 / language_count dataset_samples_weight = torch.from_numpy(np.array([weight_language[l] for l in language_ids])).double() return WeightedRandomSampler(dataset_samples_weight, len(dataset_samples_weight)) diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 8ccbdafc..d6381a70 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -432,11 +432,12 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, speaker_manager.save_speaker_ids_to_file(out_file_path) return speaker_manager + def get_speaker_weighted_sampler(items: list): speaker_names = np.array([item[2] for item in items]) unique_speaker_names = np.unique(speaker_names).tolist() speaker_ids = [unique_speaker_names.index(l) for l in speaker_names] speaker_count = np.array([len(np.where(speaker_names == l)[0]) for l in unique_speaker_names]) - weight_speaker = 1. 
/ speaker_count + weight_speaker = 1.0 / speaker_count dataset_samples_weight = torch.from_numpy(np.array([weight_speaker[l] for l in speaker_ids])).double() - return WeightedRandomSampler(dataset_samples_weight, len(dataset_samples_weight)) \ No newline at end of file + return WeightedRandomSampler(dataset_samples_weight, len(dataset_samples_weight)) diff --git a/TTS/tts/utils/text/cleaners.py b/TTS/tts/utils/text/cleaners.py index 826919c2..f3ffa478 100644 --- a/TTS/tts/utils/text/cleaners.py +++ b/TTS/tts/utils/text/cleaners.py @@ -136,8 +136,9 @@ def phoneme_cleaners(text): text = collapse_whitespace(text) return text + def multilingual_cleaners(text): - '''Pipeline for multilingual text''' + """Pipeline for multilingual text""" text = lowercase(text) text = replace_symbols(text, lang=None) text = remove_aux_symbols(text) diff --git a/tests/tts_tests/test_vits_multilingual_train.py b/tests/tts_tests/test_vits_multilingual_train.py index 664de57e..04b42e61 100644 --- a/tests/tts_tests/test_vits_multilingual_train.py +++ b/tests/tts_tests/test_vits_multilingual_train.py @@ -3,19 +3,27 @@ import os import shutil from tests import get_device_id, get_tests_output_path, run_cli -from TTS.tts.configs.vits_config import VitsConfig from TTS.config.shared_configs import BaseDatasetConfig +from TTS.tts.configs.vits_config import VitsConfig config_path = os.path.join(get_tests_output_path(), "test_model_config.json") output_path = os.path.join(get_tests_output_path(), "train_outputs") dataset_config1 = BaseDatasetConfig( - name="ljspeech", meta_file_train="metadata.csv", meta_file_val="metadata.csv", path="tests/data/ljspeech", language="en" + name="ljspeech", + meta_file_train="metadata.csv", + meta_file_val="metadata.csv", + path="tests/data/ljspeech", + language="en", ) dataset_config2 = BaseDatasetConfig( - name="ljspeech", meta_file_train="metadata.csv", meta_file_val="metadata.csv", path="tests/data/ljspeech", language="en2" + name="ljspeech", + meta_file_train="metadata.csv", + meta_file_val="metadata.csv", + path="tests/data/ljspeech", + language="en2", ) config = VitsConfig( From e995a63bd6f44d7fc1222aeaf15d227e134deac6 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Tue, 2 Nov 2021 19:10:18 +0100 Subject: [PATCH 145/220] fix linter --- TTS/bin/find_unique_phonemes.py | 1 - TTS/bin/remove_silence_using_vad.py | 27 ++++++++++----------------- TTS/tts/datasets/formatters.py | 2 +- TTS/tts/models/vits.py | 3 +-- TTS/tts/utils/speakers.py | 1 + notebooks/dataset_analysis/analyze.py | 2 +- 6 files changed, 14 insertions(+), 22 deletions(-) diff --git a/TTS/bin/find_unique_phonemes.py b/TTS/bin/find_unique_phonemes.py index bbc88fb6..ffad6891 100644 --- a/TTS/bin/find_unique_phonemes.py +++ b/TTS/bin/find_unique_phonemes.py @@ -3,7 +3,6 @@ import argparse import multiprocessing from argparse import RawTextHelpFormatter -import numpy from tqdm.contrib.concurrent import process_map from TTS.config import load_config diff --git a/TTS/bin/remove_silence_using_vad.py b/TTS/bin/remove_silence_using_vad.py index 25ae26ef..8951662b 100755 --- a/TTS/bin/remove_silence_using_vad.py +++ b/TTS/bin/remove_silence_using_vad.py @@ -6,12 +6,7 @@ import glob import multiprocessing import os import pathlib -import sys import wave -from itertools import chain - -import numpy as np -import tqdm import webrtcvad from tqdm.contrib.concurrent import process_map @@ -47,8 +42,8 @@ def write_wave(path, audio, sample_rate): class Frame(object): """Represents a "frame" of audio data.""" - def __init__(self, bytes, 
timestamp, duration): - self.bytes = bytes + def __init__(self, _bytes, timestamp, duration): + self.bytes =_bytes self.timestamp = timestamp self.duration = duration @@ -121,7 +116,7 @@ def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, fram # We want to yield all the audio we see from now until # we are NOTTRIGGERED, but we have to start with the # audio that's already in the ring buffer. - for f, s in ring_buffer: + for f, _ in ring_buffer: voiced_frames.append(f) ring_buffer.clear() else: @@ -146,11 +141,10 @@ def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, fram def remove_silence(filepath): - filename = os.path.basename(filepath) output_path = filepath.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, "")) # ignore if the file exists if os.path.exists(output_path) and not args.force: - return False + return # create all directory structure pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True) padding_duration_ms = 300 # default 300 @@ -166,7 +160,7 @@ def remove_silence(filepath): if num_segments != 0: for i, segment in reversed(list(enumerate(segments))): if i >= 1: - if flag == False: + if not flag: concat_segment = segment flag = True else: @@ -176,11 +170,12 @@ def remove_silence(filepath): segment = segment + concat_segment write_wave(output_path, segment, sample_rate) print(output_path) - return True + return else: print("> Just Copying the file to:", output_path) # if fail to remove silence just write the file write_wave(output_path, audio, sample_rate) + return def preprocess_audios(): @@ -198,11 +193,9 @@ def preprocess_audios(): if __name__ == "__main__": - """ - usage - python remove_silence.py -i=VCTK-Corpus-bk/ -o=../VCTK-Corpus-removed-silence -g=wav48/*/*.wav -a=2 - """ - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser( + description="python remove_silence.py -i=VCTK-Corpus-bk/ -o=../VCTK-Corpus-removed-silence -g=wav48/*/*.wav -a=2" + ) parser.add_argument("-i", "--input_dir", type=str, default="../VCTK-Corpus", help="Dataset root dir") parser.add_argument( "-o", "--output_dir", type=str, default="../VCTK-Corpus-removed-silence", help="Output Dataset dir" diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index 7e65f21a..49a1ced4 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -59,7 +59,7 @@ def mozilla_de(root_path, meta_file, **kwargs): # pylint: disable=unused-argume return items -def mailabs(root_path, meta_files=None): +def mailabs(root_path, meta_files=None, ununsed_speakers=None): """Normalizes M-AI-Labs meta data files to TTS format Args: diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index c185150b..94d5bfc9 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -1,5 +1,4 @@ import math -import random from dataclasses import dataclass, field from itertools import chain from typing import Dict, List, Tuple @@ -747,7 +746,7 @@ class Vits(BaseTTS): # inverse decoder and get the output z_f_pred = self.flow(z_ft, y_mask, g=g, reverse=True) - z_slice, slice_ids = rand_segment(z_f_pred, y_lengths, self.spec_segment_size) + z_slice, slice_ids = rand_segments(z_f_pred, y_lengths, self.spec_segment_size) o = self.waveform_decoder(z_slice, g=g) diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index d6381a70..8c248658 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -7,6 +7,7 @@ import fsspec import numpy as np import torch from 
coqpit import Coqpit +from torch.utils.data.sampler import WeightedRandomSampler from TTS.config import load_config from TTS.speaker_encoder.utils.generic_utils import setup_model diff --git a/notebooks/dataset_analysis/analyze.py b/notebooks/dataset_analysis/analyze.py index 9ba42fb9..4855886e 100644 --- a/notebooks/dataset_analysis/analyze.py +++ b/notebooks/dataset_analysis/analyze.py @@ -180,7 +180,7 @@ def plot_phonemes(train_path, cmu_dict_path, save_path): plt.figure() plt.rcParams["figure.figsize"] = (50, 20) - barplot = sns.barplot(x, y) + barplot = sns.barplot(x=x, y=y) if save_path: fig = barplot.get_figure() fig.savefig(os.path.join(save_path, "phoneme_dist")) From 13409381596f97ad83feb58913f51e2e765ace9c Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Thu, 4 Nov 2021 16:36:11 +0100 Subject: [PATCH 146/220] fix phonemes per language --- TTS/tts/datasets/dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index 6d177743..513f2b12 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -273,6 +273,7 @@ class TTSDataset(Dataset): item = args[0] func_args = args[1] text, wav_file, *_ = item + func_args[3] = item[4] phonemes = TTSDataset._load_or_generate_phoneme_sequence(wav_file, text, *func_args) return phonemes From 846bf16f028311ee181daef8e49b2e8b284f4dfb Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Thu, 4 Nov 2021 16:36:40 +0100 Subject: [PATCH 147/220] fix imports for load_meta_data --- TTS/bin/find_unique_phonemes.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/TTS/bin/find_unique_phonemes.py b/TTS/bin/find_unique_phonemes.py index ffad6891..ff7eac46 100644 --- a/TTS/bin/find_unique_phonemes.py +++ b/TTS/bin/find_unique_phonemes.py @@ -6,7 +6,7 @@ from argparse import RawTextHelpFormatter from tqdm.contrib.concurrent import process_map from TTS.config import load_config -from TTS.tts.datasets import load_meta_data +from TTS.tts.datasets import load_tts_samples from TTS.tts.utils.text import text2phone @@ -38,10 +38,9 @@ def main(): c = load_config(args.config_path) # load all datasets - train_items, eval_items = load_meta_data(c.datasets, eval_split=True) + train_items, eval_items = load_tts_samples(c.datasets, eval_split=True) items = train_items + eval_items print("Num items:", len(items)) - # items = items[:1000] phonemes = process_map(compute_phonemes, items, max_workers=multiprocessing.cpu_count(), chunksize=15) phones = [] From 120332d53f8aba09f988adc7322a5c949cefa452 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Sat, 6 Nov 2021 00:27:58 +0100 Subject: [PATCH 148/220] Fix phonemes --- TTS/bin/find_unique_phonemes.py | 2 +- TTS/tts/datasets/dataset.py | 2 +- TTS/tts/models/vits.py | 2 ++ TTS/tts/utils/synthesis.py | 7 ++++--- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/TTS/bin/find_unique_phonemes.py b/TTS/bin/find_unique_phonemes.py index ff7eac46..a869df27 100644 --- a/TTS/bin/find_unique_phonemes.py +++ b/TTS/bin/find_unique_phonemes.py @@ -7,7 +7,7 @@ from tqdm.contrib.concurrent import process_map from TTS.config import load_config from TTS.tts.datasets import load_tts_samples -from TTS.tts.utils.text import text2phone +from TTS.tts.utils.text import text2phone, phoneme_to_sequence def compute_phonemes(item): diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index 513f2b12..38db31c3 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -273,7 +273,7 @@ class TTSDataset(Dataset): 
item = args[0] func_args = args[1] text, wav_file, *_ = item - func_args[3] = item[4] + func_args[3] = item[3] phonemes = TTSDataset._load_or_generate_phoneme_sequence(wav_file, text, *func_args) return phonemes diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 94d5bfc9..09537905 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -543,6 +543,7 @@ class Vits(BaseTTS): "style_wav": style_wav, "d_vector": d_vector, "language_id": language_id, + "language_name": language_name, } def forward( @@ -1061,6 +1062,7 @@ class Vits(BaseTTS): d_vector=aux_inputs["d_vector"], style_wav=aux_inputs["style_wav"], language_id=aux_inputs["language_id"], + language_name=aux_inputs["language_name"], enable_eos_bos_chars=self.config.enable_eos_bos_chars, use_griffin_lim=True, do_trim_silence=False, diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 6d998492..102914c5 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -15,7 +15,7 @@ if "tensorflow" in installed or "tensorflow-gpu" in installed: import tensorflow as tf -def text_to_seq(text, CONFIG, custom_symbols=None): +def text_to_seq(text, CONFIG, custom_symbols=None, language=None): text_cleaner = [CONFIG.text_cleaner] # text ot phonemes to sequence vector if CONFIG.use_phonemes: @@ -23,7 +23,7 @@ def text_to_seq(text, CONFIG, custom_symbols=None): phoneme_to_sequence( text, text_cleaner, - CONFIG.phoneme_language, + language if language else CONFIG.phoneme_language, CONFIG.enable_eos_bos_chars, tp=CONFIG.characters, add_blank=CONFIG.add_blank, @@ -212,6 +212,7 @@ def synthesis( do_trim_silence=False, d_vector=None, language_id=None, + language_name=None, backend="torch", ): """Synthesize voice for the given text using Griffin-Lim vocoder or just compute output features to be passed to @@ -262,7 +263,7 @@ def synthesis( if hasattr(model, "make_symbols"): custom_symbols = model.make_symbols(CONFIG) # preprocess the given text - text_inputs = text_to_seq(text, CONFIG, custom_symbols=custom_symbols) + text_inputs = text_to_seq(text, CONFIG, custom_symbols=custom_symbols, language=language_name) # pass tensors to backend if backend == "torch": if speaker_id is not None: From 23d789c0722afe88f0abf3b679ee9199d877eb7a Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Tue, 9 Nov 2021 12:20:11 +0100 Subject: [PATCH 149/220] Fix continue path --- TTS/bin/train_tts.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index e28e9dec..2d7bd68f 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -9,6 +9,7 @@ from TTS.utils.audio import AudioProcessor def main(): + #os.environ["CUDA_VISIBLE_DEVICES"]="" """Run `tts` model training directly by a `config.json` file.""" # init trainer args train_args = TrainingArgs() @@ -64,7 +65,7 @@ def main(): train_samples=train_samples, eval_samples=eval_samples, training_assets={"audio_processor": ap}, - parse_command_line_args=False, + parse_command_line_args=True, ) trainer.fit() From e8af6a9f08c8f17277516dc8abc46bce60d46111 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Tue, 9 Nov 2021 12:20:43 +0100 Subject: [PATCH 150/220] Fix use_speaker_embedding logic --- TTS/tts/models/base_tts.py | 2 +- TTS/tts/models/vits.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index de00f6c7..707fc9c3 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -102,7 +102,7 @@ class 
BaseTTS(BaseModel): config.d_vector_dim if "d_vector_dim" in config and config.d_vector_dim is not None else 512 ) # init speaker embedding layer - if config.use_speaker_embedding and not config.use_d_vector_file: + if config.use_speaker_embedding: print(" > Init speaker_embedding layer.") self.speaker_embedding = nn.Embedding(self.num_speakers, self.embedded_speaker_dim) self.speaker_embedding.weight.data.normal_(0, 0.3) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 09537905..4d47cde1 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -578,7 +578,7 @@ class Vits(BaseTTS): outputs = {} sid, g, lid = self._set_cond_input(aux_input) # speaker embedding - if self.args.use_speaker_embedding and sid is not None and not self.use_d_vector: + if self.args.use_speaker_embedding and sid is not None: g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] # language embedding @@ -801,7 +801,7 @@ class Vits(BaseTTS): x_lengths = torch.tensor(x.shape[1:2]).to(x.device) # speaker embedding - if self.args.use_speaker_embedding and sid is not None and not self.use_d_vector: + if self.args.use_speaker_embedding and sid is not None: g = self.emb_g(sid).unsqueeze(-1) # language embedding From da6c1e858c013d3ce917228ca1a71ade2be6640f Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Mon, 15 Nov 2021 22:09:59 +0100 Subject: [PATCH 151/220] Fix small issues --- TTS/bin/train_tts.py | 1 - TTS/tts/configs/vits_config.py | 2 +- TTS/tts/models/vits.py | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index 2d7bd68f..1a9faf02 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -9,7 +9,6 @@ from TTS.utils.audio import AudioProcessor def main(): - #os.environ["CUDA_VISIBLE_DEVICES"]="" """Run `tts` model training directly by a `config.json` file.""" # init trainer args train_args = TrainingArgs() diff --git a/TTS/tts/configs/vits_config.py b/TTS/tts/configs/vits_config.py index a6f2210d..eeb74bbe 100644 --- a/TTS/tts/configs/vits_config.py +++ b/TTS/tts/configs/vits_config.py @@ -150,7 +150,7 @@ class VitsConfig(BaseTTSConfig): # use d-vectors use_d_vector_file: bool = False - d_vector_file: str = False + d_vector_file: str = None d_vector_dim: int = None def __post_init__(self): diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 4d47cde1..9f895fc1 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -211,7 +211,6 @@ class VitsArgs(Coqpit): d_vector_file: str = None speaker_embedding_channels: int = 256 use_d_vector_file: bool = False - d_vector_file: str = None d_vector_dim: int = 0 detach_dp_input: bool = True use_language_embedding: bool = False From 631addf33bbf7f07ea26808c439f199c90b82192 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Thu, 18 Nov 2021 00:17:42 +0100 Subject: [PATCH 152/220] fix d-vector --- TTS/tts/datasets/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index 38db31c3..000393ea 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -415,7 +415,7 @@ class TTSDataset(Dataset): language_ids = None # get pre-computed d-vectors if self.d_vector_mapping is not None: - wav_files_names = [batch["wav_file_name"][idx] for idx in ids_sorted_decreasing] + wav_files_names = list(batch["wav_file_name"]) d_vectors = [self.d_vector_mapping[w]["embedding"] for w in wav_files_names] else: d_vectors = None From 352aa69ecab16fb76815ff3f0c9c349ff1a83330 Mon Sep 17 00:00:00 2001 
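The patch below (153/220) factors the WebRTC VAD example code out of TTS/bin/remove_silence_using_vad.py into a reusable TTS/utils/vad.py module. As a rough usage sketch of the refactored helpers — the function names (read_wave, get_vad_speech_segments, write_wave) come from the new module in this patch, while the file paths and the mono 16-bit PCM input are illustrative assumptions:

    # Rough sketch: trim silence from one wav using the helpers added in TTS/utils/vad.py.
    # Assumes a mono, 16-bit PCM wav at 8/16/32/48 kHz, which read_wave() asserts.
    from TTS.utils.vad import read_wave, write_wave, get_vad_speech_segments

    audio, sample_rate = read_wave("input.wav")  # raw PCM bytes + sample rate
    segments = list(get_vad_speech_segments(audio, sample_rate, aggressiveness=2))
    if segments:
        # each segment is a bytes chunk of voiced audio; keep them in order
        write_wave("output.wav", b"".join(segments), sample_rate)
    else:
        # no speech detected; copy the original audio unchanged
        write_wave("output.wav", audio, sample_rate)

The script updated in the patch stitches the segments together in a slightly different (reverse-ordered) way, so treat this only as an illustration of the module's API, not as the exact behavior of remove_silence().
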
From: Edresson Date: Sun, 21 Nov 2021 12:20:35 -0300 Subject: [PATCH 153/220] Create a module for the VAD script --- TTS/bin/remove_silence_using_vad.py | 164 +++------------------------- TTS/utils/vad.py | 142 ++++++++++++++++++++++++ 2 files changed, 160 insertions(+), 146 deletions(-) create mode 100644 TTS/utils/vad.py diff --git a/TTS/bin/remove_silence_using_vad.py b/TTS/bin/remove_silence_using_vad.py index 8951662b..a32f0f45 100755 --- a/TTS/bin/remove_silence_using_vad.py +++ b/TTS/bin/remove_silence_using_vad.py @@ -1,162 +1,31 @@ -# This code is adpated from: https://github.com/wiseman/py-webrtcvad/blob/master/example.py -import argparse -import collections -import contextlib -import glob -import multiprocessing import os +import glob import pathlib -import wave -import webrtcvad +import argparse +import multiprocessing + from tqdm.contrib.concurrent import process_map - -def read_wave(path): - """Reads a .wav file. - - Takes the path, and returns (PCM audio data, sample rate). - """ - with contextlib.closing(wave.open(path, "rb")) as wf: - num_channels = wf.getnchannels() - assert num_channels == 1 - sample_width = wf.getsampwidth() - assert sample_width == 2 - sample_rate = wf.getframerate() - assert sample_rate in (8000, 16000, 32000, 48000) - pcm_data = wf.readframes(wf.getnframes()) - return pcm_data, sample_rate - - -def write_wave(path, audio, sample_rate): - """Writes a .wav file. - - Takes path, PCM audio data, and sample rate. - """ - with contextlib.closing(wave.open(path, "wb")) as wf: - wf.setnchannels(1) - wf.setsampwidth(2) - wf.setframerate(sample_rate) - wf.writeframes(audio) - - -class Frame(object): - """Represents a "frame" of audio data.""" - - def __init__(self, _bytes, timestamp, duration): - self.bytes =_bytes - self.timestamp = timestamp - self.duration = duration - - -def frame_generator(frame_duration_ms, audio, sample_rate): - """Generates audio frames from PCM audio data. - - Takes the desired frame duration in milliseconds, the PCM data, and - the sample rate. - - Yields Frames of the requested duration. - """ - n = int(sample_rate * (frame_duration_ms / 1000.0) * 2) - offset = 0 - timestamp = 0.0 - duration = (float(n) / sample_rate) / 2.0 - while offset + n < len(audio): - yield Frame(audio[offset : offset + n], timestamp, duration) - timestamp += duration - offset += n - - -def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, frames): - """Filters out non-voiced audio frames. - - Given a webrtcvad.Vad and a source of audio frames, yields only - the voiced audio. - - Uses a padded, sliding window algorithm over the audio frames. - When more than 90% of the frames in the window are voiced (as - reported by the VAD), the collector triggers and begins yielding - audio frames. Then the collector waits until 90% of the frames in - the window are unvoiced to detrigger. - - The window is padded at the front and back to provide a small - amount of silence or the beginnings/endings of speech around the - voiced frames. - - Arguments: - - sample_rate - The audio sample rate, in Hz. - frame_duration_ms - The frame duration in milliseconds. - padding_duration_ms - The amount to pad the window, in milliseconds. - vad - An instance of webrtcvad.Vad. - frames - a source of audio frames (sequence or generator). - - Returns: A generator that yields PCM audio data. - """ - num_padding_frames = int(padding_duration_ms / frame_duration_ms) - # We use a deque for our sliding window/ring buffer. 
- ring_buffer = collections.deque(maxlen=num_padding_frames) - # We have two states: TRIGGERED and NOTTRIGGERED. We start in the - # NOTTRIGGERED state. - triggered = False - - voiced_frames = [] - for frame in frames: - is_speech = vad.is_speech(frame.bytes, sample_rate) - - # sys.stdout.write('1' if is_speech else '0') - if not triggered: - ring_buffer.append((frame, is_speech)) - num_voiced = len([f for f, speech in ring_buffer if speech]) - # If we're NOTTRIGGERED and more than 90% of the frames in - # the ring buffer are voiced frames, then enter the - # TRIGGERED state. - if num_voiced > 0.9 * ring_buffer.maxlen: - triggered = True - # sys.stdout.write('+(%s)' % (ring_buffer[0][0].timestamp,)) - # We want to yield all the audio we see from now until - # we are NOTTRIGGERED, but we have to start with the - # audio that's already in the ring buffer. - for f, _ in ring_buffer: - voiced_frames.append(f) - ring_buffer.clear() - else: - # We're in the TRIGGERED state, so collect the audio data - # and add it to the ring buffer. - voiced_frames.append(frame) - ring_buffer.append((frame, is_speech)) - num_unvoiced = len([f for f, speech in ring_buffer if not speech]) - # If more than 90% of the frames in the ring buffer are - # unvoiced, then enter NOTTRIGGERED and yield whatever - # audio we've collected. - if num_unvoiced > 0.9 * ring_buffer.maxlen: - # sys.stdout.write('-(%s)' % (frame.timestamp + frame.duration)) - triggered = False - yield b"".join([f.bytes for f in voiced_frames]) - ring_buffer.clear() - voiced_frames = [] - # If we have any leftover voiced audio when we run out of input, - # yield it. - if voiced_frames: - yield b"".join([f.bytes for f in voiced_frames]) - +from TTS.utils.vad import read_wave, write_wave, get_vad_speech_segments def remove_silence(filepath): output_path = filepath.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, "")) # ignore if the file exists if os.path.exists(output_path) and not args.force: return + # create all directory structure pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True) - padding_duration_ms = 300 # default 300 + # load wave audio, sample_rate = read_wave(filepath) - vad = webrtcvad.Vad(int(args.aggressiveness)) - frames = frame_generator(30, audio, sample_rate) - frames = list(frames) - segments = vad_collector(sample_rate, 30, padding_duration_ms, vad, frames) - flag = False + + # get speech segments + segments = get_vad_speech_segments(audio, sample_rate, aggressiveness=args.aggressiveness) + segments = list(segments) num_segments = len(segments) - + flag = False + # create the output wave if num_segments != 0: for i, segment in reversed(list(enumerate(segments))): if i >= 1: @@ -168,8 +37,8 @@ def remove_silence(filepath): else: if flag: segment = segment + concat_segment + # print("Saving: ", output_path) write_wave(output_path, segment, sample_rate) - print(output_path) return else: print("> Just Copying the file to:", output_path) @@ -200,7 +69,10 @@ if __name__ == "__main__": parser.add_argument( "-o", "--output_dir", type=str, default="../VCTK-Corpus-removed-silence", help="Output Dataset dir" ) - parser.add_argument("-f", "--force", type=bool, default=True, help="Force the replace of exists files") + parser.add_argument("-f", "--force", + default=False, + action='store_true', + help='Force the replace of exists files') parser.add_argument( "-g", "--glob", diff --git a/TTS/utils/vad.py b/TTS/utils/vad.py new file mode 100644 index 00000000..4e61f490 --- /dev/null +++ b/TTS/utils/vad.py 
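The new module below keeps the frame slicing and the padded sliding-window collector from the original script. One detail worth noting: frames are sliced out of raw 16-bit PCM, so the byte count per frame is the number of samples times 2. A quick check of the arithmetic used by frame_generator() for the 30 ms frames that get_vad_speech_segments() requests, assuming 16 kHz mono input for illustration:

    # Frame-size arithmetic from frame_generator(), assuming 16 kHz, 16-bit mono PCM.
    sample_rate = 16000
    frame_duration_ms = 30
    n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)  # bytes per frame
    assert n == 960                                          # 480 samples * 2 bytes each
    duration = (float(n) / sample_rate) / 2.0                # seconds per frame
    assert abs(duration - 0.030) < 1e-9                      # 30 ms, as expected

The collector that follows then applies the padded sliding-window rule described in its docstring: it starts yielding audio once more than 90% of the frames in the padding window are voiced, and stops once more than 90% are unvoiced.
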
@@ -0,0 +1,142 @@ +# This code is adpated from: https://github.com/wiseman/py-webrtcvad/blob/master/example.py +import wave +import webrtcvad +import contextlib +import collections + + +def read_wave(path): + """Reads a .wav file. + + Takes the path, and returns (PCM audio data, sample rate). + """ + with contextlib.closing(wave.open(path, "rb")) as wf: + num_channels = wf.getnchannels() + assert num_channels == 1 + sample_width = wf.getsampwidth() + assert sample_width == 2 + sample_rate = wf.getframerate() + assert sample_rate in (8000, 16000, 32000, 48000) + pcm_data = wf.readframes(wf.getnframes()) + return pcm_data, sample_rate + + +def write_wave(path, audio, sample_rate): + """Writes a .wav file. + + Takes path, PCM audio data, and sample rate. + """ + with contextlib.closing(wave.open(path, "wb")) as wf: + wf.setnchannels(1) + wf.setsampwidth(2) + wf.setframerate(sample_rate) + wf.writeframes(audio) + + +class Frame(object): + """Represents a "frame" of audio data.""" + + def __init__(self, _bytes, timestamp, duration): + self.bytes =_bytes + self.timestamp = timestamp + self.duration = duration + + +def frame_generator(frame_duration_ms, audio, sample_rate): + """Generates audio frames from PCM audio data. + + Takes the desired frame duration in milliseconds, the PCM data, and + the sample rate. + + Yields Frames of the requested duration. + """ + n = int(sample_rate * (frame_duration_ms / 1000.0) * 2) + offset = 0 + timestamp = 0.0 + duration = (float(n) / sample_rate) / 2.0 + while offset + n < len(audio): + yield Frame(audio[offset : offset + n], timestamp, duration) + timestamp += duration + offset += n + + +def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, frames): + """Filters out non-voiced audio frames. + + Given a webrtcvad.Vad and a source of audio frames, yields only + the voiced audio. + + Uses a padded, sliding window algorithm over the audio frames. + When more than 90% of the frames in the window are voiced (as + reported by the VAD), the collector triggers and begins yielding + audio frames. Then the collector waits until 90% of the frames in + the window are unvoiced to detrigger. + + The window is padded at the front and back to provide a small + amount of silence or the beginnings/endings of speech around the + voiced frames. + + Arguments: + + sample_rate - The audio sample rate, in Hz. + frame_duration_ms - The frame duration in milliseconds. + padding_duration_ms - The amount to pad the window, in milliseconds. + vad - An instance of webrtcvad.Vad. + frames - a source of audio frames (sequence or generator). + + Returns: A generator that yields PCM audio data. + """ + num_padding_frames = int(padding_duration_ms / frame_duration_ms) + # We use a deque for our sliding window/ring buffer. + ring_buffer = collections.deque(maxlen=num_padding_frames) + # We have two states: TRIGGERED and NOTTRIGGERED. We start in the + # NOTTRIGGERED state. + triggered = False + + voiced_frames = [] + for frame in frames: + is_speech = vad.is_speech(frame.bytes, sample_rate) + + # sys.stdout.write('1' if is_speech else '0') + if not triggered: + ring_buffer.append((frame, is_speech)) + num_voiced = len([f for f, speech in ring_buffer if speech]) + # If we're NOTTRIGGERED and more than 90% of the frames in + # the ring buffer are voiced frames, then enter the + # TRIGGERED state. 
+ if num_voiced > 0.9 * ring_buffer.maxlen: + triggered = True + # sys.stdout.write('+(%s)' % (ring_buffer[0][0].timestamp,)) + # We want to yield all the audio we see from now until + # we are NOTTRIGGERED, but we have to start with the + # audio that's already in the ring buffer. + for f, _ in ring_buffer: + voiced_frames.append(f) + ring_buffer.clear() + else: + # We're in the TRIGGERED state, so collect the audio data + # and add it to the ring buffer. + voiced_frames.append(frame) + ring_buffer.append((frame, is_speech)) + num_unvoiced = len([f for f, speech in ring_buffer if not speech]) + # If more than 90% of the frames in the ring buffer are + # unvoiced, then enter NOTTRIGGERED and yield whatever + # audio we've collected. + if num_unvoiced > 0.9 * ring_buffer.maxlen: + # sys.stdout.write('-(%s)' % (frame.timestamp + frame.duration)) + triggered = False + yield b"".join([f.bytes for f in voiced_frames]) + ring_buffer.clear() + voiced_frames = [] + # If we have any leftover voiced audio when we run out of input, + # yield it. + if voiced_frames: + yield b"".join([f.bytes for f in voiced_frames]) + +def get_vad_speech_segments(audio, sample_rate, aggressiveness=2, padding_duration_ms=300): + + vad = webrtcvad.Vad(int(aggressiveness)) + frames = list(frame_generator(30, audio, sample_rate)) + segments = vad_collector(sample_rate, 30, padding_duration_ms, vad, frames) + + return segments \ No newline at end of file From 6fc3b9e67943ce84832d2e4c0a31ea8b9aef7300 Mon Sep 17 00:00:00 2001 From: Edresson Date: Mon, 22 Nov 2021 08:19:36 -0300 Subject: [PATCH 154/220] Remove the unusable fine-tuning model --- TTS/tts/layers/losses.py | 10 +-- TTS/tts/models/vits.py | 159 ++------------------------------------- 2 files changed, 10 insertions(+), 159 deletions(-) diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index 93a5bad2..acf750a0 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -599,7 +599,6 @@ class VitsGeneratorLoss(nn.Module): feats_disc_fake, feats_disc_real, loss_duration, - fine_tuning_mode=0, use_speaker_encoder_as_loss=False, gt_spk_emb=None, syn_spk_emb=None, @@ -623,14 +622,9 @@ class VitsGeneratorLoss(nn.Module): # compute mel spectrograms from the waveforms mel = self.stft(waveform) mel_hat = self.stft(waveform_hat) + # compute losses - - # ignore tts model loss if fine tunning mode is on - if fine_tuning_mode: - loss_kl = 0.0 - else: - loss_kl = self.kl_loss(z_p, logs_q, m_p, logs_p, z_mask.unsqueeze(1)) * self.kl_loss_alpha - + loss_kl = self.kl_loss(z_p, logs_q, m_p, logs_p, z_mask.unsqueeze(1)) * self.kl_loss_alpha loss_feat = self.feature_loss(feats_disc_fake, feats_disc_real) * self.feat_loss_alpha loss_gen = self.generator_loss(scores_disc_fake)[0] * self.gen_loss_alpha loss_mel = torch.nn.functional.l1_loss(mel, mel_hat) * self.mel_loss_alpha diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 9f895fc1..0abf0ca3 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -167,11 +167,6 @@ class VitsArgs(Coqpit): speaker_encoder_model_path (str): Path to the file speaker encoder checkpoint file, to use for SCL. Defaults to "". - fine_tuning_mode (int): - Fine tuning only the vocoder part of the model, while the rest will be frozen. Defaults to 0. - Mode 0: Disabled; - Mode 1: uses the distribution predicted by the encoder and It's recommended for TTS; - Mode 2: uses the distribution predicted by the encoder and It's recommended for voice conversion. 
""" num_chars: int = 100 @@ -219,7 +214,6 @@ class VitsArgs(Coqpit): use_speaker_encoder_as_loss: bool = False speaker_encoder_config_path: str = "" speaker_encoder_model_path: str = "" - fine_tuning_mode: int = 0 freeze_encoder: bool = False freeze_DP: bool = False freeze_PE: bool = False @@ -672,122 +666,6 @@ class Vits(BaseTTS): ) return outputs - def forward_fine_tuning( - self, - x: torch.tensor, - x_lengths: torch.tensor, - y: torch.tensor, - y_lengths: torch.tensor, - aux_input={"d_vectors": None, "speaker_ids": None, "language_ids": None}, - waveform=None, - ) -> Dict: - """Forward pass of the model. - - Args: - x (torch.tensor): Batch of input character sequence IDs. - x_lengths (torch.tensor): Batch of input character sequence lengths. - y (torch.tensor): Batch of input spectrograms. - y_lengths (torch.tensor): Batch of input spectrogram lengths. - aux_input (dict, optional): Auxiliary inputs for multi-speaker training. Defaults to {"d_vectors": None, "speaker_ids": None}. - - Returns: - Dict: model outputs keyed by the output name. - - Shapes: - - x: :math:`[B, T_seq]` - - x_lengths: :math:`[B]` - - y: :math:`[B, C, T_spec]` - - y_lengths: :math:`[B]` - - d_vectors: :math:`[B, C, 1]` - - speaker_ids: :math:`[B]` - """ - with torch.no_grad(): - outputs = {} - sid, g, lid = self._set_cond_input(aux_input) - # speaker embedding - if self.args.use_speaker_embedding and sid is not None and not self.use_d_vector: - g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] - - # language embedding - lang_emb = None - if self.args.use_language_embedding and lid is not None: - lang_emb = self.emb_l(lid).unsqueeze(-1) - - x, m_p, logs_p, x_mask = self.text_encoder(x, x_lengths, lang_emb=lang_emb) - - # posterior encoder - z, m_q, logs_q, y_mask = self.posterior_encoder(y, y_lengths, g=g) - - # flow layers - z_p = self.flow(z, y_mask, g=g) - - # find the alignment path - attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2) - with torch.no_grad(): - o_scale = torch.exp(-2 * logs_p) - logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - logs_p, [1]).unsqueeze(-1) # [b, t, 1] - logp2 = torch.einsum("klm, kln -> kmn", [o_scale, -0.5 * (z_p ** 2)]) - logp3 = torch.einsum("klm, kln -> kmn", [m_p * o_scale, z_p]) - logp4 = torch.sum(-0.5 * (m_p ** 2) * o_scale, [1]).unsqueeze(-1) # [b, t, 1] - logp = logp2 + logp3 + logp1 + logp4 - attn = maximum_path(logp, attn_mask.squeeze(1)).unsqueeze(1).detach() - - # expand prior - m_p = torch.einsum("klmn, kjm -> kjn", [attn, m_p]) - logs_p = torch.einsum("klmn, kjm -> kjn", [attn, logs_p]) - - # mode 1: like SC-GlowTTS paper; mode 2: recommended for voice conversion - if self.args.fine_tuning_mode == 1: - z_ft = m_p - elif self.args.fine_tuning_mode == 2: - z_ft = z_p - else: - raise RuntimeError(" [!] 
Invalid Fine Tunning Mode !") - - # inverse decoder and get the output - z_f_pred = self.flow(z_ft, y_mask, g=g, reverse=True) - z_slice, slice_ids = rand_segments(z_f_pred, y_lengths, self.spec_segment_size) - - o = self.waveform_decoder(z_slice, g=g) - - wav_seg = segment( - waveform.transpose(1, 2), - slice_ids * self.config.audio.hop_length, - self.args.spec_segment_size * self.config.audio.hop_length, - ) - - if self.args.use_speaker_encoder_as_loss and self.speaker_encoder is not None: - # concate generated and GT waveforms - wavs_batch = torch.cat((wav_seg, o), dim=0).squeeze(1) - - # resample audio to speaker encoder sample_rate - if self.audio_transform is not None: - wavs_batch = self.audio_transform(wavs_batch) - - pred_embs = self.speaker_encoder.forward(wavs_batch, l2_norm=True) - - # split generated and GT speaker embeddings - gt_spk_emb, syn_spk_emb = torch.chunk(pred_embs, 2, dim=0) - else: - gt_spk_emb, syn_spk_emb = None, None - - outputs.update( - { - "model_outputs": o, - "alignments": attn.squeeze(1), - "loss_duration": 0.0, - "z": z, - "z_p": z_p, - "m_p": m_p, - "logs_p": logs_p, - "m_q": m_q, - "logs_q": logs_q, - "waveform_seg": wav_seg, - "gt_spk_emb": gt_spk_emb, - "syn_spk_emb": syn_spk_emb, - } - ) - return outputs def inference(self, x, aux_input={"d_vectors": None, "speaker_ids": None, "language_ids": None}): """ @@ -869,15 +747,6 @@ class Vits(BaseTTS): if optimizer_idx not in [0, 1]: raise ValueError(" [!] Unexpected `optimizer_idx`.") - # generator pass - if self.args.fine_tuning_mode: - # ToDo: find better place fot it - # force eval mode - self.eval() - # restore train mode for the vocoder part - self.waveform_decoder.train() - self.disc.train() - if self.args.freeze_encoder: for param in self.text_encoder.parameters(): param.requires_grad = False @@ -913,25 +782,14 @@ class Vits(BaseTTS): waveform = batch["waveform"] # generator pass - if self.args.fine_tuning_mode: - # model forward - outputs = self.forward_fine_tuning( - text_input, - text_lengths, - linear_input.transpose(1, 2), - mel_lengths, - aux_input={"d_vectors": d_vectors, "speaker_ids": speaker_ids, "language_ids": language_ids}, - waveform=waveform, - ) - else: - outputs = self.forward( - text_input, - text_lengths, - linear_input.transpose(1, 2), - mel_lengths, - aux_input={"d_vectors": d_vectors, "speaker_ids": speaker_ids, "language_ids": language_ids}, - waveform=waveform, - ) + outputs = self.forward( + text_input, + text_lengths, + linear_input.transpose(1, 2), + mel_lengths, + aux_input={"d_vectors": d_vectors, "speaker_ids": speaker_ids, "language_ids": language_ids}, + waveform=waveform, + ) # cache tensors for the discriminator self.y_disc_cache = None @@ -958,7 +816,6 @@ class Vits(BaseTTS): feats_disc_fake=outputs["feats_disc_fake"], feats_disc_real=outputs["feats_disc_real"], loss_duration=outputs["loss_duration"], - fine_tuning_mode=self.args.fine_tuning_mode, use_speaker_encoder_as_loss=self.args.use_speaker_encoder_as_loss, gt_spk_emb=outputs["gt_spk_emb"], syn_spk_emb=outputs["syn_spk_emb"], From 8c22d5ac49aada3e22e0ae56b89ef9168f41fac2 Mon Sep 17 00:00:00 2001 From: Edresson Date: Mon, 22 Nov 2021 08:48:56 -0300 Subject: [PATCH 155/220] Turn more clear the VITS loss function --- TTS/tts/layers/losses.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index acf750a0..9c219998 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -586,6 +586,11 @@ class 
VitsGeneratorLoss(nn.Module): l = kl / torch.sum(z_mask) return l + @staticmethod + def cosine_similarity_loss(gt_spk_emb, syn_spk_emb): + l = - torch.nn.functional.cosine_similarity(gt_spk_emb, syn_spk_emb).mean() + return l + def forward( self, waveform, @@ -632,9 +637,7 @@ class VitsGeneratorLoss(nn.Module): loss = loss_kl + loss_feat + loss_mel + loss_gen + loss_duration if use_speaker_encoder_as_loss: - loss_se = ( - -torch.nn.functional.cosine_similarity(gt_spk_emb, syn_spk_emb).mean() * self.spk_encoder_loss_alpha - ) + loss_se = self.cosine_similarity_loss(gt_spk_emb, syn_spk_emb) * self.spk_encoder_loss_alpha loss += loss_se return_dict["loss_spk_encoder"] = loss_se From 9daa33d1fde56f284009ca6feab32acef32af068 Mon Sep 17 00:00:00 2001 From: Edresson Date: Mon, 22 Nov 2021 08:57:13 -0300 Subject: [PATCH 156/220] Remove unusable speaker manager function --- TTS/tts/models/vits.py | 2 +- TTS/tts/utils/speakers.py | 14 -------------- 2 files changed, 1 insertion(+), 15 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 0abf0ca3..7b27bc73 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -519,7 +519,7 @@ class Vits(BaseTTS): if speaker_name is None: d_vector = self.speaker_manager.get_random_d_vector() else: - d_vector = self.speaker_manager.get_d_vector_by_speaker(speaker_name) + d_vector = self.speaker_manager.get_mean_d_vector(speaker_name, num_samples=1, randomize=False) elif config.use_speaker_embedding: if speaker_name is None: speaker_id = self.speaker_manager.get_random_speaker_id() diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 8c248658..828abede 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -194,20 +194,6 @@ class SpeakerManager: """ return [x["embedding"] for x in self.d_vectors.values() if x["name"] == speaker_idx] - def get_d_vector_by_speaker(self, speaker_idx: str) -> np.ndarray: - """Get a d_vector of a speaker. - - Args: - speaker_idx (str): Target speaker ID. - - Returns: - np.ndarray: d_vector. - """ - for x in self.d_vectors.values(): - if x["name"] == speaker_idx: - return x["embedding"] - return None - def get_mean_d_vector(self, speaker_idx: str, num_samples: int = None, randomize: bool = False) -> np.ndarray: """Get mean d_vector of a speaker ID. From b769b49e34140f45e25bd677b26b8c4448dd5382 Mon Sep 17 00:00:00 2001 From: Edresson Date: Mon, 22 Nov 2021 09:05:28 -0300 Subject: [PATCH 157/220] Remove the data from the set_d_vectors_from_file function --- TTS/tts/utils/speakers.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 828abede..c1eede3d 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -155,7 +155,7 @@ class SpeakerManager: """ self._save_json(file_path, self.d_vectors) - def set_d_vectors_from_file(self, file_path: str, data: List = None) -> None: + def set_d_vectors_from_file(self, file_path: str) -> None: """Load d_vectors from a json file. 
Args: @@ -163,12 +163,8 @@ class SpeakerManager: """ self.d_vectors = self._load_json(file_path) - # load speakers from data, because during the training we can just use some speakers from d_vector_file - if data is not None: - self.speaker_ids, _ = self.parse_speakers_from_data(data) - else: - speakers = sorted({x["name"] for x in self.d_vectors.values()}) - self.speaker_ids = {name: i for i, name in enumerate(speakers)} + speakers = sorted({x["name"] for x in self.d_vectors.values()}) + self.speaker_ids = {name: i for i, name in enumerate(speakers)} self.clip_ids = list(set(sorted(clip_name for clip_name in self.d_vectors.keys()))) @@ -386,7 +382,7 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, "You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.d_vector_file" ) speaker_manager.load_d_vectors_file(c.d_vector_file) - speaker_manager.set_d_vectors_from_file(speakers_file, data=data) + speaker_manager.set_d_vectors_from_file(speakers_file) elif not c.use_d_vector_file: # restor speaker manager with speaker ID file. speaker_ids_from_data = speaker_manager.speaker_ids speaker_manager.set_speaker_ids_from_file(speakers_file) @@ -395,7 +391,7 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, ), " [!] You cannot introduce new speakers to a pre-trained model." elif c.use_d_vector_file and c.d_vector_file: # new speaker manager with external speaker embeddings. - speaker_manager.set_d_vectors_from_file(c.d_vector_file, data=data) + speaker_manager.set_d_vectors_from_file(c.d_vector_file) elif c.use_d_vector_file and not c.d_vector_file: raise "use_d_vector_file is True, so you need pass a external speaker embedding file." elif c.use_speaker_embedding and "speakers_file" in c and c.speakers_file: From 34749f872770fc2c5f7726ef5847008d87015235 Mon Sep 17 00:00:00 2001 From: Edresson Date: Mon, 22 Nov 2021 09:18:29 -0300 Subject: [PATCH 158/220] Remove the call to get_speaker_manager --- TTS/bin/extract_tts_spectrograms.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index 0af98ff1..e46e4a00 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -12,7 +12,7 @@ from tqdm import tqdm from TTS.config import load_config from TTS.tts.datasets import TTSDataset, load_tts_samples from TTS.tts.models import setup_model -from TTS.tts.utils.speakers import get_speaker_manager +from TTS.tts.utils.speakers import SpeakerManager from TTS.utils.audio import AudioProcessor from TTS.utils.generic_utils import count_parameters @@ -234,8 +234,13 @@ def main(args): # pylint: disable=redefined-outer-name # use eval and training partitions meta_data = meta_data_train + meta_data_eval - # parse speakers - speaker_manager = get_speaker_manager(c, args, meta_data_train) + # init speaker manager + if config.use_speaker_embedding: + speaker_manager = SpeakerManager(data_items=meta_data) + elif config.use_d_vector_file: + speaker_manager = SpeakerManager(d_vectors_file_path=c.d_vector_file) + else: + speaker_manager = None # setup model model = setup_model(c) From 2b2cecaea284b1cce4184ede547456f63e2ac3c9 Mon Sep 17 00:00:00 2001 From: Edresson Date: Mon, 22 Nov 2021 09:25:32 -0300 Subject: [PATCH 159/220] Set the new_fields in copy_model_files as None by default --- TTS/trainer.py | 4 ++-- TTS/utils/io.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git 
a/TTS/trainer.py b/TTS/trainer.py index c151e716..b9026c8e 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -202,7 +202,7 @@ class Trainer: os.makedirs(output_path, exist_ok=True) # copy training assets to the output folder - copy_model_files(config, output_path, new_fields=None) + copy_model_files(config, output_path) # init class members self.args = args @@ -274,7 +274,7 @@ class Trainer: self.config.num_languages = self.model.language_manager.num_languages # update config file - copy_model_files(self.config, self.output_path, None) + copy_model_files(self.config, self.output_path) # setup criterion self.criterion = self.get_criterion(self.model) diff --git a/TTS/utils/io.py b/TTS/utils/io.py index a93f6118..54818ce9 100644 --- a/TTS/utils/io.py +++ b/TTS/utils/io.py @@ -26,7 +26,7 @@ class AttrDict(dict): self.__dict__ = self -def copy_model_files(config: Coqpit, out_path, new_fields): +def copy_model_files(config: Coqpit, out_path, new_fields=None): """Copy config.json and other model files to training folder and add new fields. From 85418ffeaa93cda22ac0be30855f55a33b64ce13 Mon Sep 17 00:00:00 2001 From: Edresson Date: Mon, 22 Nov 2021 13:43:44 -0300 Subject: [PATCH 160/220] Fix the bug in extract tts spectrograms --- TTS/bin/extract_tts_spectrograms.py | 9 +++++---- TTS/bin/train_tts.py | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index e46e4a00..014ba4e8 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -37,8 +37,8 @@ def setup_loader(ap, r, verbose=False): enable_eos_bos=c.enable_eos_bos_chars, use_noise_augment=False, verbose=verbose, - speaker_id_mapping=speaker_manager.speaker_ids, - d_vector_mapping=speaker_manager.d_vectors if c.use_speaker_embedding and c.use_d_vector_file else None, + speaker_id_mapping=speaker_manager.speaker_ids if c.use_speaker_embedding else None, + d_vector_mapping=speaker_manager.d_vectors if c.use_d_vector_file else None, ) if c.use_phonemes and c.compute_input_seq_cache: @@ -235,13 +235,14 @@ def main(args): # pylint: disable=redefined-outer-name meta_data = meta_data_train + meta_data_eval # init speaker manager - if config.use_speaker_embedding: + if c.use_speaker_embedding: speaker_manager = SpeakerManager(data_items=meta_data) - elif config.use_d_vector_file: + elif c.use_d_vector_file: speaker_manager = SpeakerManager(d_vectors_file_path=c.d_vector_file) else: speaker_manager = None + # setup model model = setup_model(c) diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index 1a9faf02..e28e9dec 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -64,7 +64,7 @@ def main(): train_samples=train_samples, eval_samples=eval_samples, training_assets={"audio_processor": ap}, - parse_command_line_args=True, + parse_command_line_args=False, ) trainer.fit() From 45d0b04179d0055db1a847e43c3e62927c7b1989 Mon Sep 17 00:00:00 2001 From: Edresson Date: Mon, 22 Nov 2021 15:55:00 -0300 Subject: [PATCH 161/220] Lint fixs --- TTS/bin/find_unique_phonemes.py | 4 ++-- TTS/speaker_encoder/models/resnet.py | 1 + TTS/tts/configs/vits_config.py | 4 ++-- TTS/utils/vad.py | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/TTS/bin/find_unique_phonemes.py b/TTS/bin/find_unique_phonemes.py index a869df27..832ef082 100644 --- a/TTS/bin/find_unique_phonemes.py +++ b/TTS/bin/find_unique_phonemes.py @@ -7,7 +7,7 @@ from tqdm.contrib.concurrent import process_map from TTS.config import load_config from 
TTS.tts.datasets import load_tts_samples -from TTS.tts.utils.text import text2phone, phoneme_to_sequence +from TTS.tts.utils.text import text2phone def compute_phonemes(item): @@ -19,8 +19,8 @@ def compute_phonemes(item): return [] return list(set(ph)) - def main(): + # pylint: disable=W0601 global c # pylint: disable=bad-option-value parser = argparse.ArgumentParser( diff --git a/TTS/speaker_encoder/models/resnet.py b/TTS/speaker_encoder/models/resnet.py index 42f041b4..47b6f23f 100644 --- a/TTS/speaker_encoder/models/resnet.py +++ b/TTS/speaker_encoder/models/resnet.py @@ -176,6 +176,7 @@ class ResNetSpeakerEncoder(nn.Module): def forward(self, x, l2_norm=False): with torch.no_grad(): with torch.cuda.amp.autocast(enabled=False): + # if you torch spec compute it otherwise use the mel spec computed by the AP if self.use_torch_spec: x = self.torch_spec(x) else: diff --git a/TTS/tts/configs/vits_config.py b/TTS/tts/configs/vits_config.py index eeb74bbe..178992a7 100644 --- a/TTS/tts/configs/vits_config.py +++ b/TTS/tts/configs/vits_config.py @@ -154,6 +154,6 @@ class VitsConfig(BaseTTSConfig): d_vector_dim: int = None def __post_init__(self): - for key in self.model_args.keys(): + for key, val in self.model_args.items(): if hasattr(self, key): - self[key] = self.model_args[key] + self[key] = val diff --git a/TTS/utils/vad.py b/TTS/utils/vad.py index 4e61f490..33548087 100644 --- a/TTS/utils/vad.py +++ b/TTS/utils/vad.py @@ -139,4 +139,4 @@ def get_vad_speech_segments(audio, sample_rate, aggressiveness=2, padding_durati frames = list(frame_generator(30, audio, sample_rate)) segments = vad_collector(sample_rate, 30, padding_duration_ms, vad, frames) - return segments \ No newline at end of file + return segments From f34596d9572477d40d108b40734fb5be5f2b071b Mon Sep 17 00:00:00 2001 From: Edresson Date: Mon, 22 Nov 2021 16:41:46 -0300 Subject: [PATCH 162/220] Fix function name --- TTS/tts/models/vits.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 7b27bc73..ccd742b1 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -491,7 +491,7 @@ class Vits(BaseTTS): sid, g, lid = self._set_cond_input(aux_input) return {"speaker_id": sid, "style_wav": None, "d_vector": g, "language_id": lid} - def get_aux_input_from_test_setences(self, sentence_info): + def get_aux_input_from_test_sentences(self, sentence_info): if hasattr(self.config, "model_args"): config = self.config.model_args else: @@ -907,7 +907,7 @@ class Vits(BaseTTS): test_sentences = self.config.test_sentences for idx, s_info in enumerate(test_sentences): try: - aux_inputs = self.get_aux_input_from_test_setences(s_info) + aux_inputs = self.get_aux_input_from_test_sentences(s_info) wav, alignment, _, _ = synthesis( self, aux_inputs["text"], From 9ca3af900fc3becea86dd3e4296ca26b3768eec4 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Mon, 22 Nov 2021 22:42:12 +0100 Subject: [PATCH 163/220] remove inference notebook --- .../VITS_d-vector_multilingual_exemple.ipynb | 223 ------------------ 1 file changed, 223 deletions(-) delete mode 100644 notebooks/VITS_d-vector_multilingual_exemple.ipynb diff --git a/notebooks/VITS_d-vector_multilingual_exemple.ipynb b/notebooks/VITS_d-vector_multilingual_exemple.ipynb deleted file mode 100644 index 41713295..00000000 --- a/notebooks/VITS_d-vector_multilingual_exemple.ipynb +++ /dev/null @@ -1,223 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "source": [ - "import IPython\n", - "import 
torch\n", - "\n", - "from IPython.display import Audio\n", - "\n", - "from TTS.config import load_config\n", - "from TTS.tts.models import setup_model\n", - "from TTS.tts.utils.synthesis import synthesis\n", - "from TTS.utils.audio import AudioProcessor" - ], - "outputs": [], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 2, - "source": [ - "GENERAL_PATH = '/home/julian/workspace/train/VITS-pt-en-fr-lr/vits-August-29-2021_01+20PM-c68d7fa25/'\n", - "MODEL_PATH = GENERAL_PATH + 'best_model.pth.tar'\n", - "CONFIG_PATH = GENERAL_PATH + 'config.json'\n", - "TTS_LANGUAGES = GENERAL_PATH + \"language_ids.json\"\n", - "TTS_SPEAKERS = GENERAL_PATH + \"speakers.json\"\n", - "USE_CUDA = torch.cuda.is_available()" - ], - "outputs": [], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 3, - "source": [ - "# load the config\n", - "C = load_config(CONFIG_PATH)\n", - "\n", - "# load the audio processor\n", - "ap = AudioProcessor(**C.audio)\n", - "\n", - "speaker_embedding = None\n", - "\n", - "C.model_args['d_vector_file'] = TTS_SPEAKERS\n", - "\n", - "model = setup_model(C)\n", - "model.language_manager.set_language_ids_from_file(TTS_LANGUAGES)\n", - "cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))\n", - "model.load_state_dict(cp['model'])\n", - "\n", - "\n", - "model.eval()\n", - "\n", - "if USE_CUDA:\n", - " model = model.cuda()\n", - "\n", - "use_griffin_lim = True" - ], - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - " > Setting up Audio Processor...\n", - " | > sample_rate:16000\n", - " | > resample:False\n", - " | > num_mels:80\n", - " | > min_level_db:-100\n", - " | > frame_shift_ms:None\n", - " | > frame_length_ms:None\n", - " | > ref_level_db:20\n", - " | > fft_size:1024\n", - " | > power:1.5\n", - " | > preemphasis:0.0\n", - " | > griffin_lim_iters:60\n", - " | > signal_norm:False\n", - " | > symmetric_norm:True\n", - " | > mel_fmin:0\n", - " | > mel_fmax:None\n", - " | > spec_gain:1.0\n", - " | > stft_pad_mode:reflect\n", - " | > max_norm:4.0\n", - " | > clip_norm:True\n", - " | > do_trim_silence:True\n", - " | > trim_db:45\n", - " | > do_sound_norm:False\n", - " | > do_amp_to_db_linear:False\n", - " | > do_amp_to_db_mel:True\n", - " | > stats_path:None\n", - " | > base:2.718281828459045\n", - " | > hop_length:256\n", - " | > win_length:1024\n", - " > Using model: vits\n", - " > Speaker manager is loaded with 421 speakers: ED, MLS_10032, MLS_10058, MLS_10065, MLS_10082, MLS_10087, MLS_10177, MLS_103, MLS_10620, MLS_10827, MLS_10957, MLS_112, MLS_11247, MLS_1127, MLS_115, MLS_11743, MLS_11772, MLS_11795, MLS_11822, MLS_11875, MLS_11954, MLS_12205, MLS_123, MLS_1243, MLS_125, MLS_12501, MLS_12512, MLS_12541, MLS_12709, MLS_12713, MLS_12823, MLS_12899, MLS_12968, MLS_12981, MLS_13142, MLS_13177, MLS_1329, MLS_13611, MLS_13634, MLS_13655, MLS_13658, MLS_14, MLS_1474, MLS_1579, MLS_1590, MLS_1591, MLS_1624, MLS_1649, MLS_1664, MLS_1745, MLS_177, MLS_1770, MLS_1798, MLS_1805, MLS_1817, MLS_1840, MLS_1844, MLS_1869, MLS_1887, MLS_1977, MLS_1989, MLS_2033, MLS_204, MLS_2155, MLS_2284, MLS_2297, MLS_2316, MLS_2506, MLS_2544, MLS_2587, MLS_2596, MLS_2607, MLS_27, MLS_2771, MLS_2776, MLS_28, MLS_2825, MLS_2904, MLS_2926, MLS_2946, MLS_30, MLS_3060, MLS_3182, MLS_3190, MLS_3204, MLS_3267, MLS_3270, MLS_3319, MLS_3344, MLS_3370, MLS_3464, MLS_3503, MLS_3595, MLS_3698, MLS_4018, MLS_4174, MLS_4193, MLS_4336, MLS_4396, MLS_4512, MLS_4609, MLS_4650, MLS_4699, MLS_4724, MLS_4744, MLS_4937, MLS_5021, MLS_5077, MLS_52, 
MLS_5232, MLS_5295, MLS_5525, MLS_5526, MLS_5553, MLS_5595, MLS_5612, MLS_5764, MLS_577, MLS_579, MLS_5830, MLS_5840, MLS_5968, MLS_6070, MLS_6128, MLS_62, MLS_6249, MLS_6318, MLS_6348, MLS_6362, MLS_6381, MLS_66, MLS_6856, MLS_694, MLS_7032, MLS_707, MLS_7142, MLS_7150, MLS_7193, MLS_7200, MLS_7239, MLS_7377, MLS_7423, MLS_7438, MLS_7439, MLS_753, MLS_7591, MLS_7601, MLS_7614, MLS_7679, MLS_78, MLS_7848, MLS_8102, MLS_8128, MLS_8582, MLS_8778, MLS_9121, MLS_9242, MLS_928, MLS_94, MLS_9804, MLS_9854, VCTK_p225, VCTK_p226, VCTK_p227, VCTK_p228, VCTK_p229, VCTK_p230, VCTK_p231, VCTK_p232, VCTK_p233, VCTK_p234, VCTK_p236, VCTK_p237, VCTK_p238, VCTK_p239, VCTK_p240, VCTK_p241, VCTK_p243, VCTK_p244, VCTK_p245, VCTK_p246, VCTK_p247, VCTK_p248, VCTK_p249, VCTK_p250, VCTK_p251, VCTK_p252, VCTK_p253, VCTK_p254, VCTK_p255, VCTK_p256, VCTK_p257, VCTK_p258, VCTK_p259, VCTK_p260, VCTK_p261, VCTK_p262, VCTK_p263, VCTK_p264, VCTK_p265, VCTK_p266, VCTK_p267, VCTK_p268, VCTK_p269, VCTK_p270, VCTK_p271, VCTK_p272, VCTK_p273, VCTK_p274, VCTK_p275, VCTK_p276, VCTK_p277, VCTK_p278, VCTK_p279, VCTK_p280, VCTK_p281, VCTK_p282, VCTK_p283, VCTK_p284, VCTK_p285, VCTK_p286, VCTK_p287, VCTK_p288, VCTK_p292, VCTK_p293, VCTK_p294, VCTK_p295, VCTK_p297, VCTK_p298, VCTK_p299, VCTK_p300, VCTK_p301, VCTK_p302, VCTK_p303, VCTK_p304, VCTK_p305, VCTK_p306, VCTK_p307, VCTK_p308, VCTK_p310, VCTK_p311, VCTK_p312, VCTK_p313, VCTK_p314, VCTK_p316, VCTK_p317, VCTK_p318, VCTK_p323, VCTK_p326, VCTK_p329, VCTK_p330, VCTK_p333, VCTK_p334, VCTK_p335, VCTK_p336, VCTK_p339, VCTK_p340, VCTK_p341, VCTK_p343, VCTK_p345, VCTK_p347, VCTK_p351, VCTK_p360, VCTK_p361, VCTK_p362, VCTK_p363, VCTK_p364, VCTK_p374, VCTK_p376, bernard, elodie, ezwa, gilles_g_le_blanc, nadine_eckert_boulet, openSLR_afr0184, openSLR_afr1919, openSLR_afr2418, openSLR_afr6590, openSLR_afr7130, openSLR_afr7214, openSLR_afr8148, openSLR_afr8924, openSLR_afr8963, openSLR_jvf00264, openSLR_jvf00658, openSLR_jvf01392, openSLR_jvf02059, openSLR_jvf02884, openSLR_jvf03187, openSLR_jvf04679, openSLR_jvf04715, openSLR_jvf04982, openSLR_jvf05540, openSLR_jvf06207, openSLR_jvf06510, openSLR_jvf06941, openSLR_jvf07335, openSLR_jvf07638, openSLR_jvf08002, openSLR_jvf08305, openSLR_jvf08736, openSLR_jvf09039, openSLR_jvm00027, openSLR_jvm01519, openSLR_jvm01932, openSLR_jvm02326, openSLR_jvm03314, openSLR_jvm03424, openSLR_jvm03727, openSLR_jvm04175, openSLR_jvm04285, openSLR_jvm04588, openSLR_jvm05219, openSLR_jvm05522, openSLR_jvm05667, openSLR_jvm05970, openSLR_jvm06080, openSLR_jvm06383, openSLR_jvm07765, openSLR_jvm07875, openSLR_jvm08178, openSLR_jvm09724, openSLR_sso0145, openSLR_sso0493, openSLR_sso0806, openSLR_sso1266, openSLR_sso1367, openSLR_sso1801, openSLR_sso2388, openSLR_sso2910, openSLR_sso4592, openSLR_sso5945, openSLR_sso6499, openSLR_sso7801, openSLR_sso7821, openSLR_sso7876, openSLR_sso7912, openSLR_sso7934, openSLR_sso8596, openSLR_sso8777, openSLR_sso9892, openSLR_suf00297, openSLR_suf00600, openSLR_suf00691, openSLR_suf00994, openSLR_suf01056, openSLR_suf01359, openSLR_suf02092, openSLR_suf02395, openSLR_suf02953, openSLR_suf03712, openSLR_suf03887, openSLR_suf04190, openSLR_suf04646, openSLR_suf04748, openSLR_suf05051, openSLR_suf05507, openSLR_suf06543, openSLR_suf07302, openSLR_suf08338, openSLR_suf08703, openSLR_sum00060, openSLR_sum00454, openSLR_sum01038, openSLR_sum01552, openSLR_sum01596, openSLR_sum01855, openSLR_sum01899, openSLR_sum02716, openSLR_sum03391, openSLR_sum03650, openSLR_sum03694, openSLR_sum04208, openSLR_sum04511, openSLR_sum05186, 
openSLR_sum06003, openSLR_sum06047, openSLR_sum07842, openSLR_sum08659, openSLR_sum09243, openSLR_sum09637, openSLR_sum09757, openSLR_tsn0045, openSLR_tsn0378, openSLR_tsn0441, openSLR_tsn1483, openSLR_tsn1498, openSLR_tsn1932, openSLR_tsn2839, openSLR_tsn3342, openSLR_tsn3629, openSLR_tsn4506, openSLR_tsn4850, openSLR_tsn5628, openSLR_tsn6116, openSLR_tsn6206, openSLR_tsn6234, openSLR_tsn6459, openSLR_tsn7674, openSLR_tsn7693, openSLR_tsn7866, openSLR_tsn7896, openSLR_tsn8333, openSLR_tsn8512, openSLR_tsn8532, openSLR_tsn8914, openSLR_tsn9061, openSLR_tsn9365, openSLR_xho0050, openSLR_xho0120, openSLR_xho1547, openSLR_xho3616, openSLR_xho4280, openSLR_xho4291, openSLR_xho5378, openSLR_xho5680, openSLR_xho6975, openSLR_xho7590, openSLR_xho7599, openSLR_xho9446, zeckou\n" - ] - } - ], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 4, - "source": [ - "#set speaker\n", - "d_vector = model.speaker_manager.get_mean_d_vector('VCTK_p260')" - ], - "outputs": [], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 5, - "source": [ - "model.language_manager.language_id_mapping" - ], - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "{'af': 0,\n", - " 'en': 1,\n", - " 'fr-fr': 2,\n", - " 'jv': 3,\n", - " 'pt-br': 4,\n", - " 'st': 5,\n", - " 'su': 6,\n", - " 'tn': 7,\n", - " 'xh': 8}" - ] - }, - "metadata": {}, - "execution_count": 5 - } - ], - "metadata": { - "scrolled": true - } - }, - { - "cell_type": "code", - "execution_count": 6, - "source": [ - "# set scales \n", - "model.noise_scale = 0.0 # defines the noise variance applied to the random z vector at inference.\n", - "model.length_scale = 1.0 # scaler for the duration predictor. The larger it is, the slower the speech.\n", - "model.noise_scale_w = 0.0 # defines the noise variance applied to the duration predictor z vector at inference.\n", - "model.inference_noise_scale = 0.5 # defines the noise variance applied to the random z vector at inference.\n", - "model.inference_noise_scale_dp = 0.6 # defines the noise variance applied to the duration predictor z vector at inference." 
- ], - "outputs": [], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 7, - "source": [ - "text = \"Il m'a fallu beaucoup de temps pour développer une voix, et maintenant que je l'ai, je ne vais pas me taire.\"\n", - "language_id = 2\n", - "wav, alignment, _, _ = synthesis(\n", - " model,\n", - " text,\n", - " C,\n", - " \"cuda\" in str(next(model.parameters()).device),\n", - " ap,\n", - " speaker_id=None,\n", - " d_vector=d_vector,\n", - " style_wav=None,\n", - " language_id=language_id,\n", - " enable_eos_bos_chars=C.enable_eos_bos_chars,\n", - " use_griffin_lim=True,\n", - " do_trim_silence=False,\n", - " ).values()\n", - "IPython.display.display(Audio(wav, rate=ap.sample_rate))" - ], - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {} - } - ], - "metadata": {} - } - ], - "metadata": { - "interpreter": { - "hash": "b925b73899c1545aa2d9bbcf4e8e1df4138a367d2daefc2707570579325ca4c0" - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3.8.10 64-bit ('TTS': conda)" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file From 90eac13bb222e4188be638cd09932a39badc906f Mon Sep 17 00:00:00 2001 From: Edresson Date: Mon, 22 Nov 2021 18:42:54 -0300 Subject: [PATCH 164/220] Rename ununsed_speakers to ignored_speakers --- TTS/config/shared_configs.py | 2 +- TTS/tts/datasets/__init__.py | 7 +++--- TTS/tts/datasets/formatters.py | 42 +++++++++++++++++----------------- 3 files changed, 25 insertions(+), 26 deletions(-) diff --git a/TTS/config/shared_configs.py b/TTS/config/shared_configs.py index f1ea2e0f..c52cfe8a 100644 --- a/TTS/config/shared_configs.py +++ b/TTS/config/shared_configs.py @@ -198,7 +198,7 @@ class BaseDatasetConfig(Coqpit): name: str = "" path: str = "" meta_file_train: str = "" - ununsed_speakers: List[str] = None + ignored_speakers: List[str] = None language: str = "" meta_file_val: str = "" meta_file_attn_mask: str = "" diff --git a/TTS/tts/datasets/__init__.py b/TTS/tts/datasets/__init__.py index 3673e188..40eed7e3 100644 --- a/TTS/tts/datasets/__init__.py +++ b/TTS/tts/datasets/__init__.py @@ -67,22 +67,21 @@ def load_tts_samples( root_path = dataset["path"] meta_file_train = dataset["meta_file_train"] meta_file_val = dataset["meta_file_val"] - ununsed_speakers = dataset["ununsed_speakers"] + ignored_speakers = dataset["ignored_speakers"] language = dataset["language"] # setup the right data processor if formatter is None: formatter = _get_formatter_by_name(name) # load train set - meta_data_train = formatter(root_path, meta_file_train, ununsed_speakers=ununsed_speakers) - # TODO: remove the loops and pass language as a parameter to preprocessor for faster load + meta_data_train = formatter(root_path, meta_file_train, ignored_speakers=ignored_speakers) meta_data_train = [[*item, language] for item in meta_data_train] print(f" | > Found {len(meta_data_train)} files in {Path(root_path).resolve()}") # load evaluation split if set if eval_split: if meta_file_val: - meta_data_eval = formatter(root_path, meta_file_val, ununsed_speakers=ununsed_speakers) + meta_data_eval = formatter(root_path, meta_file_val, ignored_speakers=ignored_speakers) meta_data_eval = [[*item, 
language] for item in meta_data_eval] else: meta_data_eval, meta_data_train = split_dataset(meta_data_train) diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index 49a1ced4..1f23f85e 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -59,7 +59,7 @@ def mozilla_de(root_path, meta_file, **kwargs): # pylint: disable=unused-argume return items -def mailabs(root_path, meta_files=None, ununsed_speakers=None): +def mailabs(root_path, meta_files=None, ignored_speakers=None): """Normalizes M-AI-Labs meta data files to TTS format Args: @@ -88,8 +88,8 @@ def mailabs(root_path, meta_files=None, ununsed_speakers=None): continue speaker_name = speaker_name_match.group("speaker_name") # ignore speakers - if isinstance(ununsed_speakers, list): - if speaker_name in ununsed_speakers: + if isinstance(ignored_speakers, list): + if speaker_name in ignored_speakers: continue print(" | > {}".format(csv_file)) with open(txt_file, "r", encoding="utf-8") as ttf: @@ -197,7 +197,7 @@ def nancy(root_path, meta_file, **kwargs): # pylint: disable=unused-argument return items -def common_voice(root_path, meta_file, ununsed_speakers=None): +def common_voice(root_path, meta_file, ignored_speakers=None): """Normalize the common voice meta data file to TTS format.""" txt_file = os.path.join(root_path, meta_file) items = [] @@ -209,15 +209,15 @@ def common_voice(root_path, meta_file, ununsed_speakers=None): text = cols[2] speaker_name = cols[0] # ignore speakers - if isinstance(ununsed_speakers, list): - if speaker_name in ununsed_speakers: + if isinstance(ignored_speakers, list): + if speaker_name in ignored_speakers: continue wav_file = os.path.join(root_path, "clips", cols[1].replace(".mp3", ".wav")) items.append([text, wav_file, "MCV_" + speaker_name]) return items -def libri_tts(root_path, meta_files=None, ununsed_speakers=None): +def libri_tts(root_path, meta_files=None, ignored_speakers=None): """https://ai.google/tools/datasets/libri-tts/""" items = [] if not meta_files: @@ -237,8 +237,8 @@ def libri_tts(root_path, meta_files=None, ununsed_speakers=None): wav_file = os.path.join(_root_path, file_name + ".wav") text = cols[2] # ignore speakers - if isinstance(ununsed_speakers, list): - if speaker_name in ununsed_speakers: + if isinstance(ignored_speakers, list): + if speaker_name in ignored_speakers: continue items.append([text, wav_file, "LTTS_" + speaker_name]) for item in items: @@ -265,7 +265,7 @@ def custom_turkish(root_path, meta_file, **kwargs): # pylint: disable=unused-ar # ToDo: add the dataset link when the dataset is released publicly -def brspeech(root_path, meta_file, ununsed_speakers=None): +def brspeech(root_path, meta_file, ignored_speakers=None): """BRSpeech 3.0 beta""" txt_file = os.path.join(root_path, meta_file) items = [] @@ -278,14 +278,14 @@ def brspeech(root_path, meta_file, ununsed_speakers=None): text = cols[2] speaker_id = cols[3] # ignore speakers - if isinstance(ununsed_speakers, list): - if speaker_id in ununsed_speakers: + if isinstance(ignored_speakers, list): + if speaker_id in ignored_speakers: continue items.append([text, wav_file, speaker_id]) return items -def vctk(root_path, meta_files=None, wavs_path="wav48", ununsed_speakers=None): +def vctk(root_path, meta_files=None, wavs_path="wav48", ignored_speakers=None): """homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz""" items = [] meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) @@ -293,8 +293,8 @@ def vctk(root_path, meta_files=None, 
wavs_path="wav48", ununsed_speakers=None): _, speaker_id, txt_file = os.path.relpath(meta_file, root_path).split(os.sep) file_id = txt_file.split(".")[0] # ignore speakers - if isinstance(ununsed_speakers, list): - if speaker_id in ununsed_speakers: + if isinstance(ignored_speakers, list): + if speaker_id in ignored_speakers: continue with open(meta_file, "r", encoding="utf-8") as file_text: text = file_text.readlines()[0] @@ -304,7 +304,7 @@ def vctk(root_path, meta_files=None, wavs_path="wav48", ununsed_speakers=None): return items -def vctk_slim(root_path, meta_files=None, wavs_path="wav48", ununsed_speakers=None): # pylint: disable=unused-argument +def vctk_slim(root_path, meta_files=None, wavs_path="wav48", ignored_speakers=None): # pylint: disable=unused-argument """homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz""" items = [] txt_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) @@ -312,8 +312,8 @@ def vctk_slim(root_path, meta_files=None, wavs_path="wav48", ununsed_speakers=No _, speaker_id, txt_file = os.path.relpath(text_file, root_path).split(os.sep) file_id = txt_file.split(".")[0] # ignore speakers - if isinstance(ununsed_speakers, list): - if speaker_id in ununsed_speakers: + if isinstance(ignored_speakers, list): + if speaker_id in ignored_speakers: continue wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id + ".wav") items.append([None, wav_file, "VCTK_" + speaker_id]) @@ -321,7 +321,7 @@ def vctk_slim(root_path, meta_files=None, wavs_path="wav48", ununsed_speakers=No return items -def mls(root_path, meta_files=None, ununsed_speakers=None): +def mls(root_path, meta_files=None, ignored_speakers=None): """http://www.openslr.org/94/""" items = [] with open(os.path.join(root_path, meta_files), "r", encoding="utf-8") as meta: @@ -331,8 +331,8 @@ def mls(root_path, meta_files=None, ununsed_speakers=None): speaker, book, *_ = file.split("_") wav_file = os.path.join(root_path, os.path.dirname(meta_files), "audio", speaker, book, file + ".wav") # ignore speakers - if isinstance(ununsed_speakers, list): - if speaker in ununsed_speakers: + if isinstance(ignored_speakers, list): + if speaker in ignored_speakers: continue items.append([text, wav_file, "MLS_" + speaker]) return items From f394d606956064b290dfbaa0788aaa77cab191e7 Mon Sep 17 00:00:00 2001 From: Edresson Date: Mon, 22 Nov 2021 18:43:56 -0300 Subject: [PATCH 165/220] Fix the bug in multispeaker vits --- TTS/bin/train_tts.py | 9 +++++++++ tests/tts_tests/test_vits_multilingual_train.py | 6 ++++++ 2 files changed, 15 insertions(+) diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index e28e9dec..f39ed259 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -47,8 +47,17 @@ def main(): # init speaker manager if config.use_speaker_embedding: speaker_manager = SpeakerManager(data_items=train_samples + eval_samples) + if hasattr(config, "model_args"): + config.model_args.num_speakers = len(speaker_manager.speaker_ids) + else: + config.num_speakers = len(speaker_manager.speaker_ids) + elif config.use_d_vector_file: speaker_manager = SpeakerManager(d_vectors_file_path=config.d_vector_file) + if hasattr(config, "model_args"): + config.model_args.num_speakers = len(speaker_manager.speaker_ids) + else: + config.num_speakers = len(speaker_manager.speaker_ids) else: speaker_manager = None diff --git a/tests/tts_tests/test_vits_multilingual_train.py b/tests/tts_tests/test_vits_multilingual_train.py index 04b42e61..a280e8c5 100644 --- 
a/tests/tts_tests/test_vits_multilingual_train.py +++ b/tests/tts_tests/test_vits_multilingual_train.py @@ -53,9 +53,15 @@ config.audio.trim_db = 60 # active multilingual mode config.model_args.use_language_embedding = True +config.use_language_embedding = True # active multispeaker mode config.model_args.use_speaker_embedding = True +config.use_speaker_embedding = True +# config.num_speakers=1 +# config.model_args.num_speakers=1 + config.model_args.use_d_vector_file = False +config.use_d_vector_file = False # active language sampler config.use_language_weighted_sampler = True From 4196a42de76f753ead8ca5c408844a933014d463 Mon Sep 17 00:00:00 2001 From: Edresson Date: Mon, 22 Nov 2021 18:52:59 -0300 Subject: [PATCH 166/220] Get the number speaker from the Speaker Manager property --- TTS/bin/train_tts.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index f39ed259..a543a947 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -48,16 +48,15 @@ def main(): if config.use_speaker_embedding: speaker_manager = SpeakerManager(data_items=train_samples + eval_samples) if hasattr(config, "model_args"): - config.model_args.num_speakers = len(speaker_manager.speaker_ids) + config.model_args.num_speakers = speaker_manager.num_speakers else: - config.num_speakers = len(speaker_manager.speaker_ids) - + config.num_speakers = speaker_manager.num_speakers elif config.use_d_vector_file: speaker_manager = SpeakerManager(d_vectors_file_path=config.d_vector_file) if hasattr(config, "model_args"): - config.model_args.num_speakers = len(speaker_manager.speaker_ids) + config.model_args.num_speakers = speaker_manager.num_speakers else: - config.num_speakers = len(speaker_manager.speaker_ids) + config.num_speakers = speaker_manager.num_speakers else: speaker_manager = None From 12968532fe57f87e6c294fd0de96004ffcae7d98 Mon Sep 17 00:00:00 2001 From: Edresson Date: Mon, 22 Nov 2021 20:02:05 -0300 Subject: [PATCH 167/220] Add the language embedding dim in the duration predictor class --- TTS/tts/layers/glow_tts/duration_predictor.py | 5 +++++ TTS/tts/layers/vits/stochastic_duration_predictor.py | 6 +++++- TTS/tts/models/vits.py | 4 ++-- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/TTS/tts/layers/glow_tts/duration_predictor.py b/TTS/tts/layers/glow_tts/duration_predictor.py index f46c73a9..e766ed6a 100644 --- a/TTS/tts/layers/glow_tts/duration_predictor.py +++ b/TTS/tts/layers/glow_tts/duration_predictor.py @@ -20,6 +20,11 @@ class DurationPredictor(nn.Module): def __init__(self, in_channels, hidden_channels, kernel_size, dropout_p, cond_channels=None, language_emb_dim=None): super().__init__() + + # add language embedding dim in the input + if language_emb_dim: + in_channels += language_emb_dim + # class arguments self.in_channels = in_channels self.filter_channels = hidden_channels diff --git a/TTS/tts/layers/vits/stochastic_duration_predictor.py b/TTS/tts/layers/vits/stochastic_duration_predictor.py index 7c25156a..120d0944 100644 --- a/TTS/tts/layers/vits/stochastic_duration_predictor.py +++ b/TTS/tts/layers/vits/stochastic_duration_predictor.py @@ -185,10 +185,14 @@ class StochasticDurationPredictor(nn.Module): dropout_p: float, num_flows=4, cond_channels=0, - language_emb_dim=None, + language_emb_dim=0, ): super().__init__() + # add language embedding dim in the input + if language_emb_dim: + in_channels += language_emb_dim + # condition encoder text self.pre = nn.Conv1d(in_channels, hidden_channels, 1) self.convs 
= DilatedDepthSeparableConv(hidden_channels, kernel_size, num_layers=3, dropout_p=dropout_p) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index ccd742b1..6b1dd325 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -321,7 +321,7 @@ class Vits(BaseTTS): if args.use_sdp: self.duration_predictor = StochasticDurationPredictor( - args.hidden_channels + self.embedded_language_dim, + args.hidden_channels, 192, 3, args.dropout_p_duration_predictor, @@ -331,7 +331,7 @@ class Vits(BaseTTS): ) else: self.duration_predictor = DurationPredictor( - args.hidden_channels + self.embedded_language_dim, + args.hidden_channels, 256, 3, args.dropout_p_duration_predictor, From 06d89f93a833b36c869deea369621e5d4ae35f93 Mon Sep 17 00:00:00 2001 From: Edresson Date: Mon, 22 Nov 2021 20:19:37 -0300 Subject: [PATCH 168/220] Add VITS multilingual d-vectors unit test --- .../test_vits_multilingual_train-d_vectors.py | 93 +++++++++++++++++++ .../tts_tests/test_vits_multilingual_train.py | 8 +- 2 files changed, 99 insertions(+), 2 deletions(-) create mode 100644 tests/tts_tests/test_vits_multilingual_train-d_vectors.py diff --git a/tests/tts_tests/test_vits_multilingual_train-d_vectors.py b/tests/tts_tests/test_vits_multilingual_train-d_vectors.py new file mode 100644 index 00000000..f426e383 --- /dev/null +++ b/tests/tts_tests/test_vits_multilingual_train-d_vectors.py @@ -0,0 +1,93 @@ +import glob +import os +import shutil + +from tests import get_device_id, get_tests_output_path, run_cli +from TTS.config.shared_configs import BaseDatasetConfig +from TTS.tts.configs.vits_config import VitsConfig + +config_path = os.path.join(get_tests_output_path(), "test_model_config.json") +output_path = os.path.join(get_tests_output_path(), "train_outputs") + + +dataset_config1 = BaseDatasetConfig( + name="ljspeech", + meta_file_train="metadata.csv", + meta_file_val="metadata.csv", + path="tests/data/ljspeech", + language="en", +) + +dataset_config2 = BaseDatasetConfig( + name="ljspeech", + meta_file_train="metadata.csv", + meta_file_val="metadata.csv", + path="tests/data/ljspeech", + language="en2", +) + +config = VitsConfig( + batch_size=2, + eval_batch_size=2, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + use_espeak_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + ["Be a voice, not an echo.", "ljspeech-0", None, "en"], + ["Be a voice, not an echo.", "ljspeech-1", None, "en2"], + ], + datasets=[dataset_config1, dataset_config2], +) +# set audio config +config.audio.do_trim_silence = True +config.audio.trim_db = 60 + +# active multilingual mode +config.model_args.use_language_embedding = True +config.use_language_embedding = True + +# deactivate multispeaker mode +config.model_args.use_speaker_embedding = False +config.use_speaker_embedding = False + +# active multispeaker d-vec mode +config.model_args.use_d_vector_file = True +config.use_d_vector_file = True +config.model_args.d_vector_file = "tests/data/ljspeech/speakers.json" +config.d_vector_file = "tests/data/ljspeech/speakers.json" +config.model_args.d_vector_dim = 256 +config.d_vector_dim = 256 + +# duration predictor +config.model_args.use_sdp = True +config.use_sdp = True + +# deactivate language sampler +config.use_language_weighted_sampler = False + +config.save_json(config_path) + +# train the model for one epoch 
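# The multi-speaker / multi-lingual switches above are set twice, once on the config root and
# once on `config.model_args`: VitsConfig and VitsArgs keep separate copies and different code
# paths read different ones (TTS/bin/train_tts.py, for example, checks
# `hasattr(config, "model_args")` before deciding where to write `num_speakers`). The CLI
# command assembled below runs that script end to end, so the test also covers this
# speaker-manager initialization path.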
+command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.test_delay_epochs 0" +) +run_cli(command_train) + +# Find latest folder +continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + +# restore the model and continue training for one more epoch +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " +run_cli(command_train) +shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_vits_multilingual_train.py b/tests/tts_tests/test_vits_multilingual_train.py index a280e8c5..90f589d0 100644 --- a/tests/tts_tests/test_vits_multilingual_train.py +++ b/tests/tts_tests/test_vits_multilingual_train.py @@ -57,11 +57,15 @@ config.use_language_embedding = True # active multispeaker mode config.model_args.use_speaker_embedding = True config.use_speaker_embedding = True -# config.num_speakers=1 -# config.model_args.num_speakers=1 +# deactivate multispeaker d-vec mode config.model_args.use_d_vector_file = False config.use_d_vector_file = False + +# duration predictor +config.model_args.use_sdp = False +config.use_sdp = False + # active language sampler config.use_language_weighted_sampler = True From ffc269eaf4b679d92624e7513c24971e7f7ab54e Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Tue, 23 Nov 2021 14:50:19 +0100 Subject: [PATCH 169/220] Update docstring --- TTS/tts/models/vits.py | 18 +++++++++++++++++- TTS/tts/utils/synthesis.py | 6 ++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 6b1dd325..a9d00213 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -167,6 +167,20 @@ class VitsArgs(Coqpit): speaker_encoder_model_path (str): Path to the file speaker encoder checkpoint file, to use for SCL. Defaults to "". + freeze_encoder (bool): + Freeze the encoder weigths during training. Defaults to False. + + freeze_DP (bool): + Freeze the duration predictor weigths during training. Defaults to False. + + freeze_PE (bool): + Freeze the posterior encoder weigths during training. Defaults to False. + + freeze_flow_encoder (bool): + Freeze the flow encoder weigths during training. Defaults to False. + + freeze_waveform_decoder (bool): + Freeze the waveform decoder weigths during training. Defaults to False. """ num_chars: int = 100 @@ -555,7 +569,8 @@ class Vits(BaseTTS): x_lengths (torch.tensor): Batch of input character sequence lengths. y (torch.tensor): Batch of input spectrograms. y_lengths (torch.tensor): Batch of input spectrogram lengths. - aux_input (dict, optional): Auxiliary inputs for multi-speaker training. Defaults to {"d_vectors": None, "speaker_ids": None}. + aux_input (dict, optional): Auxiliary inputs for multi-speaker and multi-lingual training. + Defaults to {"d_vectors": None, "speaker_ids": None, "language_ids": None}. Returns: Dict: model outputs keyed by the output name. @@ -567,6 +582,7 @@ class Vits(BaseTTS): - y_lengths: :math:`[B]` - d_vectors: :math:`[B, C, 1]` - speaker_ids: :math:`[B]` + - language_ids: :math:`[B]` """ outputs = {} sid, g, lid = self._set_cond_input(aux_input) diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 102914c5..24b747be 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -249,6 +249,12 @@ def synthesis( d_vector (torch.Tensor): d-vector for multi-speaker models in share :math:`[1, D]`. 
Defaults to None. + language_id (int): + Language ID passed to the language embedding layer in multi-langual model. Defaults to None. + + language_name (str): + Language name corresponding to the language code used by the phonemizer. Defaults to None. + backend (str): tf or torch. Defaults to "torch". """ From 2e516869a1da2463d3a395ec7fc7de05be18f4ed Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Tue, 23 Nov 2021 15:24:03 +0100 Subject: [PATCH 170/220] Fix trailing whitespace --- TTS/tts/models/vits.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index a9d00213..ac0f5d69 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -569,7 +569,7 @@ class Vits(BaseTTS): x_lengths (torch.tensor): Batch of input character sequence lengths. y (torch.tensor): Batch of input spectrograms. y_lengths (torch.tensor): Batch of input spectrogram lengths. - aux_input (dict, optional): Auxiliary inputs for multi-speaker and multi-lingual training. + aux_input (dict, optional): Auxiliary inputs for multi-speaker and multi-lingual training. Defaults to {"d_vectors": None, "speaker_ids": None, "language_ids": None}. Returns: @@ -592,7 +592,7 @@ class Vits(BaseTTS): # language embedding lang_emb = None - if self.args.use_language_embedding and lid is not None: + if hasattr(self, "emb_l"): lang_emb = self.emb_l(lid).unsqueeze(-1) x, m_p, logs_p, x_mask = self.text_encoder(x, x_lengths, lang_emb=lang_emb) From d39200e69b29b107302dcfc124a17faf0c0f6cdd Mon Sep 17 00:00:00 2001 From: Edresson Date: Tue, 23 Nov 2021 11:24:36 -0300 Subject: [PATCH 171/220] Remove torchaudio requeriment --- TTS/speaker_encoder/models/resnet.py | 26 ++++++++++++++++++++++---- TTS/tts/models/vits.py | 21 ++++++++++++--------- TTS/utils/audio.py | 12 +++++++++++- requirements.txt | 2 -- 4 files changed, 45 insertions(+), 16 deletions(-) diff --git a/TTS/speaker_encoder/models/resnet.py b/TTS/speaker_encoder/models/resnet.py index 47b6f23f..8f0a8809 100644 --- a/TTS/speaker_encoder/models/resnet.py +++ b/TTS/speaker_encoder/models/resnet.py @@ -1,7 +1,10 @@ import numpy as np import torch -import torchaudio -import torch.nn as nn +from torch import nn + +# import torchaudio + +from TTS.utils.audio import TorchSTFT from TTS.utils.io import load_fsspec @@ -110,14 +113,29 @@ class ResNetSpeakerEncoder(nn.Module): if self.use_torch_spec: self.torch_spec = torch.nn.Sequential( PreEmphasis(audio_config["preemphasis"]), - torchaudio.transforms.MelSpectrogram( + TorchSTFT( + n_fft=audio_config["fft_size"], + hop_length=audio_config["hop_length"], + win_length=audio_config["win_length"], + sample_rate=audio_config["sample_rate"], + window="hamming_window", + mel_fmin=0.0, + mel_fmax=None, + use_htk=True, + do_amp_to_db=False, + n_mels=audio_config["num_mels"], + power=2.0, + use_mel=True, + mel_norm=None + ), + '''torchaudio.transforms.MelSpectrogram( sample_rate=audio_config["sample_rate"], n_fft=audio_config["fft_size"], win_length=audio_config["win_length"], hop_length=audio_config["hop_length"], window_fn=torch.hamming_window, n_mels=audio_config["num_mels"], - ), + ),''' ) else: self.torch_spec = None diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index ac0f5d69..4eb12b3b 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -4,7 +4,7 @@ from itertools import chain from typing import Dict, List, Tuple import torch -import torchaudio +# import torchaudio from coqpit import Coqpit from torch import nn from 
torch.cuda.amp.autocast_mode import autocast @@ -395,7 +395,7 @@ class Vits(BaseTTS): if config.use_speaker_encoder_as_loss: if not config.speaker_encoder_model_path or not config.speaker_encoder_config_path: raise RuntimeError( - " [!] To use the speaker encoder loss you need to specify speaker_encoder_model_path and speaker_encoder_config_path !!" + " [!] To use the speaker consistency loss (SCL) you need to specify speaker_encoder_model_path and speaker_encoder_config_path !!" ) self.speaker_manager.init_speaker_encoder( config.speaker_encoder_model_path, config.speaker_encoder_config_path @@ -410,14 +410,17 @@ class Vits(BaseTTS): hasattr(self.speaker_encoder, "audio_config") and self.audio_config["sample_rate"] != self.speaker_encoder.audio_config["sample_rate"] ): - self.audio_transform = torchaudio.transforms.Resample( + raise RuntimeError( + " [!] To use the speaker consistency loss (SCL) you need to have the TTS model sampling rate ({}) equal to the speaker encoder sampling rate ({}) !".format(self.audio_config["sample_rate"], self.speaker_encoder.audio_config["sample_rate"]) + ) + '''self.audio_transform = torchaudio.transforms.Resample( orig_freq=self.audio_config["sample_rate"], new_freq=self.speaker_encoder.audio_config["sample_rate"], - ) - else: - self.audio_transform = None + ) + else: + self.audio_transform = None''' else: - self.audio_transform = None + # self.audio_transform = None self.speaker_encoder = None def _init_speaker_embedding(self, config): @@ -655,8 +658,8 @@ class Vits(BaseTTS): wavs_batch = torch.cat((wav_seg, o), dim=0).squeeze(1) # resample audio to speaker encoder sample_rate - if self.audio_transform is not None: - wavs_batch = self.audio_transform(wavs_batch) + '''if self.audio_transform is not None: + wavs_batch = self.audio_transform(wavs_batch)''' pred_embs = self.speaker_encoder.forward(wavs_batch, l2_norm=True) diff --git a/TTS/utils/audio.py b/TTS/utils/audio.py index e64b95e0..d650c288 100644 --- a/TTS/utils/audio.py +++ b/TTS/utils/audio.py @@ -32,6 +32,9 @@ class TorchSTFT(nn.Module): # pylint: disable=abstract-method use_mel=False, do_amp_to_db=False, spec_gain=1.0, + power=None, + use_htk=False, + mel_norm="slaney" ): super().__init__() self.n_fft = n_fft @@ -45,6 +48,9 @@ class TorchSTFT(nn.Module): # pylint: disable=abstract-method self.use_mel = use_mel self.do_amp_to_db = do_amp_to_db self.spec_gain = spec_gain + self.power = power + self.use_htk = use_htk + self.mel_norm = mel_norm self.window = nn.Parameter(getattr(torch, window)(win_length), requires_grad=False) self.mel_basis = None if use_mel: @@ -83,6 +89,10 @@ class TorchSTFT(nn.Module): # pylint: disable=abstract-method M = o[:, :, :, 0] P = o[:, :, :, 1] S = torch.sqrt(torch.clamp(M ** 2 + P ** 2, min=1e-8)) + + if self.power is not None: + S = S ** self.power + if self.use_mel: S = torch.matmul(self.mel_basis.to(x), S) if self.do_amp_to_db: @@ -91,7 +101,7 @@ class TorchSTFT(nn.Module): # pylint: disable=abstract-method def _build_mel_basis(self): mel_basis = librosa.filters.mel( - self.sample_rate, self.n_fft, n_mels=self.n_mels, fmin=self.mel_fmin, fmax=self.mel_fmax + self.sample_rate, self.n_fft, n_mels=self.n_mels, fmin=self.mel_fmin, fmax=self.mel_fmax, htk=self.use_htk, norm=self.mel_norm ) self.mel_basis = torch.from_numpy(mel_basis).float() diff --git a/requirements.txt b/requirements.txt index cf4798b2..3ec33ceb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -26,5 +26,3 @@ unidic-lite==1.0.8 gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=2.0.0 fsspec>=2021.04.0 pyworld 
-webrtcvad -torchaudio>=0.7 From 4cd0e4eb0d3dbd4621b9b5125aeb4c2ba6427bf5 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Tue, 23 Nov 2021 15:40:03 +0100 Subject: [PATCH 172/220] Remove self.audio_config from VITS --- TTS/tts/models/vits.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 4eb12b3b..6acf2000 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -270,7 +270,6 @@ class Vits(BaseTTS): super().__init__(config) self.END2END = True - self.speaker_manager = speaker_manager self.audio_config = config["audio"] if config.__class__.__name__ == "VitsConfig": @@ -408,7 +407,7 @@ class Vits(BaseTTS): if ( hasattr(self.speaker_encoder, "audio_config") - and self.audio_config["sample_rate"] != self.speaker_encoder.audio_config["sample_rate"] + and self.config.audio["sample_rate"] != self.speaker_encoder.audio_config["sample_rate"] ): raise RuntimeError( " [!] To use the speaker consistency loss (SCL) you need to have the TTS model sampling rate ({}) equal to the speaker encoder sampling rate ({}) !".format(self.audio_config["sample_rate"], self.speaker_encoder.audio_config["sample_rate"]) From 78a23e19df22e7b3f1c7b2564c4751ce8267709d Mon Sep 17 00:00:00 2001 From: Edresson Date: Tue, 23 Nov 2021 11:43:08 -0300 Subject: [PATCH 173/220] Fix pylint checks --- TTS/tts/models/vits.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 6acf2000..cc86e119 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -412,12 +412,14 @@ class Vits(BaseTTS): raise RuntimeError( " [!] To use the speaker consistency loss (SCL) you need to have the TTS model sampling rate ({}) equal to the speaker encoder sampling rate ({}) !".format(self.audio_config["sample_rate"], self.speaker_encoder.audio_config["sample_rate"]) ) - '''self.audio_transform = torchaudio.transforms.Resample( - orig_freq=self.audio_config["sample_rate"], - new_freq=self.speaker_encoder.audio_config["sample_rate"], - ) - else: - self.audio_transform = None''' + # pylint: disable=W0101,W0105 + """ self.audio_transform = torchaudio.transforms.Resample( + orig_freq=self.audio_config["sample_rate"], + new_freq=self.speaker_encoder.audio_config["sample_rate"], + ) + else: + self.audio_transform = None + """ else: # self.audio_transform = None self.speaker_encoder = None @@ -657,8 +659,9 @@ class Vits(BaseTTS): wavs_batch = torch.cat((wav_seg, o), dim=0).squeeze(1) # resample audio to speaker encoder sample_rate - '''if self.audio_transform is not None: - wavs_batch = self.audio_transform(wavs_batch)''' + # pylint: disable=W0105 + """if self.audio_transform is not None: + wavs_batch = self.audio_transform(wavs_batch)""" pred_embs = self.speaker_encoder.forward(wavs_batch, l2_norm=True) From 09eda31a3f5e7aa5b27cc53d34437cc41fbe1f00 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Tue, 23 Nov 2021 16:00:38 +0100 Subject: [PATCH 174/220] Fix tests --- TTS/tts/models/vits.py | 2 +- .../test_vits_multilingual_train-d_vectors.py | 10 +++++----- tests/tts_tests/test_vits_multilingual_train.py | 10 +++++----- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index cc86e119..1b6d29d4 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -596,7 +596,7 @@ class Vits(BaseTTS): # language embedding lang_emb = None - if hasattr(self, "emb_l"): + if self.args.use_language_embedding and lid is not None: 
lang_emb = self.emb_l(lid).unsqueeze(-1) x, m_p, logs_p, x_mask = self.text_encoder(x, x_lengths, lang_emb=lang_emb) diff --git a/tests/tts_tests/test_vits_multilingual_train-d_vectors.py b/tests/tts_tests/test_vits_multilingual_train-d_vectors.py index f426e383..0e9827f1 100644 --- a/tests/tts_tests/test_vits_multilingual_train-d_vectors.py +++ b/tests/tts_tests/test_vits_multilingual_train-d_vectors.py @@ -10,7 +10,7 @@ config_path = os.path.join(get_tests_output_path(), "test_model_config.json") output_path = os.path.join(get_tests_output_path(), "train_outputs") -dataset_config1 = BaseDatasetConfig( +dataset_config_en = BaseDatasetConfig( name="ljspeech", meta_file_train="metadata.csv", meta_file_val="metadata.csv", @@ -18,12 +18,12 @@ dataset_config1 = BaseDatasetConfig( language="en", ) -dataset_config2 = BaseDatasetConfig( +dataset_config_pt = BaseDatasetConfig( name="ljspeech", meta_file_train="metadata.csv", meta_file_val="metadata.csv", path="tests/data/ljspeech", - language="en2", + language="pt-br", ) config = VitsConfig( @@ -43,9 +43,9 @@ config = VitsConfig( print_eval=True, test_sentences=[ ["Be a voice, not an echo.", "ljspeech-0", None, "en"], - ["Be a voice, not an echo.", "ljspeech-1", None, "en2"], + ["Be a voice, not an echo.", "ljspeech-1", None, "pt-br"], ], - datasets=[dataset_config1, dataset_config2], + datasets=[dataset_config_en, dataset_config_pt], ) # set audio config config.audio.do_trim_silence = True diff --git a/tests/tts_tests/test_vits_multilingual_train.py b/tests/tts_tests/test_vits_multilingual_train.py index 90f589d0..50cccca5 100644 --- a/tests/tts_tests/test_vits_multilingual_train.py +++ b/tests/tts_tests/test_vits_multilingual_train.py @@ -10,7 +10,7 @@ config_path = os.path.join(get_tests_output_path(), "test_model_config.json") output_path = os.path.join(get_tests_output_path(), "train_outputs") -dataset_config1 = BaseDatasetConfig( +dataset_config_en = BaseDatasetConfig( name="ljspeech", meta_file_train="metadata.csv", meta_file_val="metadata.csv", @@ -18,12 +18,12 @@ dataset_config1 = BaseDatasetConfig( language="en", ) -dataset_config2 = BaseDatasetConfig( +dataset_config_pt = BaseDatasetConfig( name="ljspeech", meta_file_train="metadata.csv", meta_file_val="metadata.csv", path="tests/data/ljspeech", - language="en2", + language="pt-br", ) config = VitsConfig( @@ -43,9 +43,9 @@ config = VitsConfig( print_eval=True, test_sentences=[ ["Be a voice, not an echo.", "ljspeech", None, "en"], - ["Be a voice, not an echo.", "ljspeech", None, "en2"], + ["Be a voice, not an echo.", "ljspeech", None, "pt-br"], ], - datasets=[dataset_config1, dataset_config2], + datasets=[dataset_config_en, dataset_config_pt], ) # set audio config config.audio.do_trim_silence = True From 8b52fb89d1bdb519983e3c1e4521704f97c70416 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Tue, 23 Nov 2021 16:25:21 +0100 Subject: [PATCH 175/220] Fix merge bug --- TTS/tts/models/vits.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 1b6d29d4..8ccbce36 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -410,7 +410,8 @@ class Vits(BaseTTS): and self.config.audio["sample_rate"] != self.speaker_encoder.audio_config["sample_rate"] ): raise RuntimeError( - " [!] To use the speaker consistency loss (SCL) you need to have the TTS model sampling rate ({}) equal to the speaker encoder sampling rate ({}) !".format(self.audio_config["sample_rate"], self.speaker_encoder.audio_config["sample_rate"]) + ' [!] 
To use the speaker consistency loss (SCL) you need to have matching sample rates between the TTS model ({}) and the speaker encoder ({})!' + .format(self.config.audio["sample_rate"], self.speaker_encoder.audio_config["sample_rate"]) ) # pylint: disable=W0101,W0105 """ self.audio_transform = torchaudio.transforms.Resample( From 54e33bff61d57e868d7be3b20c9bd08b27c31f84 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Tue, 23 Nov 2021 16:29:13 +0100 Subject: [PATCH 176/220] Make a multilingual test use chars --- tests/tts_tests/test_vits_multilingual_train-d_vectors.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/tts_tests/test_vits_multilingual_train-d_vectors.py b/tests/tts_tests/test_vits_multilingual_train-d_vectors.py index 0e9827f1..1ca57d93 100644 --- a/tests/tts_tests/test_vits_multilingual_train-d_vectors.py +++ b/tests/tts_tests/test_vits_multilingual_train-d_vectors.py @@ -32,9 +32,7 @@ config = VitsConfig( num_loader_workers=0, num_eval_loader_workers=0, text_cleaner="english_cleaners", - use_phonemes=True, - use_espeak_phonemes=True, - phoneme_language="en-us", + use_phonemes=False, phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", run_eval=True, test_delay_epochs=-1, From 67dda0abe1f3c74aca84f7bd08bebeaa927827ab Mon Sep 17 00:00:00 2001 From: Edresson Date: Tue, 23 Nov 2021 13:37:14 -0300 Subject: [PATCH 177/220] Add the SCL resample TODO --- TTS/tts/models/vits.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 8ccbce36..c1c29980 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -409,6 +409,7 @@ class Vits(BaseTTS): hasattr(self.speaker_encoder, "audio_config") and self.config.audio["sample_rate"] != self.speaker_encoder.audio_config["sample_rate"] ): + # TODO: change this with torchaudio Resample raise RuntimeError( ' [!] To use the speaker consistency loss (SCL) you need to have matching sample rates between the TTS model ({}) and the speaker encoder ({})!' 
.format(self.config.audio["sample_rate"], self.speaker_encoder.audio_config["sample_rate"]) @@ -418,8 +419,8 @@ class Vits(BaseTTS): orig_freq=self.audio_config["sample_rate"], new_freq=self.speaker_encoder.audio_config["sample_rate"], ) - else: - self.audio_transform = None + else: + self.audio_transform = None """ else: # self.audio_transform = None From e068fab6b284296b1884c1be17a03a2f735d2642 Mon Sep 17 00:00:00 2001 From: Edresson Date: Fri, 26 Nov 2021 17:14:29 -0300 Subject: [PATCH 178/220] Add find unique phonemes unit tests --- tests/aux_tests/test_find_unique_phonemes.py | 84 ++++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 tests/aux_tests/test_find_unique_phonemes.py diff --git a/tests/aux_tests/test_find_unique_phonemes.py b/tests/aux_tests/test_find_unique_phonemes.py new file mode 100644 index 00000000..33fad9ba --- /dev/null +++ b/tests/aux_tests/test_find_unique_phonemes.py @@ -0,0 +1,84 @@ +import os +import unittest + +import torch + +from tests import get_tests_output_path, run_cli +from TTS.config.shared_configs import BaseDatasetConfig +from TTS.tts.configs.vits_config import VitsConfig + +torch.manual_seed(1) + +config_path = os.path.join(get_tests_output_path(), "test_model_config.json") + +dataset_config_en = BaseDatasetConfig( + name="ljspeech", + meta_file_train="metadata.csv", + meta_file_val="metadata.csv", + path="tests/data/ljspeech", + language="en", +) + +dataset_config_pt = BaseDatasetConfig( + name="ljspeech", + meta_file_train="metadata.csv", + meta_file_val="metadata.csv", + path="tests/data/ljspeech", + language="pt-br", +) + +# pylint: disable=protected-access +class TestFindUniquePhonemes(unittest.TestCase): + @staticmethod + def test_espeak_phonemes(): + # prepare the config + config = VitsConfig( + batch_size=2, + eval_batch_size=2, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + use_espeak_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + datasets=[dataset_config_en, dataset_config_pt], + ) + config.save_json(config_path) + + # run test + run_cli( + f'CUDA_VISIBLE_DEVICES="" python TTS/bin/find_unique_phonemes.py --config_path "{config_path}"' + ) + + @staticmethod + def test_no_espeak_phonemes(): + # prepare the config + config = VitsConfig( + batch_size=2, + eval_batch_size=2, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + use_espeak_phonemes=False, + phoneme_language="en-us", + phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + datasets=[dataset_config_en, dataset_config_pt], + ) + config.save_json(config_path) + + # run test + run_cli( + f'CUDA_VISIBLE_DEVICES="" python TTS/bin/find_unique_phonemes.py --config_path "{config_path}"' + ) From a57ddfb4ec789118e8a15058214cd51953e91553 Mon Sep 17 00:00:00 2001 From: Edresson Date: Fri, 26 Nov 2021 17:14:58 -0300 Subject: [PATCH 179/220] Add remove silence vad script Unit test --- .../test_remove_silence_vad_script.py | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 tests/aux_tests/test_remove_silence_vad_script.py diff --git a/tests/aux_tests/test_remove_silence_vad_script.py b/tests/aux_tests/test_remove_silence_vad_script.py new file mode 100644 index 00000000..c934e065 --- /dev/null +++ 
b/tests/aux_tests/test_remove_silence_vad_script.py @@ -0,0 +1,29 @@ +import os +import unittest + +import torch + +from tests import get_tests_input_path, get_tests_output_path, run_cli + +torch.manual_seed(1) + +# pylint: disable=protected-access +class TestRemoveSilenceVAD(unittest.TestCase): + @staticmethod + def test(): + # set paths + wav_path = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs") + output_path = os.path.join(get_tests_output_path(), "output_wavs_removed_silence/") + output_resample_path = os.path.join(get_tests_output_path(), "output_ljspeech_16khz/") + + # resample audios + run_cli( + f'CUDA_VISIBLE_DEVICES="" python TTS/bin/resample.py --input_dir "{wav_path}" --output_dir "{output_resample_path}" --output_sr 16000' + ) + + # run test + run_cli( + f'CUDA_VISIBLE_DEVICES="" python TTS/bin/remove_silence_using_vad.py --input_dir "{output_resample_path}" --output_dir "{output_path}"' + ) + run_cli(f'rm -rf "{output_resample_path}"') + run_cli(f'rm -rf "{output_path}"') From 818dc4ccd8081af9e2ae5af93a75900ce3cc31eb Mon Sep 17 00:00:00 2001 From: Edresson Date: Fri, 26 Nov 2021 17:42:26 -0300 Subject: [PATCH 180/220] Add Docstring for TorchSTFT --- TTS/utils/audio.py | 56 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/TTS/utils/audio.py b/TTS/utils/audio.py index d650c288..10c9ec7e 100644 --- a/TTS/utils/audio.py +++ b/TTS/utils/audio.py @@ -16,6 +16,60 @@ class TorchSTFT(nn.Module): # pylint: disable=abstract-method """Some of the audio processing funtions using Torch for faster batch processing. TODO: Merge this with audio.py + + Args: + + n_fft (int): + FFT window size for STFT. + + hop_length (int): + number of frames between STFT columns. + + win_length (int, optional): + STFT window length. + + pad_wav (bool, optional): + If True pad the audio with (n_fft - hop_length) / 2). Defaults to False. + + window (str, optional): + The name of a function to create a window tensor that is applied/multiplied to each frame/window. Defaults to "hann_window" + + sample_rate (int, optional): + target audio sampling rate. Defaults to None. + + mel_fmin (int, optional): + minimum filter frequency for computing melspectrograms. Defaults to None. + + mel_fmax (int, optional): + maximum filter frequency for computing melspectrograms. Defaults to None. + + n_mels (int, optional): + number of melspectrogram dimensions. Defaults to None. + + use_mel (bool, optional): + If True compute the melspectrograms otherwise. Defaults to False. + + do_amp_to_db_linear (bool, optional): + enable/disable amplitude to dB conversion of linear spectrograms. Defaults to False. + + spec_gain (float, optional): + gain applied when converting amplitude to DB. Defaults to 1.0. + + power (float, optional): + Exponent for the magnitude spectrogram, e.g., 1 for energy, 2 for power, etc. Defaults to None. + + use_htk (bool, optional): + Use HTK formula in mel filter instead of Slaney. + + mel_norm (None, 'slaney', or number, optional): + If 'slaney', divide the triangular mel weights by the width of the mel band + (area normalization). + + If numeric, use `librosa.util.normalize` to normalize each filter by to unit l_p norm. + See `librosa.util.normalize` for a full description of supported norm values + (including `+-np.inf`). + + Otherwise, leave all the triangles aiming for a peak value of 1.0. Defaults to "slaney". """ def __init__( @@ -177,7 +231,7 @@ class AudioProcessor(object): minimum filter frequency for computing melspectrograms. 
Defaults to None. mel_fmax (int, optional): - maximum filter frequency for computing melspectrograms.. Defaults to None. + maximum filter frequency for computing melspectrograms. Defaults to None. spec_gain (int, optional): gain applied when converting amplitude to DB. Defaults to 20. From 6b03943526a4823ce70fc2bca164642d3f76178c Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Sat, 27 Nov 2021 22:55:21 +0100 Subject: [PATCH 181/220] Move multilingual logic out of the trainer --- TTS/bin/train_tts.py | 12 +++++- TTS/trainer.py | 16 -------- TTS/tts/configs/vits_config.py | 8 ++++ TTS/tts/models/__init__.py | 11 ++++- TTS/tts/models/base_tts.py | 13 +++++- TTS/tts/models/vits.py | 48 ++++++++-------------- TTS/tts/utils/languages.py | 73 ++++++++++++---------------------- 7 files changed, 82 insertions(+), 99 deletions(-) diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index a543a947..5330649a 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -5,6 +5,7 @@ from TTS.trainer import Trainer, TrainingArgs from TTS.tts.datasets import load_tts_samples from TTS.tts.models import setup_model from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.languages import LanguageManager from TTS.utils.audio import AudioProcessor @@ -60,8 +61,17 @@ def main(): else: speaker_manager = None + if hasattr(config, "use_language_embedding") and config.use_language_embedding: + language_manager = LanguageManager(config=config) + if hasattr(config, "model_args"): + config.model_args.num_languages = language_manager.num_languages + else: + config.num_languages = language_manager.num_languages + else: + language_manager = None + # init the model from config - model = setup_model(config, speaker_manager) + model = setup_model(config, speaker_manager, language_manager) # init the trainer and 🚀 trainer = Trainer( diff --git a/TTS/trainer.py b/TTS/trainer.py index b9026c8e..7bffb386 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -260,22 +260,6 @@ class Trainer: else: self.run_get_model(self.config, get_model) - if hasattr(self.model, "init_multilingual"): - self.model.init_multilingual(self.config, self.train_samples + self.eval_samples) - config = self.config.model_args if hasattr(self.config, "model_args") else self.config - # save speakers json - if config.use_language_embedding and self.model.language_manager.num_languages > 1: - self.model.language_manager.save_language_ids_to_file( - os.path.join(self.output_path, "language_ids.json") - ) - if hasattr(self.config, "model_args"): - self.config.model_args["num_languages"] = self.model.language_manager.num_languages - else: - self.config.num_languages = self.model.language_manager.num_languages - - # update config file - copy_model_files(self.config, self.output_path) - # setup criterion self.criterion = self.get_criterion(self.model) diff --git a/TTS/tts/configs/vits_config.py b/TTS/tts/configs/vits_config.py index 178992a7..32a69bca 100644 --- a/TTS/tts/configs/vits_config.py +++ b/TTS/tts/configs/vits_config.py @@ -85,6 +85,12 @@ class VitsConfig(BaseTTSConfig): test_sentences (List[List]): List of sentences with speaker and language information to be used for testing. + language_ids_file (str): + Path to the language ids file. + + use_language_embedding (bool): + If true, language embedding is used. Defaults to `False`. + Note: Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the inherited parameters. 
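For reference, a minimal sketch of how the new options above fit together once this patch is applied; the dataset paths are placeholders and the formatter names are only examples, not values taken from the repo:

from TTS.config.shared_configs import BaseDatasetConfig
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.utils.languages import LanguageManager

config = VitsConfig(
    use_language_embedding=True,
    datasets=[
        BaseDatasetConfig(name="ljspeech", path="/data/en", meta_file_train="metadata.csv", language="en"),
        BaseDatasetConfig(name="brspeech", path="/data/pt", meta_file_train="metadata.csv", language="pt-br"),
    ],
)
config.model_args.use_language_embedding = True

# LanguageManager now derives the language-id mapping from the per-dataset `language` tags,
# here {"en": 0, "pt-br": 1}, instead of the trainer building it from the loaded samples.
language_manager = LanguageManager(config=config)
config.model_args.num_languages = language_manager.num_languages

# At training start, BaseTTS.on_init_start() saves this mapping to `language_ids.json` and
# points `config.language_ids_file` at it, so restarts and inference reload the same ids.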
@@ -147,6 +153,8 @@ class VitsConfig(BaseTTSConfig): use_speaker_embedding: bool = False speakers_file: str = None speaker_embedding_channels: int = 256 + language_ids_file: str = None + use_language_embedding: bool = False # use d-vectors use_d_vector_file: bool = False diff --git a/TTS/tts/models/__init__.py b/TTS/tts/models/__init__.py index 780f22cd..acd89110 100644 --- a/TTS/tts/models/__init__.py +++ b/TTS/tts/models/__init__.py @@ -2,7 +2,11 @@ from TTS.tts.utils.text.symbols import make_symbols, parse_symbols from TTS.utils.generic_utils import find_module -def setup_model(config, speaker_manager: "SpeakerManager" = None): +def setup_model( + config, + speaker_manager: "SpeakerManager" = None, + language_manager: "LanguageManager" = None + ): print(" > Using model: {}".format(config.model)) # fetch the right model implementation. if "base_model" in config and config["base_model"] is not None: @@ -31,7 +35,10 @@ def setup_model(config, speaker_manager: "SpeakerManager" = None): config.model_params.num_chars = num_chars if "model_args" in config: config.model_args.num_chars = num_chars - model = MyModel(config, speaker_manager=speaker_manager) + if config.model.lower() in ["vits"]: # If model supports multiple languages + model = MyModel(config, speaker_manager=speaker_manager, language_manager=language_manager) + else: + model = MyModel(config, speaker_manager=speaker_manager) return model diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index 707fc9c3..14bc9180 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -419,8 +419,7 @@ class BaseTTS(BaseModel): return test_figures, test_audios def on_init_start(self, trainer): - """Save the speaker.json at the beginning of the training. And update the config.json with the - speakers.json file path.""" + """Save the speaker.json and language_ids.json at the beginning of the training. 
Also update both paths.""" if self.speaker_manager is not None: output_path = os.path.join(trainer.output_path, "speakers.json") self.speaker_manager.save_speaker_ids_to_file(output_path) @@ -431,3 +430,13 @@ class BaseTTS(BaseModel): trainer.config.save_json(os.path.join(trainer.output_path, "config.json")) print(f" > `speakers.json` is saved to {output_path}.") print(" > `speakers_file` is updated in the config.json.") + + if hasattr(self, "language_manager") and self.language_manager is not None: + output_path = os.path.join(trainer.output_path, "language_ids.json") + self.language_manager.save_language_ids_to_file(output_path) + trainer.config.language_ids_file = output_path + if hasattr(trainer.config, "model_args"): + trainer.config.model_args.language_ids_file = output_path + trainer.config.save_json(os.path.join(trainer.output_path, "config.json")) + print(f" > `language_ids.json` is saved to {output_path}.") + print(" > `language_ids_file` is updated in the config.json.") diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index c1c29980..ca110eb0 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -16,8 +16,8 @@ from TTS.tts.layers.vits.networks import PosteriorEncoder, ResidualCouplingBlock from TTS.tts.layers.vits.stochastic_duration_predictor import StochasticDurationPredictor from TTS.tts.models.base_tts import BaseTTS from TTS.tts.utils.helpers import generate_path, maximum_path, rand_segments, segment, sequence_mask -from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.languages import LanguageManager +from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.visual import plot_alignment from TTS.utils.trainer_utils import get_optimizer, get_scheduler @@ -158,6 +158,9 @@ class VitsArgs(Coqpit): num_languages (int): Number of languages for the language embedding layer. Defaults to 0. + language_ids_file (str): + Path to the language mapping file for the Language Manager. Defaults to None. + use_speaker_encoder_as_loss (bool): Enable/Disable Speaker Consistency Loss (SCL). Defaults to False. @@ -225,6 +228,7 @@ class VitsArgs(Coqpit): use_language_embedding: bool = False embedded_language_dim: int = 4 num_languages: int = 0 + language_ids_file: str = None use_speaker_encoder_as_loss: bool = False speaker_encoder_config_path: str = "" speaker_encoder_model_path: str = "" @@ -265,13 +269,18 @@ class Vits(BaseTTS): # pylint: disable=dangerous-default-value - def __init__(self, config: Coqpit, speaker_manager: SpeakerManager = None): + def __init__( + self, + config: Coqpit, + speaker_manager: SpeakerManager = None, + language_manager: LanguageManager = None, + ): super().__init__(config) self.END2END = True self.speaker_manager = speaker_manager - self.audio_config = config["audio"] + self.language_manager = language_manager if config.__class__.__name__ == "VitsConfig": # loading from VitsConfig if "num_chars" not in config: @@ -443,43 +452,20 @@ class Vits(BaseTTS): self.speaker_manager = SpeakerManager(d_vectors_file_path=config.d_vector_file) self.embedded_speaker_dim = config.d_vector_dim - if config.use_speaker_encoder_as_loss: - if not config.speaker_encoder_model_path or not config.speaker_encoder_config_path: - raise RuntimeError(" [!] 
To use the speaker encoder loss you need to specify speaker_encoder_model_path and speaker_encoder_config_path !!") - self.speaker_manager.init_speaker_encoder(config.speaker_encoder_model_path, config.speaker_encoder_config_path) - self.speaker_encoder = self.speaker_manager.speaker_encoder.train() - for param in self.speaker_encoder.parameters(): - param.requires_grad = False - - print(" > External Speaker Encoder Loaded !!") - - if hasattr(self.speaker_encoder, "audio_config") and self.audio_config["sample_rate"] != self.speaker_encoder.audio_config["sample_rate"]: - self.audio_transform = torchaudio.transforms.Resample(orig_freq=self.audio_config["sample_rate"], new_freq=self.speaker_encoder.audio_config["sample_rate"]) - else: - self.audio_transform = None - else: - self.audio_transform = None - self.speaker_encoder = None - - def init_multilingual(self, config: Coqpit, data: List = None): + def init_multilingual(self, config: Coqpit): """Initialize multilingual modules of a model. Args: config (Coqpit): Model configuration. - data (List, optional): Dataset items to infer number of speakers. Defaults to None. """ if hasattr(config, "model_args"): config = config.model_args - # init language manager - self.language_manager = LanguageManager(config, data=data) - # init language embedding layer - if config.use_language_embedding: - if config.num_languages > 0 and self.language_manager.num_languages == 0: - self.num_languages = config.num_languages - else: - self.num_languages = self.language_manager.num_languages + if config.language_ids_file is not None: + self.language_manager = LanguageManager(language_ids_file_path=config.language_ids_file) + if config.use_language_embedding and self.language_manager: + self.num_languages = self.language_manager.num_languages self.embedded_language_dim = config.embedded_language_dim self.emb_l = nn.Embedding(self.num_languages, self.embedded_language_dim) torch.nn.init.xavier_uniform_(self.emb_l.weight) diff --git a/TTS/tts/utils/languages.py b/TTS/tts/utils/languages.py index 5bacc259..451b10f9 100644 --- a/TTS/tts/utils/languages.py +++ b/TTS/tts/utils/languages.py @@ -1,6 +1,6 @@ import json import os -from typing import Dict, List, Tuple +from typing import Dict, List import fsspec import numpy as np @@ -14,11 +14,13 @@ class LanguageManager: in a way that can be queried by language. Args: - language_id_file_path (str, optional): Path to the metafile that maps language names to ids used by + language_ids_file_path (str, optional): Path to the metafile that maps language names to ids used by TTS models. Defaults to "". + config (Coqpit, optional): Coqpit config that contains the language information in the datasets filed. + Defaults to None. 
Examples: - >>> manager = LanguageManager(language_id_file_path=language_id_file_path) + >>> manager = LanguageManager(language_ids_file_path=language_ids_file_path) >>> language_id_mapper = manager.language_ids """ @@ -26,10 +28,14 @@ class LanguageManager: def __init__( self, - language_id_file_path: str = "", + language_ids_file_path: str = "", + config: Coqpit = None, ): - if language_id_file_path: - self.set_language_ids_from_file(language_id_file_path) + if language_ids_file_path: + self.set_language_ids_from_file(language_ids_file_path) + + if config: + self.set_language_ids_from_config(config) @staticmethod def _load_json(json_file_path: str) -> Dict: @@ -50,27 +56,30 @@ class LanguageManager: return list(self.language_id_mapping.keys()) @staticmethod - def parse_languages_from_data(items: list) -> Tuple[Dict, int]: - """Parse language IDs from data samples retured by `load_meta_data()`. + def parse_language_ids_from_config(c: Coqpit) -> Dict: + """Set language id from config. Args: - items (list): Data sampled returned by `load_meta_data()`. + c (Coqpit): Config Returns: - Tuple[Dict, int]: language IDs and number of languages. + Tuple[Dict, int]: Language ID mapping and the number of languages. """ - languages = sorted({item[3] for item in items}) - language_ids = {name: i for i, name in enumerate(languages)} - num_languages = len(language_ids) - return language_ids, num_languages + languages = set({}) + for dataset in c.datasets: + if "language" in dataset: + languages.add(dataset["language"]) + else: + raise ValueError(f"Dataset {dataset['name']} has no language specified.") + return {name: i for i, name in enumerate(sorted(list(languages)))} - def set_language_ids_from_data(self, items: List) -> None: - """Set language IDs from data samples. + def set_language_ids_from_config(self, c: Coqpit) -> None: + """Set language IDs from config samples. Args: items (List): Data sampled returned by `load_meta_data()`. """ - self.language_id_mapping, _ = self.parse_languages_from_data(items) + self.language_id_mapping = self.parse_language_ids_from_config(c) def set_language_ids_from_file(self, file_path: str) -> None: """Load language ids from a json file. @@ -102,36 +111,6 @@ def _set_file_path(path): return None -def get_language_manager(c: Coqpit, data: List = None, restore_path: str = None) -> LanguageManager: - """Initiate a `LanguageManager` instance by the provided config. - - Args: - c (Coqpit): Model configuration. - restore_path (str): Path to a previous training folder. - data (List): Data sampled returned by `load_meta_data()`. Defaults to None. - out_path (str, optional): Save the generated language IDs to a output path. Defaults to None. - - Returns: - SpeakerManager: initialized and ready to use instance. - """ - language_manager = LanguageManager() - if c.use_language_embedding: - if data is not None: - language_manager.set_language_ids_from_data(data) - if restore_path: - language_file = _set_file_path(restore_path) - # restoring language manager from a previous run. 
- if language_file: - language_manager.set_language_ids_from_file(language_file) - if language_manager.num_languages > 0: - print( - " > Language manager is loaded with {} languages: {}".format( - language_manager.num_languages, ", ".join(language_manager.language_names) - ) - ) - return language_manager - - def get_language_weighted_sampler(items: list): language_names = np.array([item[3] for item in items]) unique_language_names = np.unique(language_names).tolist() From 9cfbacc622cdbaa752d0036b1b523190f7dda253 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Sat, 27 Nov 2021 23:41:55 +0100 Subject: [PATCH 182/220] Fix trailing space --- TTS/tts/configs/vits_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/tts/configs/vits_config.py b/TTS/tts/configs/vits_config.py index 32a69bca..36c948af 100644 --- a/TTS/tts/configs/vits_config.py +++ b/TTS/tts/configs/vits_config.py @@ -87,7 +87,7 @@ class VitsConfig(BaseTTSConfig): language_ids_file (str): Path to the language ids file. - + use_language_embedding (bool): If true, language embedding is used. Defaults to `False`. From 74cedfac3855eb46ddfdc3d82327c04dcf06c90b Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Sun, 28 Nov 2021 00:23:55 +0100 Subject: [PATCH 183/220] Revert init multispeaker change --- TTS/tts/models/base_tts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index 14bc9180..f1fdbd33 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -102,7 +102,7 @@ class BaseTTS(BaseModel): config.d_vector_dim if "d_vector_dim" in config and config.d_vector_dim is not None else 512 ) # init speaker embedding layer - if config.use_speaker_embedding: + if config.use_speaker_embedding and not config.use_d_vector_file:: print(" > Init speaker_embedding layer.") self.speaker_embedding = nn.Embedding(self.num_speakers, self.embedded_speaker_dim) self.speaker_embedding.weight.data.normal_(0, 0.3) From 2bbcb558dc74950f9555d705b716328479e3e0ac Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Sun, 28 Nov 2021 00:48:53 +0100 Subject: [PATCH 184/220] Prevent weighted sampler use when num_gpus > 1 --- TTS/tts/models/base_tts.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index f1fdbd33..1f92bfc7 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -102,7 +102,7 @@ class BaseTTS(BaseModel): config.d_vector_dim if "d_vector_dim" in config and config.d_vector_dim is not None else 512 ) # init speaker embedding layer - if config.use_speaker_embedding and not config.use_d_vector_file:: + if config.use_speaker_embedding and not config.use_d_vector_file: print(" > Init speaker_embedding layer.") self.speaker_embedding = nn.Embedding(self.num_speakers, self.embedded_speaker_dim) self.speaker_embedding.weight.data.normal_(0, 0.3) @@ -337,8 +337,15 @@ class BaseTTS(BaseModel): if config.compute_f0: dataset.pitch_extractor.load_pitch_stats(config.get("f0_cache_path", None)) + + # sampler for DDP sampler = DistributedSampler(dataset) if num_gpus > 1 else None + + # Weighted samplers + assert not (num_gpus > 1 and getattr(config, "use_language_weighted_sampler", False)), "language_weighted_sampler is not supported with DistributedSampler" + assert not (num_gpus > 1 and getattr(config, "use_speaker_weighted_sampler", False)), "speaker_weighted_sampler is not supported with DistributedSampler" + if sampler is None: if 
getattr(config, "use_language_weighted_sampler", False): print(" > Using Language weighted sampler") From 6f01eed6725d23c7835c859880e145612f69bb18 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Wed, 1 Dec 2021 23:36:29 +0100 Subject: [PATCH 185/220] Add test for language_weighted_sampler --- tests/data_tests/test_samplers.py | 53 +++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 tests/data_tests/test_samplers.py diff --git a/tests/data_tests/test_samplers.py b/tests/data_tests/test_samplers.py new file mode 100644 index 00000000..11e9082f --- /dev/null +++ b/tests/data_tests/test_samplers.py @@ -0,0 +1,53 @@ +from torch.utils.data import RandomSampler +from TTS.tts.datasets import load_tts_samples +from TTS.config.shared_configs import BaseDatasetConfig +from TTS.tts.utils.languages import get_language_weighted_sampler + +import functools + +dataset_config_en = BaseDatasetConfig( + name="ljspeech", + meta_file_train="metadata.csv", + meta_file_val="metadata.csv", + path="tests/data/ljspeech", + language="en", +) + +dataset_config_pt = BaseDatasetConfig( + name="ljspeech", + meta_file_train="metadata.csv", + meta_file_val="metadata.csv", + path="tests/data/ljspeech", + language="pt-br", +) + +# Adding the EN samples twice to create an unbalanced dataset +train_samples, eval_samples = load_tts_samples( + [dataset_config_en, dataset_config_en, dataset_config_pt], + eval_split=True +) + +def is_balanced(lang_1, lang_2): + return 0.9 < lang_1/lang_2 < 1.1 + +random_sampler = RandomSampler(train_samples) +ids = functools.reduce(lambda a, b: a + b, [list(random_sampler) for i in range(100)]) +en, pt = 0, 0 +for id in ids: + if train_samples[id][3] == 'en': + en += 1 + else: + pt += 1 + +assert not is_balanced(en, pt), "Random sampler is supposed to be unbalanced" + +weighted_sampler = get_language_weighted_sampler(train_samples) +ids = functools.reduce(lambda a, b: a + b, [list(weighted_sampler) for i in range(100)]) +en, pt = 0, 0 +for id in ids: + if train_samples[id][3] == 'en': + en += 1 + else: + pt += 1 + +assert is_balanced(en, pt), "Weighted sampler is supposed to be balanced" \ No newline at end of file From 8b3769c95751f9f1a6e74780ff470864a487f169 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Wed, 1 Dec 2021 23:48:38 +0100 Subject: [PATCH 186/220] Fix seed in test_samplers to avoid random fails --- tests/data_tests/test_samplers.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/tests/data_tests/test_samplers.py b/tests/data_tests/test_samplers.py index 11e9082f..5e4e4151 100644 --- a/tests/data_tests/test_samplers.py +++ b/tests/data_tests/test_samplers.py @@ -1,10 +1,12 @@ -from torch.utils.data import RandomSampler from TTS.tts.datasets import load_tts_samples from TTS.config.shared_configs import BaseDatasetConfig from TTS.tts.utils.languages import get_language_weighted_sampler - +import torch import functools +# Fixing random state to avoid random fails +torch.manual_seed(0) + dataset_config_en = BaseDatasetConfig( name="ljspeech", meta_file_train="metadata.csv", @@ -23,18 +25,18 @@ dataset_config_pt = BaseDatasetConfig( # Adding the EN samples twice to create an unbalanced dataset train_samples, eval_samples = load_tts_samples( - [dataset_config_en, dataset_config_en, dataset_config_pt], + [dataset_config_en, dataset_config_en, dataset_config_pt], eval_split=True ) def is_balanced(lang_1, lang_2): - return 0.9 < lang_1/lang_2 < 1.1 + return 0.85 < lang_1/lang_2 < 1.2 -random_sampler = 
RandomSampler(train_samples) +random_sampler = torch.utils.data.RandomSampler(train_samples) ids = functools.reduce(lambda a, b: a + b, [list(random_sampler) for i in range(100)]) en, pt = 0, 0 -for id in ids: - if train_samples[id][3] == 'en': +for index in ids: + if train_samples[index][3] == 'en': en += 1 else: pt += 1 @@ -44,10 +46,10 @@ assert not is_balanced(en, pt), "Random sampler is supposed to be unbalanced" weighted_sampler = get_language_weighted_sampler(train_samples) ids = functools.reduce(lambda a, b: a + b, [list(weighted_sampler) for i in range(100)]) en, pt = 0, 0 -for id in ids: - if train_samples[id][3] == 'en': +for index in ids: + if train_samples[index][3] == 'en': en += 1 else: pt += 1 -assert is_balanced(en, pt), "Weighted sampler is supposed to be balanced" \ No newline at end of file +assert is_balanced(en, pt), "Weighted sampler is supposed to be balanced" From a564eb9f5420cb8607bf43787f7ff77753f631a5 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Wed, 8 Dec 2021 19:34:36 +0100 Subject: [PATCH 187/220] Add support for multi-lingual models in CLI --- TTS/bin/synthesize.py | 30 +++++++++++++++++++-- TTS/tts/utils/languages.py | 1 + TTS/utils/synthesizer.py | 54 ++++++++++++++++++++++++++++++++++++-- 3 files changed, 81 insertions(+), 4 deletions(-) diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index bf7de798..509b3da6 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -152,12 +152,19 @@ If you don't specify any models, then it uses LJSpeech based English model. # args for multi-speaker synthesis parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None) + parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None) parser.add_argument( "--speaker_idx", type=str, help="Target speaker ID for a multi-speaker TTS model.", default=None, ) + parser.add_argument( + "--language_idx", + type=str, + help="Target language ID for a multi-lingual TTS model.", + default=None, + ) parser.add_argument( "--speaker_wav", nargs="+", @@ -173,6 +180,14 @@ If you don't specify any models, then it uses LJSpeech based English model. const=True, default=False, ) + parser.add_argument( + "--list_language_idxs", + help="List available language ids for the defined multi-lingual model.", + type=str2bool, + nargs="?", + const=True, + default=False, + ) # aux args parser.add_argument( "--save_spectogram", @@ -184,7 +199,7 @@ If you don't specify any models, then it uses LJSpeech based English model. args = parser.parse_args() # print the description if either text or list_models is not set - if args.text is None and not args.list_models and not args.list_speaker_idxs: + if args.text is None and not args.list_models and not args.list_speaker_idxs and not args.list_language_idxs: parser.parse_args(["-h"]) # load model manager @@ -194,6 +209,7 @@ If you don't specify any models, then it uses LJSpeech based English model. model_path = None config_path = None speakers_file_path = None + language_ids_file_path = None vocoder_path = None vocoder_config_path = None encoder_path = None @@ -217,6 +233,7 @@ If you don't specify any models, then it uses LJSpeech based English model. 
model_path = args.model_path config_path = args.config_path speakers_file_path = args.speakers_file_path + language_ids_file_path = args.language_ids_file_path if args.vocoder_path is not None: vocoder_path = args.vocoder_path @@ -231,6 +248,7 @@ If you don't specify any models, then it uses LJSpeech based English model. model_path, config_path, speakers_file_path, + language_ids_file_path, vocoder_path, vocoder_config_path, encoder_path, @@ -246,6 +264,14 @@ If you don't specify any models, then it uses LJSpeech based English model. print(synthesizer.tts_model.speaker_manager.speaker_ids) return + # query langauge ids of a multi-lingual model. + if args.list_language_idxs: + print( + " > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model." + ) + print(synthesizer.tts_model.language_manager.language_id_mapping) + return + # check the arguments against a multi-speaker model. if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav): print( @@ -258,7 +284,7 @@ If you don't specify any models, then it uses LJSpeech based English model. print(" > Text: {}".format(args.text)) # kick it - wav = synthesizer.tts(args.text, args.speaker_idx, args.speaker_wav, args.gst_style) + wav = synthesizer.tts(args.text, args.speaker_idx, args.language_idx, args.speaker_wav) # save the results print(" > Saving output to {}".format(args.out_path)) diff --git a/TTS/tts/utils/languages.py b/TTS/tts/utils/languages.py index 451b10f9..fc7eec57 100644 --- a/TTS/tts/utils/languages.py +++ b/TTS/tts/utils/languages.py @@ -31,6 +31,7 @@ class LanguageManager: language_ids_file_path: str = "", config: Coqpit = None, ): + self.language_id_mapping = {} if language_ids_file_path: self.set_language_ids_from_file(language_ids_file_path) diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 043c4982..ea8ce6d1 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -8,6 +8,7 @@ import torch from TTS.config import load_config from TTS.tts.models import setup_model as setup_tts_model from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.languages import LanguageManager # pylint: disable=unused-wildcard-import # pylint: disable=wildcard-import @@ -23,6 +24,7 @@ class Synthesizer(object): tts_checkpoint: str, tts_config_path: str, tts_speakers_file: str = "", + tts_languages_file: str = "", vocoder_checkpoint: str = "", vocoder_config: str = "", encoder_checkpoint: str = "", @@ -52,6 +54,7 @@ class Synthesizer(object): self.tts_checkpoint = tts_checkpoint self.tts_config_path = tts_config_path self.tts_speakers_file = tts_speakers_file + self.tts_languages_file = tts_languages_file self.vocoder_checkpoint = vocoder_checkpoint self.vocoder_config = vocoder_config self.encoder_checkpoint = encoder_checkpoint @@ -63,6 +66,9 @@ class Synthesizer(object): self.speaker_manager = None self.num_speakers = 0 self.tts_speakers = {} + self.language_manager = None + self.num_languages = 0 + self.tts_languages = {} self.d_vector_dim = 0 self.seg = self._get_segmenter("en") self.use_cuda = use_cuda @@ -110,8 +116,13 @@ class Synthesizer(object): self.ap = AudioProcessor(verbose=False, **self.tts_config.audio) speaker_manager = self._init_speaker_manager() + language_manager = self._init_language_manager() - self.tts_model = setup_tts_model(config=self.tts_config, speaker_manager=speaker_manager) + self.tts_model = setup_tts_model( + config=self.tts_config, + speaker_manager=speaker_manager, + 
language_manager=language_manager, + ) self.tts_model.load_checkpoint(self.tts_config, tts_checkpoint, eval=True) if use_cuda: self.tts_model.cuda() @@ -133,6 +144,17 @@ class Synthesizer(object): speaker_manager = SpeakerManager(d_vectors_file_path=self.tts_config.d_vector_file) return speaker_manager + def _init_language_manager(self): + """Initialize the LanguageManager""" + # setup if multi-lingual settings are in the global model config + language_manager = None + if hasattr(self.tts_config, "use_language_embedding") and self.tts_config.use_language_embedding is True: + if self.tts_languages_file: + language_manager = LanguageManager(language_ids_file_path=self.tts_languages_file) + elif self.tts_config.get("language_ids_file", None): + language_manager = LanguageManager(language_ids_file_path=self.tts_config.language_ids_file) + return language_manager + def _load_vocoder(self, model_file: str, model_config: str, use_cuda: bool) -> None: """Load the vocoder model. @@ -174,12 +196,20 @@ class Synthesizer(object): wav = np.array(wav) self.ap.save_wav(wav, path, self.output_sample_rate) - def tts(self, text: str, speaker_idx: str = "", speaker_wav=None, style_wav=None) -> List[int]: + def tts( + self, + text: str, + speaker_idx: str = "", + language_idx: str = "", + speaker_wav=None, + style_wav=None + ) -> List[int]: """🐸 TTS magic. Run all the models and generate speech. Args: text (str): input text. speaker_idx (str, optional): spekaer id for multi-speaker models. Defaults to "". + language_idx (str, optional): language id for multi-language models. Defaults to "". speaker_wav (): style_wav ([type], optional): style waveform for GST. Defaults to None. @@ -219,6 +249,24 @@ class Synthesizer(object): "Define path for speaker.json if it is a multi-speaker model or remove defined speaker idx. " ) + # handle multi-lingaul + language_id = None + if self.tts_languages_file or hasattr(self.tts_model.language_manager, "language_id_mapping"): + if language_idx and isinstance(language_idx, str): + language_id = self.tts_model.language_manager.language_id_mapping[language_idx] + + elif not language_idx: + raise ValueError( + " [!] Look like you use a multi-lingual model. " + "You need to define either a `language_idx` or a `style_wav` to use a multi-lingual model." + ) + + else: + raise ValueError( + f" [!] Missing language_ids.json file path for selecting language {language_idx}." + "Define path for language_ids.json if it is a multi-lingual model or remove defined language idx. " + ) + # compute a new d_vector from the given clip. 
if speaker_wav is not None: speaker_embedding = self.tts_model.speaker_manager.compute_d_vector_from_clip(speaker_wav) @@ -234,6 +282,8 @@ class Synthesizer(object): use_cuda=self.use_cuda, ap=self.ap, speaker_id=speaker_id, + language_id=language_id, + language_name=language_idx, style_wav=style_wav, enable_eos_bos_chars=self.tts_config.enable_eos_bos_chars, use_griffin_lim=use_gl, From 6700bb1bcfc972a193a3921538152120ef7c35b1 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Wed, 8 Dec 2021 19:42:45 +0100 Subject: [PATCH 188/220] Add recipe for multi-lingual VITS --- .../multilingual/vits_tts/train_vits_tts.py | 117 ++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 recipes/multilingual/vits_tts/train_vits_tts.py diff --git a/recipes/multilingual/vits_tts/train_vits_tts.py b/recipes/multilingual/vits_tts/train_vits_tts.py new file mode 100644 index 00000000..6beaef38 --- /dev/null +++ b/recipes/multilingual/vits_tts/train_vits_tts.py @@ -0,0 +1,117 @@ +import os +from glob import glob + +from TTS.config.shared_configs import BaseAudioConfig +from TTS.trainer import Trainer, TrainingArgs +from TTS.tts.configs.shared_configs import BaseDatasetConfig +from TTS.tts.configs.vits_config import VitsConfig +from TTS.tts.datasets import load_tts_samples +from TTS.tts.models.vits import Vits, VitsArgs +from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.languages import LanguageManager +from TTS.utils.audio import AudioProcessor + +output_path = os.path.dirname(os.path.abspath(__file__)) + +mailabs_path = '/home/julian/workspace/mailabs/**' +dataset_paths = glob(mailabs_path) +dataset_config = [BaseDatasetConfig(name="mailabs", meta_file_train=None, path=path, language=path.split('/')[-1]) for path in dataset_paths] + +audio_config = BaseAudioConfig( + sample_rate=16000, + win_length=1024, + hop_length=256, + num_mels=80, + preemphasis=0.0, + ref_level_db=20, + log_func="np.log", + do_trim_silence=False, + trim_db=23.0, + mel_fmin=0, + mel_fmax=None, + spec_gain=1.0, + signal_norm=True, + do_amp_to_db_linear=False, + resample=False, +) + +vitsArgs = VitsArgs( + use_language_embedding=True, + embedded_language_dim=4, + use_speaker_embedding=True, + use_sdp=False, +) + +config = VitsConfig( + model_args=vitsArgs, + audio=audio_config, + run_name="vits_vctk", + use_speaker_embedding=True, + batch_size=32, + eval_batch_size=16, + batch_group_size=0, + num_loader_workers=4, + num_eval_loader_workers=4, + run_eval=True, + test_delay_epochs=-1, + epochs=1000, + text_cleaner="multilingual_cleaners", + use_phonemes=False, + phoneme_language="en-us", + phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), + compute_input_seq_cache=True, + print_step=25, + use_language_weighted_sampler= True, + print_eval=False, + mixed_precision=False, + sort_by_audio_len=True, + min_seq_len=32 * 256 * 4, + max_seq_len=160000, + output_path=output_path, + datasets=dataset_config, + characters= { + "pad": "_", + "eos": "&", + "bos": "*", + "characters": "!¡'(),-.:;¿?abcdefghijklmnopqrstuvwxyzµßàáâäåæçèéêëìíîïñòóôöùúûüąćęłńœśşźżƒабвгдежзийклмнопрстуфхцчшщъыьэюяёєіїґӧ «°±µ»$%&‘’‚“`”„", + "punctuations": "!¡'(),-.:;¿? 
", + "phonemes": None, + "unique": True + }, + test_sentences=[ + ["It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", 'mary_ann', None, 'en_US'], + ["Il m'a fallu beaucoup de temps pour d\u00e9velopper une voix, et maintenant que je l'ai, je ne vais pas me taire.", "ezwa", None, 'fr_FR'], + ["Ich finde, dieses Startup ist wirklich unglaublich.", "eva_k", None, 'de_DE'], + ["Я думаю, что этот стартап действительно удивительный.", "oblomov", None, 'ru_RU'], + ] +) + +# init audio processor +ap = AudioProcessor(**config.audio.to_dict()) + +# load training samples +train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) + +# init speaker manager for multi-speaker training +# it maps speaker-id to speaker-name in the model and data-loader +speaker_manager = SpeakerManager() +speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) +config.model_args.num_speakers = speaker_manager.num_speakers + +language_manager = LanguageManager(config=config) +config.model_args.num_languages = language_manager.num_languages + +# init model +model = Vits(config, speaker_manager, language_manager) + +# init the trainer and 🚀 +trainer = Trainer( + TrainingArgs(), + config, + output_path, + model=model, + train_samples=train_samples, + eval_samples=eval_samples, + training_assets={"audio_processor": ap}, +) +trainer.fit() From 54b7fb4e4a256c3758fd69ebc1522e47ec751ec3 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Thu, 9 Dec 2021 12:42:38 +0100 Subject: [PATCH 189/220] Fix zoo tests --- TTS/utils/synthesizer.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index ea8ce6d1..e6df6561 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -118,11 +118,14 @@ class Synthesizer(object): speaker_manager = self._init_speaker_manager() language_manager = self._init_language_manager() - self.tts_model = setup_tts_model( - config=self.tts_config, - speaker_manager=speaker_manager, - language_manager=language_manager, - ) + if language_manager is not None: + self.tts_model = setup_tts_model( + config=self.tts_config, + speaker_manager=speaker_manager, + language_manager=language_manager, + ) + else: + self.tts_model = setup_tts_model(config=self.tts_config, speaker_manager=speaker_manager) self.tts_model.load_checkpoint(self.tts_config, tts_checkpoint, eval=True) if use_cuda: self.tts_model.cuda() @@ -251,7 +254,7 @@ class Synthesizer(object): # handle multi-lingaul language_id = None - if self.tts_languages_file or hasattr(self.tts_model.language_manager, "language_id_mapping"): + if self.tts_languages_file or (hasattr(self.tts_model, "language_manager") and self.tts_model.language_manager is not None): if language_idx and isinstance(language_idx, str): language_id = self.tts_model.language_manager.language_id_mapping[language_idx] From 704dddcffaf62d7e9fdcbf218070ad987a203ddc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 10 Dec 2021 07:53:10 +0000 Subject: [PATCH 190/220] Make style --- TTS/bin/extract_tts_spectrograms.py | 1 - TTS/bin/find_unique_phonemes.py | 1 + TTS/bin/remove_silence_using_vad.py | 14 ++++---- TTS/bin/train_tts.py | 2 +- TTS/server/server.py | 2 +- TTS/speaker_encoder/models/resnet.py | 11 +++--- TTS/tts/datasets/dataset.py | 2 +- TTS/tts/layers/losses.py | 2 +- TTS/tts/models/__init__.py | 8 ++--- TTS/tts/models/base_tts.py | 21 +++++++---- TTS/tts/models/vits.py | 7 ++-- 
TTS/utils/audio.py | 10 ++++-- TTS/utils/synthesizer.py | 13 +++---- TTS/utils/vad.py | 10 +++--- .../multilingual/vits_tts/train_vits_tts.py | 35 +++++++++++++------ requirements.txt | 1 + tests/aux_tests/test_find_unique_phonemes.py | 10 ++---- tests/data_tests/test_samplers.py | 21 ++++++----- 18 files changed, 96 insertions(+), 75 deletions(-) diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index 014ba4e8..7b489fd6 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -242,7 +242,6 @@ def main(args): # pylint: disable=redefined-outer-name else: speaker_manager = None - # setup model model = setup_model(c) diff --git a/TTS/bin/find_unique_phonemes.py b/TTS/bin/find_unique_phonemes.py index 832ef082..d3143ca3 100644 --- a/TTS/bin/find_unique_phonemes.py +++ b/TTS/bin/find_unique_phonemes.py @@ -19,6 +19,7 @@ def compute_phonemes(item): return [] return list(set(ph)) + def main(): # pylint: disable=W0601 global c diff --git a/TTS/bin/remove_silence_using_vad.py b/TTS/bin/remove_silence_using_vad.py index a32f0f45..9070f2da 100755 --- a/TTS/bin/remove_silence_using_vad.py +++ b/TTS/bin/remove_silence_using_vad.py @@ -1,12 +1,13 @@ -import os -import glob -import pathlib import argparse +import glob import multiprocessing +import os +import pathlib from tqdm.contrib.concurrent import process_map -from TTS.utils.vad import read_wave, write_wave, get_vad_speech_segments +from TTS.utils.vad import get_vad_speech_segments, read_wave, write_wave + def remove_silence(filepath): output_path = filepath.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, "")) @@ -69,10 +70,7 @@ if __name__ == "__main__": parser.add_argument( "-o", "--output_dir", type=str, default="../VCTK-Corpus-removed-silence", help="Output Dataset dir" ) - parser.add_argument("-f", "--force", - default=False, - action='store_true', - help='Force the replace of exists files') + parser.add_argument("-f", "--force", default=False, action="store_true", help="Force the replace of exists files") parser.add_argument( "-g", "--glob", diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index 5330649a..191cba00 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -4,8 +4,8 @@ from TTS.config import load_config, register_config from TTS.trainer import Trainer, TrainingArgs from TTS.tts.datasets import load_tts_samples from TTS.tts.models import setup_model -from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.languages import LanguageManager +from TTS.tts.utils.speakers import SpeakerManager from TTS.utils.audio import AudioProcessor diff --git a/TTS/server/server.py b/TTS/server/server.py index c6d67141..f7bc79c4 100644 --- a/TTS/server/server.py +++ b/TTS/server/server.py @@ -100,7 +100,7 @@ if args.vocoder_path is not None: # load models synthesizer = Synthesizer( - model_path, config_path, speakers_file_path, vocoder_path, vocoder_config_path, use_cuda=args.use_cuda + model_path, config_path, speakers_file_path, None, vocoder_path, vocoder_config_path, use_cuda=args.use_cuda ) use_multi_speaker = hasattr(synthesizer.tts_model, "num_speakers") and synthesizer.tts_model.num_speakers > 1 diff --git a/TTS/speaker_encoder/models/resnet.py b/TTS/speaker_encoder/models/resnet.py index 8f0a8809..7bd507fb 100644 --- a/TTS/speaker_encoder/models/resnet.py +++ b/TTS/speaker_encoder/models/resnet.py @@ -2,11 +2,12 @@ import numpy as np import torch from torch import nn +from TTS.utils.audio import TorchSTFT +from 
TTS.utils.io import load_fsspec + # import torchaudio -from TTS.utils.audio import TorchSTFT -from TTS.utils.io import load_fsspec class PreEmphasis(torch.nn.Module): @@ -126,16 +127,16 @@ class ResNetSpeakerEncoder(nn.Module): n_mels=audio_config["num_mels"], power=2.0, use_mel=True, - mel_norm=None + mel_norm=None, ), - '''torchaudio.transforms.MelSpectrogram( + """torchaudio.transforms.MelSpectrogram( sample_rate=audio_config["sample_rate"], n_fft=audio_config["fft_size"], win_length=audio_config["win_length"], hop_length=audio_config["hop_length"], window_fn=torch.hamming_window, n_mels=audio_config["num_mels"], - ),''' + ),""", ) else: self.torch_spec = None diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index 000393ea..843cea58 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -531,7 +531,7 @@ class TTSDataset(Dataset): "waveform": wav_padded, "raw_text": batch["raw_text"], "pitch": pitch, - "language_ids": language_ids + "language_ids": language_ids, } raise TypeError( diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index 9c219998..7de45041 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -588,7 +588,7 @@ class VitsGeneratorLoss(nn.Module): @staticmethod def cosine_similarity_loss(gt_spk_emb, syn_spk_emb): - l = - torch.nn.functional.cosine_similarity(gt_spk_emb, syn_spk_emb).mean() + l = -torch.nn.functional.cosine_similarity(gt_spk_emb, syn_spk_emb).mean() return l def forward( diff --git a/TTS/tts/models/__init__.py b/TTS/tts/models/__init__.py index acd89110..4cc8b658 100644 --- a/TTS/tts/models/__init__.py +++ b/TTS/tts/models/__init__.py @@ -2,11 +2,7 @@ from TTS.tts.utils.text.symbols import make_symbols, parse_symbols from TTS.utils.generic_utils import find_module -def setup_model( - config, - speaker_manager: "SpeakerManager" = None, - language_manager: "LanguageManager" = None - ): +def setup_model(config, speaker_manager: "SpeakerManager" = None, language_manager: "LanguageManager" = None): print(" > Using model: {}".format(config.model)) # fetch the right model implementation. 
if "base_model" in config and config["base_model"] is not None: @@ -35,7 +31,7 @@ def setup_model( config.model_params.num_chars = num_chars if "model_args" in config: config.model_args.num_chars = num_chars - if config.model.lower() in ["vits"]: # If model supports multiple languages + if config.model.lower() in ["vits"]: # If model supports multiple languages model = MyModel(config, speaker_manager=speaker_manager, language_manager=language_manager) else: model = MyModel(config, speaker_manager=speaker_manager) diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index 1f92bfc7..e52cd765 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -12,8 +12,8 @@ from torch.utils.data.distributed import DistributedSampler from TTS.model import BaseModel from TTS.tts.configs.shared_configs import CharactersConfig from TTS.tts.datasets.dataset import TTSDataset -from TTS.tts.utils.speakers import SpeakerManager, get_speaker_manager, get_speaker_weighted_sampler from TTS.tts.utils.languages import LanguageManager, get_language_weighted_sampler +from TTS.tts.utils.speakers import SpeakerManager, get_speaker_manager, get_speaker_weighted_sampler from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.text import make_symbols from TTS.tts.utils.visual import plot_alignment, plot_spectrogram @@ -150,7 +150,13 @@ class BaseTTS(BaseModel): if hasattr(self, "language_manager") and config.use_language_embedding and language_name is not None: language_id = self.language_manager.language_id_mapping[language_name] - return {"text": text, "speaker_id": speaker_id, "style_wav": style_wav, "d_vector": d_vector, "language_id": language_id} + return { + "text": text, + "speaker_id": speaker_id, + "style_wav": style_wav, + "d_vector": d_vector, + "language_id": language_id, + } def format_batch(self, batch: Dict) -> Dict: """Generic batch formatting for `TTSDataset`. 
@@ -337,14 +343,16 @@ class BaseTTS(BaseModel): if config.compute_f0: dataset.pitch_extractor.load_pitch_stats(config.get("f0_cache_path", None)) - - # sampler for DDP sampler = DistributedSampler(dataset) if num_gpus > 1 else None # Weighted samplers - assert not (num_gpus > 1 and getattr(config, "use_language_weighted_sampler", False)), "language_weighted_sampler is not supported with DistributedSampler" - assert not (num_gpus > 1 and getattr(config, "use_speaker_weighted_sampler", False)), "speaker_weighted_sampler is not supported with DistributedSampler" + assert not ( + num_gpus > 1 and getattr(config, "use_language_weighted_sampler", False) + ), "language_weighted_sampler is not supported with DistributedSampler" + assert not ( + num_gpus > 1 and getattr(config, "use_speaker_weighted_sampler", False) + ), "speaker_weighted_sampler is not supported with DistributedSampler" if sampler is None: if getattr(config, "use_language_weighted_sampler", False): @@ -354,7 +362,6 @@ class BaseTTS(BaseModel): print(" > Using Language weighted sampler") sampler = get_speaker_weighted_sampler(dataset.items) - loader = DataLoader( dataset, batch_size=config.eval_batch_size if is_eval else config.batch_size, diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index ca110eb0..5b4725b3 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -4,6 +4,7 @@ from itertools import chain from typing import Dict, List, Tuple import torch + # import torchaudio from coqpit import Coqpit from torch import nn @@ -420,8 +421,9 @@ class Vits(BaseTTS): ): # TODO: change this with torchaudio Resample raise RuntimeError( - ' [!] To use the speaker consistency loss (SCL) you need to have matching sample rates between the TTS model ({}) and the speaker encoder ({})!' - .format(self.config.audio["sample_rate"], self.speaker_encoder.audio_config["sample_rate"]) + " [!] 
To use the speaker consistency loss (SCL) you need to have matching sample rates between the TTS model ({}) and the speaker encoder ({})!".format( + self.config.audio["sample_rate"], self.speaker_encoder.audio_config["sample_rate"] + ) ) # pylint: disable=W0101,W0105 """ self.audio_transform = torchaudio.transforms.Resample( @@ -675,7 +677,6 @@ class Vits(BaseTTS): ) return outputs - def inference(self, x, aux_input={"d_vectors": None, "speaker_ids": None, "language_ids": None}): """ Shapes: diff --git a/TTS/utils/audio.py b/TTS/utils/audio.py index 10c9ec7e..d01196c4 100644 --- a/TTS/utils/audio.py +++ b/TTS/utils/audio.py @@ -88,7 +88,7 @@ class TorchSTFT(nn.Module): # pylint: disable=abstract-method spec_gain=1.0, power=None, use_htk=False, - mel_norm="slaney" + mel_norm="slaney", ): super().__init__() self.n_fft = n_fft @@ -155,7 +155,13 @@ class TorchSTFT(nn.Module): # pylint: disable=abstract-method def _build_mel_basis(self): mel_basis = librosa.filters.mel( - self.sample_rate, self.n_fft, n_mels=self.n_mels, fmin=self.mel_fmin, fmax=self.mel_fmax, htk=self.use_htk, norm=self.mel_norm + self.sample_rate, + self.n_fft, + n_mels=self.n_mels, + fmin=self.mel_fmin, + fmax=self.mel_fmax, + htk=self.use_htk, + norm=self.mel_norm, ) self.mel_basis = torch.from_numpy(mel_basis).float() diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index e6df6561..d64c0936 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -7,8 +7,8 @@ import torch from TTS.config import load_config from TTS.tts.models import setup_model as setup_tts_model -from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.languages import LanguageManager +from TTS.tts.utils.speakers import SpeakerManager # pylint: disable=unused-wildcard-import # pylint: disable=wildcard-import @@ -200,12 +200,7 @@ class Synthesizer(object): self.ap.save_wav(wav, path, self.output_sample_rate) def tts( - self, - text: str, - speaker_idx: str = "", - language_idx: str = "", - speaker_wav=None, - style_wav=None + self, text: str, speaker_idx: str = "", language_idx: str = "", speaker_wav=None, style_wav=None ) -> List[int]: """🐸 TTS magic. Run all the models and generate speech. 
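The sample-rate check in the vits.py change above points at a resampling workaround that is still commented out (the `torchaudio.transforms.Resample` TODO). A minimal sketch of that workaround, assuming the TTS and speaker-encoder sample rates are known; this is not the shipped code path:

```python
import torch
import torchaudio


def resample_for_speaker_encoder(wavs: torch.Tensor, tts_rate: int, encoder_rate: int) -> torch.Tensor:
    # Bring TTS-rate waveforms to the speaker encoder's expected rate before
    # computing embeddings, instead of requiring matching sample rates.
    if tts_rate == encoder_rate:
        return wavs
    return torchaudio.transforms.Resample(orig_freq=tts_rate, new_freq=encoder_rate)(wavs)
```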
@@ -254,7 +249,9 @@ class Synthesizer(object): # handle multi-lingaul language_id = None - if self.tts_languages_file or (hasattr(self.tts_model, "language_manager") and self.tts_model.language_manager is not None): + if self.tts_languages_file or ( + hasattr(self.tts_model, "language_manager") and self.tts_model.language_manager is not None + ): if language_idx and isinstance(language_idx, str): language_id = self.tts_model.language_manager.language_id_mapping[language_idx] diff --git a/TTS/utils/vad.py b/TTS/utils/vad.py index 33548087..923544d0 100644 --- a/TTS/utils/vad.py +++ b/TTS/utils/vad.py @@ -1,8 +1,9 @@ # This code is adpated from: https://github.com/wiseman/py-webrtcvad/blob/master/example.py -import wave -import webrtcvad -import contextlib import collections +import contextlib +import wave + +import webrtcvad def read_wave(path): @@ -37,7 +38,7 @@ class Frame(object): """Represents a "frame" of audio data.""" def __init__(self, _bytes, timestamp, duration): - self.bytes =_bytes + self.bytes = _bytes self.timestamp = timestamp self.duration = duration @@ -133,6 +134,7 @@ def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, fram if voiced_frames: yield b"".join([f.bytes for f in voiced_frames]) + def get_vad_speech_segments(audio, sample_rate, aggressiveness=2, padding_duration_ms=300): vad = webrtcvad.Vad(int(aggressiveness)) diff --git a/recipes/multilingual/vits_tts/train_vits_tts.py b/recipes/multilingual/vits_tts/train_vits_tts.py index 6beaef38..be4747df 100644 --- a/recipes/multilingual/vits_tts/train_vits_tts.py +++ b/recipes/multilingual/vits_tts/train_vits_tts.py @@ -7,15 +7,18 @@ from TTS.tts.configs.shared_configs import BaseDatasetConfig from TTS.tts.configs.vits_config import VitsConfig from TTS.tts.datasets import load_tts_samples from TTS.tts.models.vits import Vits, VitsArgs -from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.languages import LanguageManager +from TTS.tts.utils.speakers import SpeakerManager from TTS.utils.audio import AudioProcessor output_path = os.path.dirname(os.path.abspath(__file__)) -mailabs_path = '/home/julian/workspace/mailabs/**' +mailabs_path = "/home/julian/workspace/mailabs/**" dataset_paths = glob(mailabs_path) -dataset_config = [BaseDatasetConfig(name="mailabs", meta_file_train=None, path=path, language=path.split('/')[-1]) for path in dataset_paths] +dataset_config = [ + BaseDatasetConfig(name="mailabs", meta_file_train=None, path=path, language=path.split("/")[-1]) + for path in dataset_paths +] audio_config = BaseAudioConfig( sample_rate=16000, @@ -61,7 +64,7 @@ config = VitsConfig( phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), compute_input_seq_cache=True, print_step=25, - use_language_weighted_sampler= True, + use_language_weighted_sampler=True, print_eval=False, mixed_precision=False, sort_by_audio_len=True, @@ -69,21 +72,31 @@ config = VitsConfig( max_seq_len=160000, output_path=output_path, datasets=dataset_config, - characters= { + characters={ "pad": "_", "eos": "&", "bos": "*", "characters": "!¡'(),-.:;¿?abcdefghijklmnopqrstuvwxyzµßàáâäåæçèéêëìíîïñòóôöùúûüąćęłńœśşźżƒабвгдежзийклмнопрстуфхцчшщъыьэюяёєіїґӧ «°±µ»$%&‘’‚“`”„", "punctuations": "!¡'(),-.:;¿? 
", "phonemes": None, - "unique": True + "unique": True, }, test_sentences=[ - ["It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", 'mary_ann', None, 'en_US'], - ["Il m'a fallu beaucoup de temps pour d\u00e9velopper une voix, et maintenant que je l'ai, je ne vais pas me taire.", "ezwa", None, 'fr_FR'], - ["Ich finde, dieses Startup ist wirklich unglaublich.", "eva_k", None, 'de_DE'], - ["Я думаю, что этот стартап действительно удивительный.", "oblomov", None, 'ru_RU'], - ] + [ + "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", + "mary_ann", + None, + "en_US", + ], + [ + "Il m'a fallu beaucoup de temps pour d\u00e9velopper une voix, et maintenant que je l'ai, je ne vais pas me taire.", + "ezwa", + None, + "fr_FR", + ], + ["Ich finde, dieses Startup ist wirklich unglaublich.", "eva_k", None, "de_DE"], + ["Я думаю, что этот стартап действительно удивительный.", "oblomov", None, "ru_RU"], + ], ) # init audio processor diff --git a/requirements.txt b/requirements.txt index 3ec33ceb..453c3ec4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -26,3 +26,4 @@ unidic-lite==1.0.8 gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=2.0.0 fsspec>=2021.04.0 pyworld +webrtcvad diff --git a/tests/aux_tests/test_find_unique_phonemes.py b/tests/aux_tests/test_find_unique_phonemes.py index 33fad9ba..fa0abe4b 100644 --- a/tests/aux_tests/test_find_unique_phonemes.py +++ b/tests/aux_tests/test_find_unique_phonemes.py @@ -31,7 +31,7 @@ dataset_config_pt = BaseDatasetConfig( class TestFindUniquePhonemes(unittest.TestCase): @staticmethod def test_espeak_phonemes(): - # prepare the config + # prepare the config config = VitsConfig( batch_size=2, eval_batch_size=2, @@ -52,9 +52,7 @@ class TestFindUniquePhonemes(unittest.TestCase): config.save_json(config_path) # run test - run_cli( - f'CUDA_VISIBLE_DEVICES="" python TTS/bin/find_unique_phonemes.py --config_path "{config_path}"' - ) + run_cli(f'CUDA_VISIBLE_DEVICES="" python TTS/bin/find_unique_phonemes.py --config_path "{config_path}"') @staticmethod def test_no_espeak_phonemes(): @@ -79,6 +77,4 @@ class TestFindUniquePhonemes(unittest.TestCase): config.save_json(config_path) # run test - run_cli( - f'CUDA_VISIBLE_DEVICES="" python TTS/bin/find_unique_phonemes.py --config_path "{config_path}"' - ) + run_cli(f'CUDA_VISIBLE_DEVICES="" python TTS/bin/find_unique_phonemes.py --config_path "{config_path}"') diff --git a/tests/data_tests/test_samplers.py b/tests/data_tests/test_samplers.py index 5e4e4151..3d8d6c75 100644 --- a/tests/data_tests/test_samplers.py +++ b/tests/data_tests/test_samplers.py @@ -1,9 +1,11 @@ -from TTS.tts.datasets import load_tts_samples -from TTS.config.shared_configs import BaseDatasetConfig -from TTS.tts.utils.languages import get_language_weighted_sampler -import torch import functools +import torch + +from TTS.config.shared_configs import BaseDatasetConfig +from TTS.tts.datasets import load_tts_samples +from TTS.tts.utils.languages import get_language_weighted_sampler + # Fixing random state to avoid random fails torch.manual_seed(0) @@ -25,18 +27,19 @@ dataset_config_pt = BaseDatasetConfig( # Adding the EN samples twice to create an unbalanced dataset train_samples, eval_samples = load_tts_samples( - [dataset_config_en, dataset_config_en, dataset_config_pt], - eval_split=True + [dataset_config_en, dataset_config_en, dataset_config_pt], eval_split=True ) + def is_balanced(lang_1, lang_2): - return 0.85 < lang_1/lang_2 < 1.2 + return 0.85 < lang_1 / 
lang_2 < 1.2 + random_sampler = torch.utils.data.RandomSampler(train_samples) ids = functools.reduce(lambda a, b: a + b, [list(random_sampler) for i in range(100)]) en, pt = 0, 0 for index in ids: - if train_samples[index][3] == 'en': + if train_samples[index][3] == "en": en += 1 else: pt += 1 @@ -47,7 +50,7 @@ weighted_sampler = get_language_weighted_sampler(train_samples) ids = functools.reduce(lambda a, b: a + b, [list(weighted_sampler) for i in range(100)]) en, pt = 0, 0 for index in ids: - if train_samples[index][3] == 'en': + if train_samples[index][3] == "en": en += 1 else: pt += 1 From 649dc9e9daab09d9cd8e92dba4c3c1878a952be8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 10 Dec 2021 07:53:19 +0000 Subject: [PATCH 191/220] Remove redundant code --- TTS/tts/models/vits.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 5b4725b3..7f83f452 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -830,15 +830,6 @@ class Vits(BaseTTS): gt_spk_emb=outputs["gt_spk_emb"], syn_spk_emb=outputs["syn_spk_emb"], ) - # ignore duration loss if fine tuning mode is on - if not self.args.fine_tuning_mode: - # handle the duration loss - if self.args.use_sdp: - loss_dict["nll_duration"] = outputs["nll_duration"] - loss_dict["loss"] += outputs["nll_duration"] - else: - loss_dict["loss_duration"] = outputs["loss_duration"] - loss_dict["loss"] += outputs["loss_duration"] elif optimizer_idx == 1: # discriminator pass From 45b2f8e42e3e81fdfe667b97314ae927b72aba63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 10 Dec 2021 09:12:03 +0000 Subject: [PATCH 192/220] =?UTF-8?q?Add=20=F0=9F=91=91YourTTS=20docs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/source/models/vits.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/source/models/vits.md b/docs/source/models/vits.md index 5c0e92f6..0c303f7a 100644 --- a/docs/source/models/vits.md +++ b/docs/source/models/vits.md @@ -3,10 +3,15 @@ VITS (Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech ) is an End-to-End (encoder -> vocoder together) TTS model that takes advantage of SOTA DL techniques like GANs, VAE, Normalizing Flows. It does not require external alignment annotations and learns the text-to-audio alignment -using MAS as explained in the paper. The model architecture is a combination of GlowTTS encoder and HiFiGAN vocoder. +using MAS, as explained in the paper. The model architecture is a combination of GlowTTS encoder and HiFiGAN vocoder. It is a feed-forward model with x67.12 real-time factor on a GPU. +🐸 YourTTS is a multi-speaker and multi-lingual TTS model that can perform voice conversion and zero-shot speaker adaptation. +It can also learn a new language or voice with a ~ 1 minute long audio clip. This is a big open gate for training +TTS models in low-resources languages. 🐸 YourTTS uses VITS as the backbone architecture coupled with a speaker encoder model. 
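A minimal usage sketch for the multi-speaker, multi-lingual flow described above, assuming a trained checkpoint plus its `speakers.json` and `language_ids.json` files; every path and id below is a placeholder, not a shipped asset:

```python
from TTS.utils.synthesizer import Synthesizer

# All paths and ids below are placeholders for illustration only.
synthesizer = Synthesizer(
    tts_checkpoint="path/to/yourtts/best_model.pth.tar",
    tts_config_path="path/to/yourtts/config.json",
    tts_speakers_file="path/to/yourtts/speakers.json",
    tts_languages_file="path/to/yourtts/language_ids.json",
    use_cuda=False,
)

# speaker_name and language_name must be keys of the respective id files.
wav = synthesizer.tts(
    "Il m'a fallu beaucoup de temps pour développer une voix.",
    speaker_name="ezwa",
    language_name="fr_FR",
)
synthesizer.save_wav(wav, "output.wav")
```

With no vocoder checkpoint given, the synthesizer is expected to fall back to Griffin-Lim (see the `use_griffin_lim=use_gl` argument in the synthesis call earlier in this series), which is enough to sanity-check the speaker and language conditioning.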
+ ## Important resources & papers +- 🐸 YourTTS: https://arxiv.org/abs/2112.02418 - VITS: https://arxiv.org/pdf/2106.06103.pdf - Neural Spline Flows: https://arxiv.org/abs/1906.04032 - Variational Autoencoder: https://arxiv.org/pdf/1312.6114.pdf From 7a987db62b37c0f17a048df10981a2273926b48e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 13 Dec 2021 16:23:57 +0000 Subject: [PATCH 193/220] Use torchaudio for ResNet speaker encoder --- TTS/speaker_encoder/models/resnet.py | 40 +++++++++++++--------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/TTS/speaker_encoder/models/resnet.py b/TTS/speaker_encoder/models/resnet.py index 7bd507fb..3b96f270 100644 --- a/TTS/speaker_encoder/models/resnet.py +++ b/TTS/speaker_encoder/models/resnet.py @@ -5,12 +5,10 @@ from torch import nn from TTS.utils.audio import TorchSTFT from TTS.utils.io import load_fsspec -# import torchaudio +import torchaudio - - -class PreEmphasis(torch.nn.Module): +class PreEmphasis(nn.Module): def __init__(self, coefficient=0.97): super().__init__() self.coefficient = coefficient @@ -114,29 +112,29 @@ class ResNetSpeakerEncoder(nn.Module): if self.use_torch_spec: self.torch_spec = torch.nn.Sequential( PreEmphasis(audio_config["preemphasis"]), - TorchSTFT( - n_fft=audio_config["fft_size"], - hop_length=audio_config["hop_length"], - win_length=audio_config["win_length"], - sample_rate=audio_config["sample_rate"], - window="hamming_window", - mel_fmin=0.0, - mel_fmax=None, - use_htk=True, - do_amp_to_db=False, - n_mels=audio_config["num_mels"], - power=2.0, - use_mel=True, - mel_norm=None, - ), - """torchaudio.transforms.MelSpectrogram( + # TorchSTFT( + # n_fft=audio_config["fft_size"], + # hop_length=audio_config["hop_length"], + # win_length=audio_config["win_length"], + # sample_rate=audio_config["sample_rate"], + # window="hamming_window", + # mel_fmin=0.0, + # mel_fmax=None, + # use_htk=True, + # do_amp_to_db=False, + # n_mels=audio_config["num_mels"], + # power=2.0, + # use_mel=True, + # mel_norm=None, + # ) + torchaudio.transforms.MelSpectrogram( sample_rate=audio_config["sample_rate"], n_fft=audio_config["fft_size"], win_length=audio_config["win_length"], hop_length=audio_config["hop_length"], window_fn=torch.hamming_window, n_mels=audio_config["num_mels"], - ),""", + ) ) else: self.torch_spec = None From 35a781fb9001669520843db8637ffaddbbffc8f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 13 Dec 2021 16:27:36 +0000 Subject: [PATCH 194/220] Fix synthesizer reading `use_language_embedding` --- TTS/utils/synthesizer.py | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index d64c0936..bd90dd8c 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -151,7 +151,10 @@ class Synthesizer(object): """Initialize the LanguageManager""" # setup if multi-lingual settings are in the global model config language_manager = None - if hasattr(self.tts_config, "use_language_embedding") and self.tts_config.use_language_embedding is True: + if ( + hasattr(self.tts_config.model_args, "use_language_embedding") + and self.tts_config.model_args.use_language_embedding is True + ): if self.tts_languages_file: language_manager = LanguageManager(language_ids_file_path=self.tts_languages_file) elif self.tts_config.get("language_ids_file", None): @@ -200,14 +203,14 @@ class Synthesizer(object): self.ap.save_wav(wav, path, self.output_sample_rate) def tts( - 
self, text: str, speaker_idx: str = "", language_idx: str = "", speaker_wav=None, style_wav=None + self, text: str, speaker_name: str = "", language_name: str = "", speaker_wav=None, style_wav=None ) -> List[int]: """🐸 TTS magic. Run all the models and generate speech. Args: text (str): input text. - speaker_idx (str, optional): spekaer id for multi-speaker models. Defaults to "". - language_idx (str, optional): language id for multi-language models. Defaults to "". + speaker_name (str, optional): spekaer id for multi-speaker models. Defaults to "". + language_name (str, optional): language id for multi-language models. Defaults to "". speaker_wav (): style_wav ([type], optional): style waveform for GST. Defaults to None. @@ -224,26 +227,26 @@ class Synthesizer(object): speaker_embedding = None speaker_id = None if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "speaker_ids"): - if speaker_idx and isinstance(speaker_idx, str): + if speaker_name and isinstance(speaker_name, str): if self.tts_config.use_d_vector_file: # get the speaker embedding from the saved d_vectors. - speaker_embedding = self.tts_model.speaker_manager.get_d_vectors_by_speaker(speaker_idx)[0] + speaker_embedding = self.tts_model.speaker_manager.get_d_vectors_by_speaker(speaker_name)[0] speaker_embedding = np.array(speaker_embedding)[None, :] # [1 x embedding_dim] else: # get speaker idx from the speaker name - speaker_id = self.tts_model.speaker_manager.speaker_ids[speaker_idx] + speaker_id = self.tts_model.speaker_manager.speaker_ids[speaker_name] - elif not speaker_idx and not speaker_wav: + elif not speaker_name and not speaker_wav: raise ValueError( " [!] Look like you use a multi-speaker model. " - "You need to define either a `speaker_idx` or a `style_wav` to use a multi-speaker model." + "You need to define either a `speaker_name` or a `style_wav` to use a multi-speaker model." ) else: speaker_embedding = None else: - if speaker_idx: + if speaker_name: raise ValueError( - f" [!] Missing speakers.json file path for selecting speaker {speaker_idx}." + f" [!] Missing speakers.json file path for selecting speaker {speaker_name}." "Define path for speaker.json if it is a multi-speaker model or remove defined speaker idx. " ) @@ -252,18 +255,18 @@ class Synthesizer(object): if self.tts_languages_file or ( hasattr(self.tts_model, "language_manager") and self.tts_model.language_manager is not None ): - if language_idx and isinstance(language_idx, str): - language_id = self.tts_model.language_manager.language_id_mapping[language_idx] + if language_name and isinstance(language_name, str): + language_id = self.tts_model.language_manager.language_id_mapping[language_name] - elif not language_idx: + elif not language_name: raise ValueError( " [!] Look like you use a multi-lingual model. " - "You need to define either a `language_idx` or a `style_wav` to use a multi-lingual model." + "You need to define either a `language_name` or a `style_wav` to use a multi-lingual model." ) else: raise ValueError( - f" [!] Missing language_ids.json file path for selecting language {language_idx}." + f" [!] Missing language_ids.json file path for selecting language {language_name}." "Define path for language_ids.json if it is a multi-lingual model or remove defined language idx. 
" ) @@ -283,7 +286,7 @@ class Synthesizer(object): ap=self.ap, speaker_id=speaker_id, language_id=language_id, - language_name=language_idx, + language_name=language_name, style_wav=style_wav, enable_eos_bos_chars=self.tts_config.enable_eos_bos_chars, use_griffin_lim=use_gl, From 79de38ca76b05c860fbe6f1824fa0c513a16e975 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 13 Dec 2021 16:28:54 +0000 Subject: [PATCH 195/220] Rename setup_model to setup_speaker_encoder_model --- TTS/tts/utils/speakers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index c1eede3d..8f15aada 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -10,7 +10,7 @@ from coqpit import Coqpit from torch.utils.data.sampler import WeightedRandomSampler from TTS.config import load_config -from TTS.speaker_encoder.utils.generic_utils import setup_model +from TTS.speaker_encoder.utils.generic_utils import setup_speaker_encoder_model from TTS.utils.audio import AudioProcessor @@ -252,7 +252,7 @@ class SpeakerManager: config_path (str): Model config file path. """ self.speaker_encoder_config = load_config(config_path) - self.speaker_encoder = setup_model(self.speaker_encoder_config) + self.speaker_encoder = setup_speaker_encoder_model(self.speaker_encoder_config) self.speaker_encoder.load_checkpoint(config_path, model_path, eval=True, use_cuda=self.use_cuda) self.speaker_encoder_ap = AudioProcessor(**self.speaker_encoder_config.audio) # normalize the input audio level and trim silences From 3818bd0c2308fe31e5a3a811e33fb14931267341 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 13 Dec 2021 16:29:19 +0000 Subject: [PATCH 196/220] Fixup --- TTS/speaker_encoder/utils/generic_utils.py | 24 +++++++++++----------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/TTS/speaker_encoder/utils/generic_utils.py b/TTS/speaker_encoder/utils/generic_utils.py index c926e215..dab79f3c 100644 --- a/TTS/speaker_encoder/utils/generic_utils.py +++ b/TTS/speaker_encoder/utils/generic_utils.py @@ -170,21 +170,21 @@ def to_camel(text): return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text) -def setup_model(c): - if c.model_params["model_name"].lower() == "lstm": +def setup_speaker_encoder_model(config: "Coqpit"): + if config.model_params["model_name"].lower() == "lstm": model = LSTMSpeakerEncoder( - c.model_params["input_dim"], - c.model_params["proj_dim"], - c.model_params["lstm_dim"], - c.model_params["num_lstm_layers"], + config.model_params["input_dim"], + config.model_params["proj_dim"], + config.model_params["lstm_dim"], + config.model_params["num_lstm_layers"], ) - elif c.model_params["model_name"].lower() == "resnet": + elif config.model_params["model_name"].lower() == "resnet": model = ResNetSpeakerEncoder( - input_dim=c.model_params["input_dim"], - proj_dim=c.model_params["proj_dim"], - log_input=c.model_params.get("log_input", False), - use_torch_spec=c.model_params.get("use_torch_spec", False), - audio_config=c.audio, + input_dim=config.model_params["input_dim"], + proj_dim=config.model_params["proj_dim"], + log_input=config.model_params.get("log_input", False), + use_torch_spec=config.model_params.get("use_torch_spec", False), + audio_config=config.audio, ) return model From 3c6d7f495cf4d7fb7b7d97c96615d406ffd38f2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 13 Dec 2021 16:30:15 +0000 Subject: [PATCH 197/220] Fixup --- 
TTS/speaker_encoder/models/resnet.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/TTS/speaker_encoder/models/resnet.py b/TTS/speaker_encoder/models/resnet.py index 3b96f270..92d34494 100644 --- a/TTS/speaker_encoder/models/resnet.py +++ b/TTS/speaker_encoder/models/resnet.py @@ -1,12 +1,11 @@ import numpy as np import torch +import torchaudio from torch import nn from TTS.utils.audio import TorchSTFT from TTS.utils.io import load_fsspec -import torchaudio - class PreEmphasis(nn.Module): def __init__(self, coefficient=0.97): @@ -134,7 +133,7 @@ class ResNetSpeakerEncoder(nn.Module): hop_length=audio_config["hop_length"], window_fn=torch.hamming_window, n_mels=audio_config["num_mels"], - ) + ), ) else: self.torch_spec = None From 4c50f6f4df6a2ed11958662fef9fdf226239e402 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 16 Dec 2021 14:53:57 +0000 Subject: [PATCH 198/220] Add functions to get and check and argument in config and config.model_args --- TTS/bin/train_tts.py | 8 ++++---- TTS/config/__init__.py | 24 ++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index 191cba00..3360a940 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -1,6 +1,6 @@ import os -from TTS.config import load_config, register_config +from TTS.config import check_config_and_model_args, get_from_config_or_model_args, load_config, register_config from TTS.trainer import Trainer, TrainingArgs from TTS.tts.datasets import load_tts_samples from TTS.tts.models import setup_model @@ -46,14 +46,14 @@ def main(): ap = AudioProcessor(**config.audio) # init speaker manager - if config.use_speaker_embedding: + if check_config_and_model_args(config, "use_speaker_embedding", True): speaker_manager = SpeakerManager(data_items=train_samples + eval_samples) if hasattr(config, "model_args"): config.model_args.num_speakers = speaker_manager.num_speakers else: config.num_speakers = speaker_manager.num_speakers - elif config.use_d_vector_file: - speaker_manager = SpeakerManager(d_vectors_file_path=config.d_vector_file) + elif check_config_and_model_args(config, "use_d_vector_file", True): + speaker_manager = SpeakerManager(d_vectors_file_path=get_from_config_or_model_args(config, "d_vector_file")) if hasattr(config, "model_args"): config.model_args.num_speakers = speaker_manager.num_speakers else: diff --git a/TTS/config/__init__.py b/TTS/config/__init__.py index f626163f..65950de6 100644 --- a/TTS/config/__init__.py +++ b/TTS/config/__init__.py @@ -95,3 +95,27 @@ def load_config(config_path: str) -> None: config = config_class() config.from_dict(config_dict) return config + + +def check_config_and_model_args(config, arg_name, value): + """Check the give argument in `config.model_args` if exist or in `config` for + the given value. + + It is to patch up the compatibility between models with and without `model_args`. + + TODO: Remove this in the future with a unified approach. + """ + if hasattr(config, "model_args"): + if arg_name in config.model_args: + return config.model_args[arg_name] == value + if hasattr(config, arg_name): + return config[arg_name] == value + raise ValueError(f" [!] 
{arg_name} is not found in config or config.model_args") + + +def get_from_config_or_model_args(config, arg_name): + """Get the given argument from `config.model_args` if exist or in `config`.""" + if hasattr(config, "model_args"): + if arg_name in config.model_args: + return config.model_args[arg_name] + return config[arg_name] From 4d13b887f5c5758799696c0d6cc2ff35f1905700 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 16 Dec 2021 14:55:43 +0000 Subject: [PATCH 199/220] Change speaker_idx to speaker_name --- TTS/server/server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/server/server.py b/TTS/server/server.py index f7bc79c4..2c6bebfd 100644 --- a/TTS/server/server.py +++ b/TTS/server/server.py @@ -165,7 +165,7 @@ def tts(): style_wav = style_wav_uri_to_dict(style_wav) print(" > Model input: {}".format(text)) - wavs = synthesizer.tts(text, speaker_idx=speaker_idx, style_wav=style_wav) + wavs = synthesizer.tts(text, speaker_name=speaker_idx, style_wav=style_wav) out = io.BytesIO() synthesizer.save_wav(wavs, out) return send_file(out, mimetype="audio/wav") From d29c3780d1479718b2c9cd549aedfed27cda536a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 16 Dec 2021 14:56:34 +0000 Subject: [PATCH 200/220] Use speaker_encoder from speaker manager in Vits --- TTS/speaker_encoder/models/resnet.py | 3 +-- TTS/tts/models/vits.py | 31 +++++++++++----------------- 2 files changed, 13 insertions(+), 21 deletions(-) diff --git a/TTS/speaker_encoder/models/resnet.py b/TTS/speaker_encoder/models/resnet.py index 92d34494..7a384ef5 100644 --- a/TTS/speaker_encoder/models/resnet.py +++ b/TTS/speaker_encoder/models/resnet.py @@ -3,7 +3,7 @@ import torch import torchaudio from torch import nn -from TTS.utils.audio import TorchSTFT +# from TTS.utils.audio import TorchSTFT from TTS.utils.io import load_fsspec @@ -258,7 +258,6 @@ class ResNetSpeakerEncoder(nn.Module): if return_mean: embeddings = torch.mean(embeddings, dim=0, keepdim=True) - return embeddings def load_checkpoint(self, config: dict, checkpoint_path: str, eval: bool = False, use_cuda: bool = False): diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 7f83f452..ddf6800f 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -406,42 +406,32 @@ class Vits(BaseTTS): raise RuntimeError( " [!] To use the speaker consistency loss (SCL) you need to specify speaker_encoder_model_path and speaker_encoder_config_path !!" ) - self.speaker_manager.init_speaker_encoder( - config.speaker_encoder_model_path, config.speaker_encoder_config_path - ) - self.speaker_encoder = self.speaker_manager.speaker_encoder.train() - for param in self.speaker_encoder.parameters(): - param.requires_grad = False + self.speaker_manager.speaker_encoder.eval() print(" > External Speaker Encoder Loaded !!") if ( - hasattr(self.speaker_encoder, "audio_config") - and self.config.audio["sample_rate"] != self.speaker_encoder.audio_config["sample_rate"] + hasattr(self.speaker_manager.speaker_encoder, "audio_config") + and self.config.audio["sample_rate"] != self.speaker_manager.speaker_encoder.audio_config["sample_rate"] ): # TODO: change this with torchaudio Resample raise RuntimeError( " [!] 
To use the speaker consistency loss (SCL) you need to have matching sample rates between the TTS model ({}) and the speaker encoder ({})!".format( - self.config.audio["sample_rate"], self.speaker_encoder.audio_config["sample_rate"] + self.config.audio["sample_rate"], + self.speaker_manager.speaker_encoder.audio_config["sample_rate"], ) ) # pylint: disable=W0101,W0105 """ self.audio_transform = torchaudio.transforms.Resample( orig_freq=self.audio_config["sample_rate"], - new_freq=self.speaker_encoder.audio_config["sample_rate"], + new_freq=self.speaker_manager.speaker_encoder.audio_config["sample_rate"], ) else: self.audio_transform = None """ - else: - # self.audio_transform = None - self.speaker_encoder = None def _init_speaker_embedding(self, config): # pylint: disable=attribute-defined-outside-init - if config.speakers_file is not None: - self.speaker_manager = SpeakerManager(speaker_id_file_path=config.speakers_file) - if self.num_speakers > 0: print(" > initialization of speaker-embedding layers.") self.embedded_speaker_dim = config.speaker_embedding_channels @@ -451,7 +441,6 @@ class Vits(BaseTTS): # pylint: disable=attribute-defined-outside-init if hasattr(self, "emb_g"): raise ValueError("[!] Speaker embedding layer already initialized before d_vector settings.") - self.speaker_manager = SpeakerManager(d_vectors_file_path=config.d_vector_file) self.embedded_speaker_dim = config.d_vector_dim def init_multilingual(self, config: Coqpit): @@ -644,7 +633,7 @@ class Vits(BaseTTS): self.args.spec_segment_size * self.config.audio.hop_length, ) - if self.args.use_speaker_encoder_as_loss and self.speaker_encoder is not None: + if self.args.use_speaker_encoder_as_loss and self.speaker_manager.speaker_encoder is not None: # concate generated and GT waveforms wavs_batch = torch.cat((wav_seg, o), dim=0).squeeze(1) @@ -653,7 +642,7 @@ class Vits(BaseTTS): """if self.audio_transform is not None: wavs_batch = self.audio_transform(wavs_batch)""" - pred_embs = self.speaker_encoder.forward(wavs_batch, l2_norm=True) + pred_embs = self.speaker_manager.speaker_encoder.forward(wavs_batch, l2_norm=True) # split generated and GT speaker embeddings gt_spk_emb, syn_spk_emb = torch.chunk(pred_embs, 2, dim=0) @@ -1024,6 +1013,10 @@ class Vits(BaseTTS): ): # pylint: disable=unused-argument, redefined-builtin """Load the model checkpoint and setup for training or inference""" state = torch.load(checkpoint_path, map_location=torch.device("cpu")) + # compat band-aid for the pre-trained models to not use the encoder baked into the model + # TODO: consider baking the speaker encoder into the model and call it from there. + # as it is probably easier for model distribution. 
+ state["model"] = {k: v for k, v in state["model"].items() if "speaker_encoder" not in k} self.load_state_dict(state["model"]) if eval: self.eval() From 473414d4afe3e6cceecda13705b2fe84be8f2302 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 16 Dec 2021 14:57:24 +0000 Subject: [PATCH 201/220] Implement init_speaker_encoder and change arg names --- TTS/utils/synthesizer.py | 40 +++++++++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index bd90dd8c..62540ae2 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -1,5 +1,5 @@ import time -from typing import List +from typing import List, Union import numpy as np import pysbd @@ -117,6 +117,7 @@ class Synthesizer(object): speaker_manager = self._init_speaker_manager() language_manager = self._init_language_manager() + speaker_manager = self._init_speaker_encoder(speaker_manager) if language_manager is not None: self.tts_model = setup_tts_model( @@ -130,23 +131,47 @@ class Synthesizer(object): if use_cuda: self.tts_model.cuda() + def _is_use_speaker_embedding(self): + """Check if the speaker embedding is used in the model""" + # some models use model_args some don't + if hasattr(self.tts_config, "model_args"): + config = self.tts_config.model_args + else: + config = self.tts_config + return hasattr(config, "use_speaker_embedding") and config.use_speaker_embedding is True + + def _is_use_d_vector_file(self): + """Check if the d-vector file is used in the model""" + # some models use model_args some don't + if hasattr(self.tts_config, "model_args"): + config = self.tts_config.model_args + else: + config = self.tts_config + return hasattr(config, "use_d_vector_file") and config.use_d_vector_file is True + def _init_speaker_manager(self): """Initialize the SpeakerManager""" # setup if multi-speaker settings are in the global model config speaker_manager = None - if hasattr(self.tts_config, "use_speaker_embedding") and self.tts_config.use_speaker_embedding is True: + if self._is_use_speaker_embedding(): if self.tts_speakers_file: speaker_manager = SpeakerManager(speaker_id_file_path=self.tts_speakers_file) if self.tts_config.get("speakers_file", None): speaker_manager = SpeakerManager(speaker_id_file_path=self.tts_config.speakers_file) - if hasattr(self.tts_config, "use_d_vector_file") and self.tts_config.use_speaker_embedding is True: + if self._is_use_d_vector_file(): if self.tts_speakers_file: speaker_manager = SpeakerManager(d_vectors_file_path=self.tts_speakers_file) if self.tts_config.get("d_vector_file", None): speaker_manager = SpeakerManager(d_vectors_file_path=self.tts_config.d_vector_file) return speaker_manager + def _init_speaker_encoder(self, speaker_manager): + """Initialize the SpeakerEncoder""" + if self.encoder_checkpoint is not None: + speaker_manager.init_speaker_encoder(self.encoder_checkpoint, self.encoder_config) + return speaker_manager + def _init_language_manager(self): """Initialize the LanguageManager""" # setup if multi-lingual settings are in the global model config @@ -203,7 +228,12 @@ class Synthesizer(object): self.ap.save_wav(wav, path, self.output_sample_rate) def tts( - self, text: str, speaker_name: str = "", language_name: str = "", speaker_wav=None, style_wav=None + self, + text: str, + speaker_name: str = "", + language_name: str = "", + speaker_wav: Union[str, List[str]] = None, + style_wav=None, ) -> List[int]: """🐸 TTS magic. Run all the models and generate speech. 
@@ -211,7 +241,7 @@ class Synthesizer(object): text (str): input text. speaker_name (str, optional): spekaer id for multi-speaker models. Defaults to "". language_name (str, optional): language id for multi-language models. Defaults to "". - speaker_wav (): + speaker_wav (Union[str, List[str]], optional): path to the speaker wav. Defaults to None. style_wav ([type], optional): style waveform for GST. Defaults to None. Returns: From e57c117323163d3bd21698bfa85bff06b999e33d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 16 Dec 2021 14:57:54 +0000 Subject: [PATCH 202/220] Add torchaudio to requirements.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 453c3ec4..ddb6def9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,3 +27,4 @@ gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=2.0.0 fsspec>=2021.04.0 pyworld webrtcvad +torchaudio From a25269d89778f89f966acb37b448062a721af82f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 20 Dec 2021 11:52:40 +0000 Subject: [PATCH 203/220] Remove commented code --- TTS/tts/utils/speakers.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 8f15aada..07f94116 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -255,9 +255,6 @@ class SpeakerManager: self.speaker_encoder = setup_speaker_encoder_model(self.speaker_encoder_config) self.speaker_encoder.load_checkpoint(config_path, model_path, eval=True, use_cuda=self.use_cuda) self.speaker_encoder_ap = AudioProcessor(**self.speaker_encoder_config.audio) - # normalize the input audio level and trim silences - # self.speaker_encoder_ap.do_sound_norm = True - # self.speaker_encoder_ap.do_trim_silence = True def compute_d_vector_from_clip(self, wav_file: Union[str, list]) -> list: """Compute a d_vector from a given audio file. 
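Since the hunk above touches `SpeakerManager.compute_d_vector_from_clip`, here is a minimal sketch of how it is typically driven with an external encoder. The checkpoint paths are placeholders, and the list form relies on the `Union[str, List[str]]` signature adopted later in this series.

```python
from TTS.tts.utils.speakers import SpeakerManager

# Placeholder paths to a trained speaker-encoder checkpoint and its config.
manager = SpeakerManager(
    encoder_model_path="speaker_encoder/best_model.pth.tar",
    encoder_config_path="speaker_encoder/config.json",
)

# Single reference clip ...
d_vector = manager.compute_d_vector_from_clip("refs/speaker_a.wav")
# ... or a list of clips, per the Union[str, List[str]] signature.
d_vector = manager.compute_d_vector_from_clip(["refs/a_1.wav", "refs/a_2.wav"])
print(len(d_vector))  # dimensionality of the d-vector (the encoder's proj_dim)
```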
From f7695951128bdcf52a235610b840390ec3d91da3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 20 Dec 2021 11:53:44 +0000 Subject: [PATCH 204/220] Add more listing options to ModelManager --- TTS/utils/manage.py | 72 ++++++++++++++++++++++++++++++++------------- 1 file changed, 51 insertions(+), 21 deletions(-) diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index cfbbdff0..d1dedbe0 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -46,36 +46,66 @@ class ModelManager(object): with open(file_path, "r", encoding="utf-8") as json_file: self.models_dict = json.load(json_file) - def list_langs(self): - print(" Name format: type/language") - for model_type in self.models_dict: - for lang in self.models_dict[model_type]: - print(f" >: {model_type}/{lang} ") + def _list_models(self, model_type, model_count=0): + model_list = [] + for lang in self.models_dict[model_type]: + for dataset in self.models_dict[model_type][lang]: + for model in self.models_dict[model_type][lang][dataset]: + model_full_name = f"{model_type}--{lang}--{dataset}--{model}" + output_path = os.path.join(self.output_prefix, model_full_name) + if os.path.exists(output_path): + print(f" {model_count}: {model_type}/{lang}/{dataset}/{model} [already downloaded]") + else: + print(f" {model_count}: {model_type}/{lang}/{dataset}/{model}") + model_list.append(f"{model_type}/{lang}/{dataset}/{model}") + model_count += 1 + return model_list - def list_datasets(self): - print(" Name format: type/language/dataset") - for model_type in self.models_dict: - for lang in self.models_dict[model_type]: - for dataset in self.models_dict[model_type][lang]: - print(f" >: {model_type}/{lang}/{dataset}") + def _list_for_model_type(self, model_type): + print(" Name format: language/dataset/model") + models_name_list = [] + model_count = 1 + model_type = "tts_models" + models_name_list.extend(self._list_models(model_type, model_count)) + return [name.replace(model_type + "/", "") for name in models_name_list] def list_models(self): print(" Name format: type/language/dataset/model") models_name_list = [] model_count = 1 + for model_type in self.models_dict: + model_list = self._list_models(model_type, model_count) + models_name_list.extend(model_list) + return models_name_list + + def list_tts_models(self): + """Print all `TTS` models and return a list of model names + + Format is `language/dataset/model` + """ + return self._list_for_model_type("tts_models") + + def list_vocoder_models(self): + """Print all the `vocoder` models and return a list of model names + + Format is `language/dataset/model` + """ + return self._list_for_model_type("vocoder_models") + + def list_langs(self): + """Print all the available languages""" + print(" Name format: type/language") + for model_type in self.models_dict: + for lang in self.models_dict[model_type]: + print(f" >: {model_type}/{lang} ") + + def list_datasets(self): + """Print all the datasets""" + print(" Name format: type/language/dataset") for model_type in self.models_dict: for lang in self.models_dict[model_type]: for dataset in self.models_dict[model_type][lang]: - for model in self.models_dict[model_type][lang][dataset]: - model_full_name = f"{model_type}--{lang}--{dataset}--{model}" - output_path = os.path.join(self.output_prefix, model_full_name) - if os.path.exists(output_path): - print(f" {model_count}: {model_type}/{lang}/{dataset}/{model} [already downloaded]") - else: - print(f" {model_count}: {model_type}/{lang}/{dataset}/{model}") - 
models_name_list.append(f"{model_type}/{lang}/{dataset}/{model}") - model_count += 1 - return models_name_list + print(f" >: {model_type}/{lang}/{dataset}") def download_model(self, model_name): """Download model files given the full model name. From c9c1fa05487a8843ed4fadce3995332dc5dbfbea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 21 Dec 2021 09:44:07 +0000 Subject: [PATCH 205/220] Fix multi-speaker init in Synthesizer --- TTS/config/__init__.py | 5 +++-- TTS/utils/synthesizer.py | 27 +++++++++++++-------------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/TTS/config/__init__.py b/TTS/config/__init__.py index 65950de6..8ed3578f 100644 --- a/TTS/config/__init__.py +++ b/TTS/config/__init__.py @@ -101,7 +101,8 @@ def check_config_and_model_args(config, arg_name, value): """Check the give argument in `config.model_args` if exist or in `config` for the given value. - It is to patch up the compatibility between models with and without `model_args`. + Return False if the argument does not exist in `config.model_args` or `config`. + This is to patch up the compatibility between models with and without `model_args`. TODO: Remove this in the future with a unified approach. """ @@ -110,7 +111,7 @@ def check_config_and_model_args(config, arg_name, value): return config.model_args[arg_name] == value if hasattr(config, arg_name): return config[arg_name] == value - raise ValueError(f" [!] {arg_name} is not found in config or config.model_args") + return False def get_from_config_or_model_args(config, arg_name): diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 62540ae2..905f50d7 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -5,7 +5,7 @@ import numpy as np import pysbd import torch -from TTS.config import load_config +from TTS.config import check_config_and_model_args, load_config from TTS.tts.models import setup_model as setup_tts_model from TTS.tts.utils.languages import LanguageManager from TTS.tts.utils.speakers import SpeakerManager @@ -133,21 +133,23 @@ class Synthesizer(object): def _is_use_speaker_embedding(self): """Check if the speaker embedding is used in the model""" - # some models use model_args some don't + # we handle here the case that some models use model_args some don't + use_speaker_embedding = False if hasattr(self.tts_config, "model_args"): - config = self.tts_config.model_args - else: - config = self.tts_config - return hasattr(config, "use_speaker_embedding") and config.use_speaker_embedding is True + use_speaker_embedding = self.tts_config["model_args"].get("use_speaker_embedding", False) + use_speaker_embedding = use_speaker_embedding or self.tts_config.get("use_speaker_embedding", False) + return use_speaker_embedding def _is_use_d_vector_file(self): """Check if the d-vector file is used in the model""" - # some models use model_args some don't + # we handle here the case that some models use model_args some don't + use_d_vector_file = False if hasattr(self.tts_config, "model_args"): config = self.tts_config.model_args - else: - config = self.tts_config - return hasattr(config, "use_d_vector_file") and config.use_d_vector_file is True + use_d_vector_file = config.get("use_d_vector_file", False) + config = self.tts_config + use_d_vector_file = use_d_vector_file or config.get("use_d_vector_file", False) + return use_d_vector_file def _init_speaker_manager(self): """Initialize the SpeakerManager""" @@ -176,10 +178,7 @@ class Synthesizer(object): """Initialize the LanguageManager""" # 
setup if multi-lingual settings are in the global model config language_manager = None - if ( - hasattr(self.tts_config.model_args, "use_language_embedding") - and self.tts_config.model_args.use_language_embedding is True - ): + if check_config_and_model_args(self.tts_config, "use_language_embedding", True): if self.tts_languages_file: language_manager = LanguageManager(language_ids_file_path=self.tts_languages_file) elif self.tts_config.get("language_ids_file", None): From 56378b12f7fbf2bc49c6e1911bc9027e3b279f26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 21 Dec 2021 12:19:32 +0000 Subject: [PATCH 206/220] Fix speaker encoder init --- TTS/bin/train_encoder.py | 4 ++-- TTS/server/server.py | 10 +++++++++- TTS/utils/synthesizer.py | 2 +- tests/aux_tests/test_speaker_manager.py | 4 ++-- 4 files changed, 14 insertions(+), 6 deletions(-) diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py index ad6d95f7..8c364300 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -11,7 +11,7 @@ from torch.utils.data import DataLoader from TTS.speaker_encoder.dataset import SpeakerEncoderDataset from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss -from TTS.speaker_encoder.utils.generic_utils import save_best_model, setup_model +from TTS.speaker_encoder.utils.generic_utils import save_best_model, setup_speaker_encoder_model from TTS.speaker_encoder.utils.training import init_training from TTS.speaker_encoder.utils.visual import plot_embeddings from TTS.tts.datasets import load_tts_samples @@ -151,7 +151,7 @@ def main(args): # pylint: disable=redefined-outer-name global meta_data_eval ap = AudioProcessor(**c.audio) - model = setup_model(c) + model = setup_speaker_encoder_model(c) optimizer = RAdam(model.parameters(), lr=c.lr) diff --git a/TTS/server/server.py b/TTS/server/server.py index 2c6bebfd..f2512582 100644 --- a/TTS/server/server.py +++ b/TTS/server/server.py @@ -100,7 +100,15 @@ if args.vocoder_path is not None: # load models synthesizer = Synthesizer( - model_path, config_path, speakers_file_path, None, vocoder_path, vocoder_config_path, use_cuda=args.use_cuda + tts_checkpoint=model_path, + tts_config_path=config_path, + tts_speakers_file=speakers_file_path, + tts_languages_file=None, + vocoder_checkpoint=vocoder_path, + vocoder_config=vocoder_config_path, + encoder_checkpoint="", + encoder_config="", + use_cuda=args.use_cuda, ) use_multi_speaker = hasattr(synthesizer.tts_model, "num_speakers") and synthesizer.tts_model.num_speakers > 1 diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 905f50d7..db54027d 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -170,7 +170,7 @@ class Synthesizer(object): def _init_speaker_encoder(self, speaker_manager): """Initialize the SpeakerEncoder""" - if self.encoder_checkpoint is not None: + if self.encoder_checkpoint: speaker_manager.init_speaker_encoder(self.encoder_checkpoint, self.encoder_config) return speaker_manager diff --git a/tests/aux_tests/test_speaker_manager.py b/tests/aux_tests/test_speaker_manager.py index baa50749..b56c5258 100644 --- a/tests/aux_tests/test_speaker_manager.py +++ b/tests/aux_tests/test_speaker_manager.py @@ -6,7 +6,7 @@ import torch from tests import get_tests_input_path from TTS.config import load_config -from TTS.speaker_encoder.utils.generic_utils import setup_model +from TTS.speaker_encoder.utils.generic_utils import setup_speaker_encoder_model from TTS.speaker_encoder.utils.io import 
save_checkpoint from TTS.tts.utils.speakers import SpeakerManager from TTS.utils.audio import AudioProcessor @@ -28,7 +28,7 @@ class SpeakerManagerTest(unittest.TestCase): config.audio.resample = True # create a dummy speaker encoder - model = setup_model(config) + model = setup_speaker_encoder_model(config) save_checkpoint(model, None, None, get_tests_input_path(), 0) # load audio processor and speaker encoder From 633dcc9c563fc160923f5291b03a7ea89e014d7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 22 Dec 2021 15:51:14 +0000 Subject: [PATCH 207/220] Implement RMS volume normalization --- TTS/config/shared_configs.py | 9 +++++++ TTS/utils/audio.py | 52 +++++++++++++++++++++++++----------- 2 files changed, 46 insertions(+), 15 deletions(-) diff --git a/TTS/config/shared_configs.py b/TTS/config/shared_configs.py index c52cfe8a..9e9d4692 100644 --- a/TTS/config/shared_configs.py +++ b/TTS/config/shared_configs.py @@ -60,6 +60,12 @@ class BaseAudioConfig(Coqpit): trim_db (int): Silence threshold used for silence trimming. Defaults to 45. + do_rms_norm (bool, optional): + enable/disable RMS volume normalization when loading an audio file. Defaults to False. + + db_level (int, optional): + dB level used for rms normalization. The range is -99 to 0. Defaults to None. + power (float): Exponent used for expanding spectrogra levels before running Griffin Lim. It helps to reduce the artifacts in the synthesized voice. Defaults to 1.5. @@ -116,6 +122,9 @@ class BaseAudioConfig(Coqpit): # silence trimming do_trim_silence: bool = True trim_db: int = 45 + # rms volume normalization + do_rms_norm: bool = False + db_level: float = None # griffin-lim params power: float = 1.5 griffin_lim_iters: int = 60 diff --git a/TTS/utils/audio.py b/TTS/utils/audio.py index d01196c4..25f93c34 100644 --- a/TTS/utils/audio.py +++ b/TTS/utils/audio.py @@ -266,6 +266,12 @@ class AudioProcessor(object): do_amp_to_db_mel (bool, optional): enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True. + do_rms_norm (bool, optional): + enable/disable RMS volume normalization when loading an audio file. Defaults to False. + + db_level (int, optional): + dB level used for rms normalization. The range is -99 to 0. Defaults to None. + stats_path (str, optional): Path to the computed stats file. Defaults to None. 
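The new `do_rms_norm`/`db_level` options boil down to a single gain factor. The standalone sketch below repeats the same arithmetic outside `AudioProcessor` so the target level is easy to verify; the dummy signal and the -27 dB level are arbitrary choices for the example.

```python
import numpy as np

def rms_norm(wav: np.ndarray, db_level: float = -27.0) -> np.ndarray:
    # db_level must lie in [-99, 0], mirroring the assert in rms_volume_norm.
    r = 10 ** (db_level / 20)                              # target linear RMS
    a = np.sqrt((len(wav) * (r ** 2)) / np.sum(wav ** 2))  # gain that reaches it
    return wav * a

wav = np.random.uniform(-0.3, 0.3, size=16000).astype(np.float32)  # dummy 1 s signal
out = rms_norm(wav, db_level=-27.0)
print(20 * np.log10(np.sqrt(np.mean(out ** 2))))  # ~ -27.0 dB after scaling
```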
@@ -303,6 +309,8 @@ class AudioProcessor(object): do_sound_norm=False, do_amp_to_db_linear=True, do_amp_to_db_mel=True, + do_rms_norm=False, + db_level=None, stats_path=None, verbose=True, **_, @@ -334,6 +342,8 @@ class AudioProcessor(object): self.do_sound_norm = do_sound_norm self.do_amp_to_db_linear = do_amp_to_db_linear self.do_amp_to_db_mel = do_amp_to_db_mel + self.do_rms_norm = do_rms_norm + self.db_level = db_level self.stats_path = stats_path # setup exp_func for db to amp conversion if log_func == "np.log": @@ -726,21 +736,6 @@ class AudioProcessor(object): frame_period=1000 * self.hop_length / self.sample_rate, ) f0 = pw.stonemask(x.astype(np.double), f0, t, self.sample_rate) - # pad = int((self.win_length / self.hop_length) / 2) - # f0 = [0.0] * pad + f0 + [0.0] * pad - # f0 = np.pad(f0, (pad, pad), mode="constant", constant_values=0) - # f0 = np.array(f0, dtype=np.float32) - - # f01, _, _ = librosa.pyin( - # x, - # fmin=65 if self.mel_fmin == 0 else self.mel_fmin, - # fmax=self.mel_fmax, - # frame_length=self.win_length, - # sr=self.sample_rate, - # fill_na=0.0, - # ) - - # spec = self.melspectrogram(x) return f0 ### Audio Processing ### @@ -783,10 +778,33 @@ class AudioProcessor(object): """ return x / abs(x).max() * 0.95 + @staticmethod + def _rms_norm(wav, db_level=-27): + r = 10 ** (db_level / 20) + a = np.sqrt((len(wav) * (r ** 2)) / np.sum(wav ** 2)) + return wav * a + + def rms_volume_norm(self, x: np.ndarray, db_level: float = None) -> np.ndarray: + """Normalize the volume based on RMS of the signal. + + Args: + x (np.ndarray): Raw waveform. + + Returns: + np.ndarray: RMS normalized waveform. + """ + if db_level is None: + db_level = self.db_level + assert -99 <= db_level <= 0, " [!] db_level should be between -99 and 0" + wav = self._rms_norm(x, db_level) + return wav + ### save and load ### def load_wav(self, filename: str, sr: int = None) -> np.ndarray: """Read a wav file using Librosa and optionally resample, silence trim, volume normalize. + Resampling slows down loading the file significantly. Therefore it is recommended to resample the file before. + Args: filename (str): Path to the wav file. sr (int, optional): Sampling rate for resampling. Defaults to None. @@ -795,8 +813,10 @@ class AudioProcessor(object): np.ndarray: Loaded waveform. """ if self.resample: + # loading with resampling. It is significantly slower. x, sr = librosa.load(filename, sr=self.sample_rate) elif sr is None: + # SF is faster than librosa for loading files x, sr = sf.read(filename) assert self.sample_rate == sr, "%s vs %s" % (self.sample_rate, sr) else: @@ -808,6 +828,8 @@ class AudioProcessor(object): print(f" [!] 
File cannot be trimmed for silence - {filename}") if self.do_sound_norm: x = self.sound_norm(x) + if self.do_rms_norm: + x = self.rms_volume_norm(x, self.db_level) return x def save_wav(self, wav: np.ndarray, path: str, sr: int = None) -> None: From 5c5ddd2ba77e6dd51ec3ba86aef8f5d1bdda1045 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 22 Dec 2021 15:51:53 +0000 Subject: [PATCH 208/220] Init speaker manager for speaker encoder --- TTS/tts/utils/speakers.py | 4 ++-- TTS/utils/synthesizer.py | 9 ++++++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 07f94116..a6e15cdd 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -256,11 +256,11 @@ class SpeakerManager: self.speaker_encoder.load_checkpoint(config_path, model_path, eval=True, use_cuda=self.use_cuda) self.speaker_encoder_ap = AudioProcessor(**self.speaker_encoder_config.audio) - def compute_d_vector_from_clip(self, wav_file: Union[str, list]) -> list: + def compute_d_vector_from_clip(self, wav_file: Union[str, List[str]]) -> list: """Compute a d_vector from a given audio file. Args: - wav_file (Union[str, list]): Target file path. + wav_file (Union[str, List[str]]): Target file path. Returns: list: Computed d_vector. diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index db54027d..7a2d3097 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -171,7 +171,12 @@ class Synthesizer(object): def _init_speaker_encoder(self, speaker_manager): """Initialize the SpeakerEncoder""" if self.encoder_checkpoint: - speaker_manager.init_speaker_encoder(self.encoder_checkpoint, self.encoder_config) + if speaker_manager is None: + speaker_manager = SpeakerManager( + encoder_model_path=self.encoder_checkpoint, encoder_config_path=self.encoder_config + ) + else: + speaker_manager.init_speaker_encoder(self.encoder_checkpoint, self.encoder_config) return speaker_manager def _init_language_manager(self): @@ -183,6 +188,8 @@ class Synthesizer(object): language_manager = LanguageManager(language_ids_file_path=self.tts_languages_file) elif self.tts_config.get("language_ids_file", None): language_manager = LanguageManager(language_ids_file_path=self.tts_config.language_ids_file) + else: + language_manager = LanguageManager(config=self.tts_config) return language_manager def _load_vocoder(self, model_file: str, model_config: str, use_cuda: bool) -> None: From 55ce7f0df10fb9ad108f7d86929069899bdb1214 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 22 Dec 2021 15:53:44 +0000 Subject: [PATCH 209/220] Update .gitignore --- .gitignore | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 64d1f0d5..7e9da0d8 100644 --- a/.gitignore +++ b/.gitignore @@ -128,6 +128,8 @@ core recipes/WIP/* recipes/ljspeech/LJSpeech-1.1/* recipes/vctk/VCTK/* +recipes/**/*.npy +recipes/**/*.json VCTK-Corpus-removed-silence/* # ignore training logs @@ -161,4 +163,5 @@ speakers.json internal/* *_pitch.npy *_phoneme.npy -wandb \ No newline at end of file +wandb +depot/* \ No newline at end of file From 2033e17c4434ec17f4186539785796fde099a805 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 29 Dec 2021 16:51:40 +0000 Subject: [PATCH 210/220] Add VITS model tests --- tests/inputs/language_ids.json | 5 + tests/tts_tests/test_vits.py | 214 +++++++++++++++++++++++++++++++++ 2 files changed, 219 insertions(+) create mode 100644 
tests/inputs/language_ids.json create mode 100644 tests/tts_tests/test_vits.py diff --git a/tests/inputs/language_ids.json b/tests/inputs/language_ids.json new file mode 100644 index 00000000..27bb1520 --- /dev/null +++ b/tests/inputs/language_ids.json @@ -0,0 +1,5 @@ +{ + "en": 0, + "fr-fr": 1, + "pt-br": 2 +} \ No newline at end of file diff --git a/tests/tts_tests/test_vits.py b/tests/tts_tests/test_vits.py new file mode 100644 index 00000000..335472a5 --- /dev/null +++ b/tests/tts_tests/test_vits.py @@ -0,0 +1,214 @@ +import os +import torch +import unittest +from TTS.config import load_config +from TTS.tts.models.vits import Vits, VitsArgs +from TTS.tts.configs.vits_config import VitsConfig +from TTS.tts.utils.speakers import SpeakerManager +from tests import assertHasAttr, assertHasNotAttr, get_tests_input_path +from TTS.speaker_encoder.utils.generic_utils import setup_speaker_encoder_model + + +LANG_FILE = os.path.join(get_tests_input_path(), "language_ids.json") +SPEAKER_ENCODER_CONFIG = os.path.join(get_tests_input_path(), "test_speaker_encoder_config.json") + + +torch.manual_seed(1) +use_cuda = torch.cuda.is_available() +device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + + +class TestVits(unittest.TestCase): + + def test_init_multispeaker(self): + num_speakers = 10 + args = VitsArgs(num_speakers=num_speakers, use_speaker_embedding=True) + model = Vits(args) + assertHasAttr(self, model, 'emb_g') + + args = VitsArgs(num_speakers=0, use_speaker_embedding=True) + model = Vits(args) + assertHasNotAttr(self, model, 'emb_g') + + args = VitsArgs(num_speakers=10, use_speaker_embedding=False) + model = Vits(args) + assertHasNotAttr(self, model, 'emb_g') + + args = VitsArgs(d_vector_dim=101, use_d_vector_file=True) + model = Vits(args) + self.assertEqual(model.embedded_speaker_dim, 101) + + def test_init_multilingual(self): + args = VitsArgs(language_ids_file=None, use_language_embedding=False) + model = Vits(args) + self.assertEqual(model.language_manager, None) + self.assertEqual(model.embedded_language_dim, 0) + self.assertEqual(model.emb_l, None) + + args = VitsArgs(language_ids_file=LANG_FILE) + model = Vits(args) + self.assertNotEqual(model.language_manager, None) + self.assertEqual(model.embedded_language_dim, 0) + self.assertEqual(model.emb_l, None) + + args = VitsArgs(language_ids_file=LANG_FILE, use_language_embedding=True) + model = Vits(args) + self.assertNotEqual(model.language_manager, None) + self.assertEqual(model.embedded_language_dim, args.embedded_language_dim) + self.assertNotEqual(model.emb_l, None) + + args = VitsArgs(language_ids_file=LANG_FILE, use_language_embedding=True, embedded_language_dim=102) + model = Vits(args) + self.assertNotEqual(model.language_manager, None) + self.assertEqual(model.embedded_language_dim, args.embedded_language_dim) + self.assertNotEqual(model.emb_l, None) + + def test_get_aux_input(self): + aux_input = {"speaker_ids": None, "style_wav": None, "d_vectors": None, "language_ids": None} + args = VitsArgs() + model = Vits(args) + aux_out= model.get_aux_input(aux_input) + + speaker_id = torch.randint(10, (1,)) + language_id = torch.randint(10, (1,)) + d_vector = torch.rand(1, 128) + aux_input = {"speaker_ids": speaker_id, "style_wav": None, "d_vectors": d_vector, "language_ids": language_id} + aux_out = model.get_aux_input(aux_input) + self.assertEqual(aux_out["speaker_ids"].shape, speaker_id.shape) + self.assertEqual(aux_out["language_ids"].shape, language_id.shape) + self.assertEqual(aux_out["d_vectors"].shape, 
d_vector.unsqueeze(0).transpose(2, 1).shape) + + def test_voice_conversion(self): + num_speakers = 10 + spec_len = 101 + spec_effective_len = 50 + + args = VitsArgs(num_speakers=num_speakers, use_speaker_embedding=True) + model = Vits(args) + + ref_inp = torch.randn(1, spec_len, 513) + ref_inp_len = torch.randint(1, spec_effective_len, (1,)) + ref_spk_id = torch.randint(0, num_speakers, (1,)) + tgt_spk_id = torch.randint(0, num_speakers, (1,)) + o_hat, y_mask, (z, z_p, z_hat) = model.voice_conversion(ref_inp, ref_inp_len, ref_spk_id, tgt_spk_id) + + self.assertEqual(o_hat.shape, (1, 1, spec_len * 256)) + self.assertEqual(y_mask.shape, (1, 1, spec_len)) + self.assertEqual(y_mask.sum(), ref_inp_len[0]) + self.assertEqual(z.shape, (1, args.hidden_channels, spec_len)) + self.assertEqual(z_p.shape, (1, args.hidden_channels, spec_len)) + self.assertEqual(z_hat.shape, (1, args.hidden_channels, spec_len)) + + def _init_inputs(self, config): + input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) + input_lengths = torch.randint(100, 129, (8,)).long().to(device) + input_lengths[-1] = 128 + spec = torch.rand(8, config.audio["fft_size"] // 2 + 1, 30).to(device) + spec_lengths = torch.randint(20, 30, (8,)).long().to(device) + spec_lengths[-1] = spec.size(2) + waveform = torch.rand(8, 1, spec.size(2) * config.audio["hop_length"]).to(device) + return input_dummy, input_lengths, spec, spec_lengths, waveform + + def _check_forward_outputs(self, config, output_dict, encoder_config=None): + self.assertEqual(output_dict['model_outputs'].shape[2], config.model_args.spec_segment_size * config.audio["hop_length"]) + self.assertEqual(output_dict["alignments"].shape, (8, 128, 30)) + self.assertEqual(output_dict["alignments"].max(), 1) + self.assertEqual(output_dict["alignments"].min(), 0) + self.assertEqual(output_dict["z"].shape, (8, config.model_args.hidden_channels, 30)) + self.assertEqual(output_dict["z_p"].shape, (8, config.model_args.hidden_channels, 30)) + self.assertEqual(output_dict["m_p"].shape, (8, config.model_args.hidden_channels, 30)) + self.assertEqual(output_dict["logs_p"].shape, (8, config.model_args.hidden_channels, 30)) + self.assertEqual(output_dict["m_q"].shape, (8, config.model_args.hidden_channels, 30)) + self.assertEqual(output_dict["logs_q"].shape, (8, config.model_args.hidden_channels, 30)) + self.assertEqual(output_dict['waveform_seg'].shape[2], config.model_args.spec_segment_size * config.audio["hop_length"]) + if encoder_config: + self.assertEqual(output_dict['gt_spk_emb'].shape, (8, encoder_config.model_params["proj_dim"])) + self.assertEqual(output_dict['syn_spk_emb'].shape, (8, encoder_config.model_params["proj_dim"])) + else: + self.assertEqual(output_dict['gt_spk_emb'], None) + self.assertEqual(output_dict['syn_spk_emb'], None) + + def test_forward(self): + num_speakers = 0 + config = VitsConfig(num_speakers=num_speakers, use_speaker_embedding=True) + config.model_args.spec_segment_size = 10 + input_dummy, input_lengths, spec, spec_lengths, waveform = self._init_inputs(config) + model = Vits(config).to(device) + output_dict = model.forward(input_dummy, input_lengths, spec, spec_lengths, waveform) + self._check_forward_outputs(config, output_dict) + + def test_multispeaker_forward(self): + num_speakers = 10 + + config = VitsConfig(num_speakers=num_speakers, use_speaker_embedding=True) + config.model_args.spec_segment_size = 10 + + input_dummy, input_lengths, spec, spec_lengths, waveform = self._init_inputs(config) + speaker_ids = torch.randint(0, num_speakers, 
(8,)).long().to(device) + + model = Vits(config).to(device) + output_dict = model.forward(input_dummy, input_lengths, spec, spec_lengths, waveform, aux_input={"speaker_ids": speaker_ids}) + self._check_forward_outputs(config, output_dict) + + def test_multilingual_forward(self): + num_speakers = 10 + num_langs = 3 + + args = VitsArgs(language_ids_file=LANG_FILE, use_language_embedding=True, spec_segment_size=10) + config = VitsConfig(num_speakers=num_speakers, use_speaker_embedding=True, model_args=args) + + input_dummy, input_lengths, spec, spec_lengths, waveform = self._init_inputs(config) + speaker_ids = torch.randint(0, num_speakers, (8,)).long().to(device) + lang_ids = torch.randint(0, num_langs, (8,)).long().to(device) + + model = Vits(config).to(device) + output_dict = model.forward(input_dummy, input_lengths, spec, spec_lengths, waveform, aux_input={"speaker_ids": speaker_ids, "language_ids": lang_ids}) + self._check_forward_outputs(config, output_dict) + + def test_secl_forward(self): + num_speakers = 10 + num_langs = 3 + + speaker_encoder_config = load_config(SPEAKER_ENCODER_CONFIG) + speaker_encoder_config.model_params["use_torch_spec"] = True + speaker_encoder = setup_speaker_encoder_model(speaker_encoder_config).to(device) + speaker_manager = SpeakerManager() + speaker_manager.speaker_encoder = speaker_encoder + + args = VitsArgs(language_ids_file=LANG_FILE, use_language_embedding=True, spec_segment_size=10, use_speaker_encoder_as_loss=True) + config = VitsConfig(num_speakers=num_speakers, use_speaker_embedding=True, model_args=args) + config.audio.sample_rate = 16000 + + input_dummy, input_lengths, spec, spec_lengths, waveform = self._init_inputs(config) + speaker_ids = torch.randint(0, num_speakers, (8,)).long().to(device) + lang_ids = torch.randint(0, num_langs, (8,)).long().to(device) + + model = Vits(config, speaker_manager=speaker_manager).to(device) + output_dict = model.forward(input_dummy, input_lengths, spec, spec_lengths, waveform, aux_input={"speaker_ids": speaker_ids, "language_ids": lang_ids}) + self._check_forward_outputs(config, output_dict, speaker_encoder_config) + + def test_inference(self): + num_speakers = 0 + config = VitsConfig(num_speakers=num_speakers, use_speaker_embedding=True) + input_dummy = torch.randint(0, 24, (1, 128)).long().to(device) + model = Vits(config).to(device) + _ = model.inference(input_dummy) + + def test_multispeaker_inference(self): + num_speakers = 10 + config = VitsConfig(num_speakers=num_speakers, use_speaker_embedding=True) + input_dummy = torch.randint(0, 24, (1, 128)).long().to(device) + speaker_ids = torch.randint(0, num_speakers, (1,)).long().to(device) + model = Vits(config).to(device) + _ = model.inference(input_dummy, {"speaker_ids": speaker_ids}) + + def test_multilingual_inference(self): + num_speakers = 10 + num_langs = 3 + args = VitsArgs(language_ids_file=LANG_FILE, use_language_embedding=True, spec_segment_size=10) + config = VitsConfig(num_speakers=num_speakers, use_speaker_embedding=True, model_args=args) + input_dummy = torch.randint(0, 24, (1, 128)).long().to(device) + speaker_ids = torch.randint(0, num_speakers, (1,)).long().to(device) + lang_ids = torch.randint(0, num_langs, (1,)).long().to(device) + model = Vits(config).to(device) + _ = model.inference(input_dummy, {"speaker_ids": speaker_ids, "language_ids": lang_ids}) \ No newline at end of file From 638091f41d289908e1f0a435eb26668c06e3ef05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 30 Dec 2021 12:02:06 +0000 Subject: [PATCH 
211/220] Update Speaker Encoder models --- TTS/speaker_encoder/dataset.py | 2 +- TTS/speaker_encoder/models/lstm.py | 79 +++++++++++++++++++--- TTS/speaker_encoder/models/resnet.py | 19 +++++- TTS/speaker_encoder/utils/generic_utils.py | 2 + 4 files changed, 88 insertions(+), 14 deletions(-) diff --git a/TTS/speaker_encoder/dataset.py b/TTS/speaker_encoder/dataset.py index 6b2b0dd4..5b0fee22 100644 --- a/TTS/speaker_encoder/dataset.py +++ b/TTS/speaker_encoder/dataset.py @@ -250,4 +250,4 @@ class SpeakerEncoderDataset(Dataset): feats = torch.stack(feats) labels = torch.stack(labels) - return feats.transpose(1, 2), labels + return feats, labels diff --git a/TTS/speaker_encoder/models/lstm.py b/TTS/speaker_encoder/models/lstm.py index de5bb007..3c2eafee 100644 --- a/TTS/speaker_encoder/models/lstm.py +++ b/TTS/speaker_encoder/models/lstm.py @@ -1,7 +1,9 @@ import numpy as np import torch +import torchaudio from torch import nn +from TTS.speaker_encoder.models.resnet import PreEmphasis from TTS.utils.io import load_fsspec @@ -33,9 +35,21 @@ class LSTMWithoutProjection(nn.Module): class LSTMSpeakerEncoder(nn.Module): - def __init__(self, input_dim, proj_dim=256, lstm_dim=768, num_lstm_layers=3, use_lstm_with_projection=True): + def __init__( + self, + input_dim, + proj_dim=256, + lstm_dim=768, + num_lstm_layers=3, + use_lstm_with_projection=True, + use_torch_spec=False, + audio_config=None, + ): super().__init__() self.use_lstm_with_projection = use_lstm_with_projection + self.use_torch_spec = use_torch_spec + self.audio_config = audio_config + layers = [] # choise LSTM layer if use_lstm_with_projection: @@ -46,6 +60,38 @@ class LSTMSpeakerEncoder(nn.Module): else: self.layers = LSTMWithoutProjection(input_dim, lstm_dim, proj_dim, num_lstm_layers) + self.instancenorm = nn.InstanceNorm1d(input_dim) + + if self.use_torch_spec: + self.torch_spec = torch.nn.Sequential( + PreEmphasis(audio_config["preemphasis"]), + # TorchSTFT( + # n_fft=audio_config["fft_size"], + # hop_length=audio_config["hop_length"], + # win_length=audio_config["win_length"], + # sample_rate=audio_config["sample_rate"], + # window="hamming_window", + # mel_fmin=0.0, + # mel_fmax=None, + # use_htk=True, + # do_amp_to_db=False, + # n_mels=audio_config["num_mels"], + # power=2.0, + # use_mel=True, + # mel_norm=None, + # ) + torchaudio.transforms.MelSpectrogram( + sample_rate=audio_config["sample_rate"], + n_fft=audio_config["fft_size"], + win_length=audio_config["win_length"], + hop_length=audio_config["hop_length"], + window_fn=torch.hamming_window, + n_mels=audio_config["num_mels"], + ), + ) + else: + self.torch_spec = None + self._init_layers() def _init_layers(self): @@ -55,22 +101,33 @@ class LSTMSpeakerEncoder(nn.Module): elif "weight" in name: nn.init.xavier_normal_(param) - def forward(self, x): - # TODO: implement state passing for lstms + def forward(self, x, l2_norm=True): + """Forward pass of the model. + + Args: + x (Tensor): Raw waveform signal or spectrogram frames. If input is a waveform, `torch_spec` must be `True` + to compute the spectrogram on-the-fly. + l2_norm (bool): Whether to L2-normalize the outputs. 
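For orientation on the `use_torch_spec` front end added above, here is a stripped-down equivalent built only from torchaudio. The `audio_config` numbers are invented for the example, and the repo's `PreEmphasis` filter is omitted for brevity.

```python
import torch
import torchaudio

# Illustrative values; the real ones come from the speaker-encoder config.
audio_config = {"sample_rate": 16000, "fft_size": 512, "win_length": 400,
                "hop_length": 160, "num_mels": 64}

torch_spec = torch.nn.Sequential(
    # the real model also prepends TTS.speaker_encoder.models.resnet.PreEmphasis here
    torchaudio.transforms.MelSpectrogram(
        sample_rate=audio_config["sample_rate"],
        n_fft=audio_config["fft_size"],
        win_length=audio_config["win_length"],
        hop_length=audio_config["hop_length"],
        window_fn=torch.hamming_window,
        n_mels=audio_config["num_mels"],
    ),
)
instancenorm = torch.nn.InstanceNorm1d(audio_config["num_mels"])

x = torch.randn(4, 16000)            # batch of raw 1 s waveforms
feats = instancenorm(torch_spec(x))  # [4, num_mels, T]; the LSTM variant then transposes to [4, T, num_mels]
print(feats.shape)
```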
+ + Shapes: + - x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})` + """ + with torch.no_grad(): + with torch.cuda.amp.autocast(enabled=False): + if self.use_torch_spec: + x.squeeze_(1) + x = self.torch_spec(x) + x = self.instancenorm(x).transpose(1, 2) d = self.layers(x) if self.use_lstm_with_projection: - d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1) - else: + d = d[:, -1] + if l2_norm: d = torch.nn.functional.normalize(d, p=2, dim=1) return d @torch.no_grad() - def inference(self, x): - d = self.layers.forward(x) - if self.use_lstm_with_projection: - d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1) - else: - d = torch.nn.functional.normalize(d, p=2, dim=1) + def inference(self, x, l2_norm=True): + d = self.layers.forward(x, l2_norm=l2_norm) return d def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True): diff --git a/TTS/speaker_encoder/models/resnet.py b/TTS/speaker_encoder/models/resnet.py index 7a384ef5..f1f13df1 100644 --- a/TTS/speaker_encoder/models/resnet.py +++ b/TTS/speaker_encoder/models/resnet.py @@ -190,8 +190,19 @@ class ResNetSpeakerEncoder(nn.Module): return out def forward(self, x, l2_norm=False): + """Forward pass of the model. + + Args: + x (Tensor): Raw waveform signal or spectrogram frames. If input is a waveform, `torch_spec` must be `True` + to compute the spectrogram on-the-fly. + l2_norm (bool): Whether to L2-normalize the outputs. + + Shapes: + - x: :math:`(N, 1, T_{in})` + """ with torch.no_grad(): with torch.cuda.amp.autocast(enabled=False): + x.squeeze_(1) # if you torch spec compute it otherwise use the mel spec computed by the AP if self.use_torch_spec: x = self.torch_spec(x) @@ -230,7 +241,11 @@ class ResNetSpeakerEncoder(nn.Module): return x @torch.no_grad() - def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True): + def inference(self, x, l2_norm=False): + return self.forward(x, l2_norm) + + @torch.no_grad() + def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True, l2_norm=True): """ Generate embeddings for a batch of utterances x: 1xTxD @@ -254,7 +269,7 @@ class ResNetSpeakerEncoder(nn.Module): frames_batch.append(frames) frames_batch = torch.cat(frames_batch, dim=0) - embeddings = self.forward(frames_batch, l2_norm=True) + embeddings = self.inference(frames_batch, l2_norm=l2_norm) if return_mean: embeddings = torch.mean(embeddings, dim=0, keepdim=True) diff --git a/TTS/speaker_encoder/utils/generic_utils.py b/TTS/speaker_encoder/utils/generic_utils.py index dab79f3c..b8aa4093 100644 --- a/TTS/speaker_encoder/utils/generic_utils.py +++ b/TTS/speaker_encoder/utils/generic_utils.py @@ -177,6 +177,8 @@ def setup_speaker_encoder_model(config: "Coqpit"): config.model_params["proj_dim"], config.model_params["lstm_dim"], config.model_params["num_lstm_layers"], + use_torch_spec=config.model_params.get("use_torch_spec", False), + audio_config=config.audio, ) elif config.model_params["model_name"].lower() == "resnet": model = ResNetSpeakerEncoder( From 7129b04d469bc2324cc8d9ec26d96f34c18920a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 30 Dec 2021 12:02:35 +0000 Subject: [PATCH 212/220] Update VITS model --- TTS/tts/models/vits.py | 62 ++++++++++++++++++++++++++---------------- 1 file changed, 38 insertions(+), 24 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index ddf6800f..2a2cedb5 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -385,24 +385,29 @@ class Vits(BaseTTS): """Initialize multi-speaker 
modules of a model. A model can be trained either with a speaker embedding layer or with external `d_vectors` computed from a speaker encoder model. + You must provide a `speaker_manager` at initialization to set up the multi-speaker modules. + Args: config (Coqpit): Model configuration. data (List, optional): Dataset items to infer number of speakers. Defaults to None. """ self.embedded_speaker_dim = 0 - config = config.model_args + self.num_speakers = self.args.num_speakers - self.num_speakers = config.num_speakers + if self.speaker_manager: + self.num_speakers = self.speaker_manager.num_speakers - if config.use_speaker_embedding: + if self.args.use_speaker_embedding: self._init_speaker_embedding(config) - if config.use_d_vector_file: + if self.args.use_d_vector_file: self._init_d_vector(config) # TODO: make this a function - if config.use_speaker_encoder_as_loss: - if not config.speaker_encoder_model_path or not config.speaker_encoder_config_path: + if self.args.use_speaker_encoder_as_loss: + if self.speaker_manager.speaker_encoder is None and ( + not config.speaker_encoder_model_path or not config.speaker_encoder_config_path + ): raise RuntimeError( " [!] To use the speaker consistency loss (SCL) you need to specify speaker_encoder_model_path and speaker_encoder_config_path !!" ) @@ -412,7 +417,8 @@ class Vits(BaseTTS): if ( hasattr(self.speaker_manager.speaker_encoder, "audio_config") - and self.config.audio["sample_rate"] != self.speaker_manager.speaker_encoder.audio_config["sample_rate"] + and self.config.audio["sample_rate"] + != self.speaker_manager.speaker_encoder.audio_config["sample_rate"] ): # TODO: change this with torchaudio Resample raise RuntimeError( @@ -434,14 +440,14 @@ class Vits(BaseTTS): # pylint: disable=attribute-defined-outside-init if self.num_speakers > 0: print(" > initialization of speaker-embedding layers.") - self.embedded_speaker_dim = config.speaker_embedding_channels + self.embedded_speaker_dim = self.args.speaker_embedding_channels self.emb_g = nn.Embedding(self.num_speakers, self.embedded_speaker_dim) def _init_d_vector(self, config): # pylint: disable=attribute-defined-outside-init if hasattr(self, "emb_g"): raise ValueError("[!] Speaker embedding layer already initialized before d_vector settings.") - self.embedded_speaker_dim = config.d_vector_dim + self.embedded_speaker_dim = self.args.d_vector_dim def init_multilingual(self, config: Coqpit): """Initialize multilingual modules of a model. @@ -449,15 +455,12 @@ class Vits(BaseTTS): Args: config (Coqpit): Model configuration. 
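The refactored initialization can be summarized with a small sketch that mirrors the unit tests added in this series: the conditioning module is now chosen purely from `VitsArgs`, and a `SpeakerManager` passed at construction overrides `num_speakers`.

```python
from TTS.tts.models.vits import Vits, VitsArgs

# Learned speaker-embedding table (creates model.emb_g).
model_emb = Vits(VitsArgs(num_speakers=10, use_speaker_embedding=True))

# External d-vectors instead: no emb_g, embedded_speaker_dim follows d_vector_dim.
model_dvec = Vits(VitsArgs(use_d_vector_file=True, d_vector_dim=256))
print(model_dvec.embedded_speaker_dim)  # 256
```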
""" - if hasattr(config, "model_args"): - config = config.model_args - - if config.language_ids_file is not None: + if self.args.language_ids_file is not None: self.language_manager = LanguageManager(language_ids_file_path=config.language_ids_file) - if config.use_language_embedding and self.language_manager: + if self.args.use_language_embedding and self.language_manager: self.num_languages = self.language_manager.num_languages - self.embedded_language_dim = config.embedded_language_dim + self.embedded_language_dim = self.args.embedded_language_dim self.emb_l = nn.Embedding(self.num_languages, self.embedded_language_dim) torch.nn.init.xavier_uniform_(self.emb_l.weight) else: @@ -486,7 +489,7 @@ class Vits(BaseTTS): def get_aux_input(self, aux_input: Dict): sid, g, lid = self._set_cond_input(aux_input) - return {"speaker_id": sid, "style_wav": None, "d_vector": g, "language_id": lid} + return {"speaker_ids": sid, "style_wav": None, "d_vectors": g, "language_ids": lid} def get_aux_input_from_test_sentences(self, sentence_info): if hasattr(self.config, "model_args"): @@ -542,8 +545,8 @@ class Vits(BaseTTS): x_lengths: torch.tensor, y: torch.tensor, y_lengths: torch.tensor, + waveform: torch.tensor, aux_input={"d_vectors": None, "speaker_ids": None, "language_ids": None}, - waveform=None, ) -> Dict: """Forward pass of the model. @@ -552,6 +555,7 @@ class Vits(BaseTTS): x_lengths (torch.tensor): Batch of input character sequence lengths. y (torch.tensor): Batch of input spectrograms. y_lengths (torch.tensor): Batch of input spectrogram lengths. + waveform (torch.tensor): Batch of ground truth waveforms per sample. aux_input (dict, optional): Auxiliary inputs for multi-speaker and multi-lingual training. Defaults to {"d_vectors": None, "speaker_ids": None, "language_ids": None}. @@ -563,6 +567,7 @@ class Vits(BaseTTS): - x_lengths: :math:`[B]` - y: :math:`[B, C, T_spec]` - y_lengths: :math:`[B]` + - waveform: :math:`[B, T_wav, 1]` - d_vectors: :math:`[B, C, 1]` - speaker_ids: :math:`[B]` - language_ids: :math:`[B]` @@ -628,14 +633,14 @@ class Vits(BaseTTS): o = self.waveform_decoder(z_slice, g=g) wav_seg = segment( - waveform.transpose(1, 2), + waveform, slice_ids * self.config.audio.hop_length, self.args.spec_segment_size * self.config.audio.hop_length, ) if self.args.use_speaker_encoder_as_loss and self.speaker_manager.speaker_encoder is not None: # concate generated and GT waveforms - wavs_batch = torch.cat((wav_seg, o), dim=0).squeeze(1) + wavs_batch = torch.cat((wav_seg, o), dim=0) # resample audio to speaker encoder sample_rate # pylint: disable=W0105 @@ -712,20 +717,29 @@ class Vits(BaseTTS): return outputs def voice_conversion(self, y, y_lengths, speaker_cond_src, speaker_cond_tgt): - """TODO: create an end-point for voice conversion""" + """Forward pass for voice conversion + + TODO: create an end-point for voice conversion + + Args: + y (Tensor): Reference spectrograms. Tensor of shape [B, T, C] + y_lengths (Tensor): Length of each reference spectrogram. Tensor of shape [B] + speaker_cond_src (Tensor): Reference speaker ID. Tensor of shape [B,] + speaker_cond_tgt (Tensor): Target speaker ID. Tensor of shape [B,] + """ assert self.num_speakers > 0, "num_speakers have to be larger than 0." 
# speaker embedding - if self.args.use_speaker_embedding and not self.use_d_vector: + if self.args.use_speaker_embedding and not self.args.use_d_vector_file: g_src = self.emb_g(speaker_cond_src).unsqueeze(-1) g_tgt = self.emb_g(speaker_cond_tgt).unsqueeze(-1) - elif self.args.use_speaker_embedding and self.use_d_vector: + elif self.args.use_speaker_embedding and self.args.use_d_vector_file: g_src = F.normalize(speaker_cond_src).unsqueeze(-1) g_tgt = F.normalize(speaker_cond_tgt).unsqueeze(-1) else: raise RuntimeError(" [!] Voice conversion is only supported on multi-speaker models.") - z, _, _, y_mask = self.posterior_encoder(y, y_lengths, g=g_src) + z, _, _, y_mask = self.posterior_encoder(y.transpose(1, 2), y_lengths, g=g_src) z_p = self.flow(z, y_mask, g=g_src) z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True) o_hat = self.waveform_decoder(z_hat * y_mask, g=g_tgt) @@ -786,8 +800,8 @@ class Vits(BaseTTS): text_lengths, linear_input.transpose(1, 2), mel_lengths, + waveform.transpose(1, 2), aux_input={"d_vectors": d_vectors, "speaker_ids": speaker_ids, "language_ids": language_ids}, - waveform=waveform, ) # cache tensors for the discriminator From 497332bd46138040303a64d2546a91f9a7e784a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 30 Dec 2021 12:03:12 +0000 Subject: [PATCH 213/220] Add custom asserts to tests --- tests/__init__.py | 11 +++++ tests/tts_tests/test_vits.py | 78 ++++++++++++++++++++++++------------ 2 files changed, 63 insertions(+), 26 deletions(-) diff --git a/tests/__init__.py b/tests/__init__.py index 45aee23a..0a0c3379 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -38,3 +38,14 @@ def run_cli(command): def get_test_data_config(): return BaseDatasetConfig(name="ljspeech", path="tests/data/ljspeech/", meta_file_train="metadata.csv") + + +def assertHasAttr(test_obj, obj, intendedAttr): + # from https://stackoverflow.com/questions/48078636/pythons-unittest-lacks-an-asserthasattr-method-what-should-i-use-instead + testBool = hasattr(obj, intendedAttr) + test_obj.assertTrue(testBool, msg=f"obj lacking an attribute. obj: {obj}, intendedAttr: {intendedAttr}") + + +def assertHasNotAttr(test_obj, obj, intendedAttr): + testBool = hasattr(obj, intendedAttr) + test_obj.assertFalse(testBool, msg=f"obj should not have an attribute. 
obj: {obj}, intendedAttr: {intendedAttr}") diff --git a/tests/tts_tests/test_vits.py b/tests/tts_tests/test_vits.py index 335472a5..de075a5c 100644 --- a/tests/tts_tests/test_vits.py +++ b/tests/tts_tests/test_vits.py @@ -1,13 +1,14 @@ import os -import torch import unittest -from TTS.config import load_config -from TTS.tts.models.vits import Vits, VitsArgs -from TTS.tts.configs.vits_config import VitsConfig -from TTS.tts.utils.speakers import SpeakerManager -from tests import assertHasAttr, assertHasNotAttr, get_tests_input_path -from TTS.speaker_encoder.utils.generic_utils import setup_speaker_encoder_model +import torch + +from tests import assertHasAttr, assertHasNotAttr, get_tests_input_path +from TTS.config import load_config +from TTS.speaker_encoder.utils.generic_utils import setup_speaker_encoder_model +from TTS.tts.configs.vits_config import VitsConfig +from TTS.tts.models.vits import Vits, VitsArgs +from TTS.tts.utils.speakers import SpeakerManager LANG_FILE = os.path.join(get_tests_input_path(), "language_ids.json") SPEAKER_ENCODER_CONFIG = os.path.join(get_tests_input_path(), "test_speaker_encoder_config.json") @@ -18,21 +19,21 @@ use_cuda = torch.cuda.is_available() device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") +#pylint: disable=no-self-use class TestVits(unittest.TestCase): - def test_init_multispeaker(self): num_speakers = 10 args = VitsArgs(num_speakers=num_speakers, use_speaker_embedding=True) model = Vits(args) - assertHasAttr(self, model, 'emb_g') + assertHasAttr(self, model, "emb_g") args = VitsArgs(num_speakers=0, use_speaker_embedding=True) model = Vits(args) - assertHasNotAttr(self, model, 'emb_g') + assertHasNotAttr(self, model, "emb_g") args = VitsArgs(num_speakers=10, use_speaker_embedding=False) model = Vits(args) - assertHasNotAttr(self, model, 'emb_g') + assertHasNotAttr(self, model, "emb_g") args = VitsArgs(d_vector_dim=101, use_d_vector_file=True) model = Vits(args) @@ -67,12 +68,12 @@ class TestVits(unittest.TestCase): aux_input = {"speaker_ids": None, "style_wav": None, "d_vectors": None, "language_ids": None} args = VitsArgs() model = Vits(args) - aux_out= model.get_aux_input(aux_input) + aux_out = model.get_aux_input(aux_input) speaker_id = torch.randint(10, (1,)) language_id = torch.randint(10, (1,)) d_vector = torch.rand(1, 128) - aux_input = {"speaker_ids": speaker_id, "style_wav": None, "d_vectors": d_vector, "language_ids": language_id} + aux_input = {"speaker_ids": speaker_id, "style_wav": None, "d_vectors": d_vector, "language_ids": language_id} aux_out = model.get_aux_input(aux_input) self.assertEqual(aux_out["speaker_ids"].shape, speaker_id.shape) self.assertEqual(aux_out["language_ids"].shape, language_id.shape) @@ -88,8 +89,8 @@ class TestVits(unittest.TestCase): ref_inp = torch.randn(1, spec_len, 513) ref_inp_len = torch.randint(1, spec_effective_len, (1,)) - ref_spk_id = torch.randint(0, num_speakers, (1,)) - tgt_spk_id = torch.randint(0, num_speakers, (1,)) + ref_spk_id = torch.randint(1, num_speakers, (1,)) + tgt_spk_id = torch.randint(1, num_speakers, (1,)) o_hat, y_mask, (z, z_p, z_hat) = model.voice_conversion(ref_inp, ref_inp_len, ref_spk_id, tgt_spk_id) self.assertEqual(o_hat.shape, (1, 1, spec_len * 256)) @@ -110,7 +111,9 @@ class TestVits(unittest.TestCase): return input_dummy, input_lengths, spec, spec_lengths, waveform def _check_forward_outputs(self, config, output_dict, encoder_config=None): - self.assertEqual(output_dict['model_outputs'].shape[2], config.model_args.spec_segment_size * 
config.audio["hop_length"]) + self.assertEqual( + output_dict["model_outputs"].shape[2], config.model_args.spec_segment_size * config.audio["hop_length"] + ) self.assertEqual(output_dict["alignments"].shape, (8, 128, 30)) self.assertEqual(output_dict["alignments"].max(), 1) self.assertEqual(output_dict["alignments"].min(), 0) @@ -120,13 +123,15 @@ class TestVits(unittest.TestCase): self.assertEqual(output_dict["logs_p"].shape, (8, config.model_args.hidden_channels, 30)) self.assertEqual(output_dict["m_q"].shape, (8, config.model_args.hidden_channels, 30)) self.assertEqual(output_dict["logs_q"].shape, (8, config.model_args.hidden_channels, 30)) - self.assertEqual(output_dict['waveform_seg'].shape[2], config.model_args.spec_segment_size * config.audio["hop_length"]) + self.assertEqual( + output_dict["waveform_seg"].shape[2], config.model_args.spec_segment_size * config.audio["hop_length"] + ) if encoder_config: - self.assertEqual(output_dict['gt_spk_emb'].shape, (8, encoder_config.model_params["proj_dim"])) - self.assertEqual(output_dict['syn_spk_emb'].shape, (8, encoder_config.model_params["proj_dim"])) + self.assertEqual(output_dict["gt_spk_emb"].shape, (8, encoder_config.model_params["proj_dim"])) + self.assertEqual(output_dict["syn_spk_emb"].shape, (8, encoder_config.model_params["proj_dim"])) else: - self.assertEqual(output_dict['gt_spk_emb'], None) - self.assertEqual(output_dict['syn_spk_emb'], None) + self.assertEqual(output_dict["gt_spk_emb"], None) + self.assertEqual(output_dict["syn_spk_emb"], None) def test_forward(self): num_speakers = 0 @@ -147,7 +152,9 @@ class TestVits(unittest.TestCase): speaker_ids = torch.randint(0, num_speakers, (8,)).long().to(device) model = Vits(config).to(device) - output_dict = model.forward(input_dummy, input_lengths, spec, spec_lengths, waveform, aux_input={"speaker_ids": speaker_ids}) + output_dict = model.forward( + input_dummy, input_lengths, spec, spec_lengths, waveform, aux_input={"speaker_ids": speaker_ids} + ) self._check_forward_outputs(config, output_dict) def test_multilingual_forward(self): @@ -162,7 +169,14 @@ class TestVits(unittest.TestCase): lang_ids = torch.randint(0, num_langs, (8,)).long().to(device) model = Vits(config).to(device) - output_dict = model.forward(input_dummy, input_lengths, spec, spec_lengths, waveform, aux_input={"speaker_ids": speaker_ids, "language_ids": lang_ids}) + output_dict = model.forward( + input_dummy, + input_lengths, + spec, + spec_lengths, + waveform, + aux_input={"speaker_ids": speaker_ids, "language_ids": lang_ids}, + ) self._check_forward_outputs(config, output_dict) def test_secl_forward(self): @@ -175,7 +189,12 @@ class TestVits(unittest.TestCase): speaker_manager = SpeakerManager() speaker_manager.speaker_encoder = speaker_encoder - args = VitsArgs(language_ids_file=LANG_FILE, use_language_embedding=True, spec_segment_size=10, use_speaker_encoder_as_loss=True) + args = VitsArgs( + language_ids_file=LANG_FILE, + use_language_embedding=True, + spec_segment_size=10, + use_speaker_encoder_as_loss=True, + ) config = VitsConfig(num_speakers=num_speakers, use_speaker_embedding=True, model_args=args) config.audio.sample_rate = 16000 @@ -184,7 +203,14 @@ class TestVits(unittest.TestCase): lang_ids = torch.randint(0, num_langs, (8,)).long().to(device) model = Vits(config, speaker_manager=speaker_manager).to(device) - output_dict = model.forward(input_dummy, input_lengths, spec, spec_lengths, waveform, aux_input={"speaker_ids": speaker_ids, "language_ids": lang_ids}) + output_dict = model.forward( + 
input_dummy, + input_lengths, + spec, + spec_lengths, + waveform, + aux_input={"speaker_ids": speaker_ids, "language_ids": lang_ids}, + ) self._check_forward_outputs(config, output_dict, speaker_encoder_config) def test_inference(self): @@ -211,4 +237,4 @@ class TestVits(unittest.TestCase): speaker_ids = torch.randint(0, num_speakers, (1,)).long().to(device) lang_ids = torch.randint(0, num_langs, (1,)).long().to(device) model = Vits(config).to(device) - _ = model.inference(input_dummy, {"speaker_ids": speaker_ids, "language_ids": lang_ids}) \ No newline at end of file + _ = model.inference(input_dummy, {"speaker_ids": speaker_ids, "language_ids": lang_ids}) From 348b5c96a2f696525e5c8ef32163c2a3d7208423 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 30 Dec 2021 12:36:30 +0000 Subject: [PATCH 214/220] Fix speaker encoder test --- TTS/speaker_encoder/models/lstm.py | 2 +- TTS/tts/models/vits.py | 8 ++++---- TTS/tts/utils/speakers.py | 2 +- tests/aux_tests/test_speaker_encoder.py | 8 ++++---- tests/aux_tests/test_speaker_manager.py | 2 +- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/TTS/speaker_encoder/models/lstm.py b/TTS/speaker_encoder/models/lstm.py index 3c2eafee..7ac08514 100644 --- a/TTS/speaker_encoder/models/lstm.py +++ b/TTS/speaker_encoder/models/lstm.py @@ -127,7 +127,7 @@ class LSTMSpeakerEncoder(nn.Module): @torch.no_grad() def inference(self, x, l2_norm=True): - d = self.layers.forward(x, l2_norm=l2_norm) + d = self.forward(x, l2_norm=l2_norm) return d def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True): diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 2a2cedb5..e4e64240 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -398,10 +398,10 @@ class Vits(BaseTTS): self.num_speakers = self.speaker_manager.num_speakers if self.args.use_speaker_embedding: - self._init_speaker_embedding(config) + self._init_speaker_embedding() if self.args.use_d_vector_file: - self._init_d_vector(config) + self._init_d_vector() # TODO: make this a function if self.args.use_speaker_encoder_as_loss: @@ -436,14 +436,14 @@ class Vits(BaseTTS): self.audio_transform = None """ - def _init_speaker_embedding(self, config): + def _init_speaker_embedding(self): # pylint: disable=attribute-defined-outside-init if self.num_speakers > 0: print(" > initialization of speaker-embedding layers.") self.embedded_speaker_dim = self.args.speaker_embedding_channels self.emb_g = nn.Embedding(self.num_speakers, self.embedded_speaker_dim) - def _init_d_vector(self, config): + def _init_d_vector(self): # pylint: disable=attribute-defined-outside-init if hasattr(self, "emb_g"): raise ValueError("[!] 
Speaker embedding layer already initialized before d_vector settings.") diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index a6e15cdd..07076d90 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -270,7 +270,7 @@ class SpeakerManager: waveform = self.speaker_encoder_ap.load_wav(wav_file, sr=self.speaker_encoder_ap.sample_rate) if not self.speaker_encoder_config.model_params.get("use_torch_spec", False): m_input = self.speaker_encoder_ap.melspectrogram(waveform) - m_input = torch.from_numpy(m_input.T) + m_input = torch.from_numpy(m_input) else: m_input = torch.from_numpy(waveform) diff --git a/tests/aux_tests/test_speaker_encoder.py b/tests/aux_tests/test_speaker_encoder.py index 3c897aa9..97b3b92f 100644 --- a/tests/aux_tests/test_speaker_encoder.py +++ b/tests/aux_tests/test_speaker_encoder.py @@ -13,7 +13,7 @@ file_path = get_tests_input_path() class LSTMSpeakerEncoderTests(unittest.TestCase): # pylint: disable=R0201 def test_in_out(self): - dummy_input = T.rand(4, 20, 80) # B x T x D + dummy_input = T.rand(4, 80, 20) # B x D x T dummy_hidden = [T.rand(2, 4, 128), T.rand(2, 4, 128)] model = LSTMSpeakerEncoder(input_dim=80, proj_dim=256, lstm_dim=768, num_lstm_layers=3) # computing d vectors @@ -34,7 +34,7 @@ class LSTMSpeakerEncoderTests(unittest.TestCase): assert output.type() == "torch.FloatTensor" assert abs(assert_diff) < 1e-4, f" [!] output_norm has wrong values - {assert_diff}" # compute d for a given batch - dummy_input = T.rand(1, 240, 80) # B x T x D + dummy_input = T.rand(1, 80, 240) # B x T x D output = model.compute_embedding(dummy_input, num_frames=160, num_eval=5) assert output.shape[0] == 1 assert output.shape[1] == 256 @@ -44,7 +44,7 @@ class LSTMSpeakerEncoderTests(unittest.TestCase): class ResNetSpeakerEncoderTests(unittest.TestCase): # pylint: disable=R0201 def test_in_out(self): - dummy_input = T.rand(4, 20, 80) # B x T x D + dummy_input = T.rand(4, 80, 20) # B x D x T dummy_hidden = [T.rand(2, 4, 128), T.rand(2, 4, 128)] model = ResNetSpeakerEncoder(input_dim=80, proj_dim=256) # computing d vectors @@ -61,7 +61,7 @@ class ResNetSpeakerEncoderTests(unittest.TestCase): assert output.type() == "torch.FloatTensor" assert abs(assert_diff) < 1e-4, f" [!] 
output_norm has wrong values - {assert_diff}" # compute d for a given batch - dummy_input = T.rand(1, 240, 80) # B x T x D + dummy_input = T.rand(1, 80, 240) # B x D x T output = model.compute_embedding(dummy_input, num_frames=160, num_eval=10) assert output.shape[0] == 1 assert output.shape[1] == 256 diff --git a/tests/aux_tests/test_speaker_manager.py b/tests/aux_tests/test_speaker_manager.py index b56c5258..fff49b13 100644 --- a/tests/aux_tests/test_speaker_manager.py +++ b/tests/aux_tests/test_speaker_manager.py @@ -38,7 +38,7 @@ class SpeakerManagerTest(unittest.TestCase): # load a sample audio and compute embedding waveform = ap.load_wav(sample_wav_path) mel = ap.melspectrogram(waveform) - d_vector = manager.compute_d_vector(mel.T) + d_vector = manager.compute_d_vector(mel) assert d_vector.shape[1] == 256 # compute d_vector directly from an input file From 36cef5966b50bf5596eb78f45c48025b8114411c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 30 Dec 2021 14:41:43 +0000 Subject: [PATCH 215/220] Fix resnet speaker encoder --- TTS/speaker_encoder/models/resnet.py | 4 +--- TTS/tts/models/vits.py | 3 +-- tests/tts_tests/test_vits.py | 2 +- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/TTS/speaker_encoder/models/resnet.py b/TTS/speaker_encoder/models/resnet.py index f1f13df1..643449c8 100644 --- a/TTS/speaker_encoder/models/resnet.py +++ b/TTS/speaker_encoder/models/resnet.py @@ -198,7 +198,7 @@ class ResNetSpeakerEncoder(nn.Module): l2_norm (bool): Whether to L2-normalize the outputs. Shapes: - - x: :math:`(N, 1, T_{in})` + - x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})` """ with torch.no_grad(): with torch.cuda.amp.autocast(enabled=False): @@ -206,8 +206,6 @@ class ResNetSpeakerEncoder(nn.Module): # if you torch spec compute it otherwise use the mel spec computed by the AP if self.use_torch_spec: x = self.torch_spec(x) - else: - x = x.transpose(1, 2) if self.log_input: x = (x + 1e-6).log() diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index e4e64240..8b09fdf9 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -417,8 +417,7 @@ class Vits(BaseTTS): if ( hasattr(self.speaker_manager.speaker_encoder, "audio_config") - and self.config.audio["sample_rate"] - != self.speaker_manager.speaker_encoder.audio_config["sample_rate"] + and self.config.audio["sample_rate"] != self.speaker_manager.speaker_encoder.audio_config["sample_rate"] ): # TODO: change this with torchaudio Resample raise RuntimeError( diff --git a/tests/tts_tests/test_vits.py b/tests/tts_tests/test_vits.py index de075a5c..4274d947 100644 --- a/tests/tts_tests/test_vits.py +++ b/tests/tts_tests/test_vits.py @@ -19,7 +19,7 @@ use_cuda = torch.cuda.is_available() device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") -#pylint: disable=no-self-use +# pylint: disable=no-self-use class TestVits(unittest.TestCase): def test_init_multispeaker(self): num_speakers = 10 From 8100135a7e9552f04a6e9ba6f517693ef9a5cceb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 31 Dec 2021 10:54:53 +0000 Subject: [PATCH 216/220] Add the YourTTS entry to the models --- TTS/.models.json | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/TTS/.models.json b/TTS/.models.json index 44c5fc6c..7567c15a 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -1,5 +1,17 @@ { "tts_models": { + "multilingual":{ + "multi-dataset":{ + "your_tts":{ + "description": "Your TTS model accompanying the paper 
https://arxiv.org/abs/2112.02418", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.4.0/tts_models--multilingual--multi-dataset--your_tts.zip", + "default_vocoder": null, + "commit": "e9a1953e", + "license": "CC BY-NC-ND 4.0", + "contact": "egolge@coqui.ai" + } + } + }, "en": { "ek1": { "tacotron2": { From 61874bc0a0c703df56b55e51b9d67784575f93f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 31 Dec 2021 13:45:05 +0000 Subject: [PATCH 217/220] Fix your_tts inference from the listed models --- TTS/config/__init__.py | 10 ++++++++++ TTS/utils/manage.py | 8 ++++++++ TTS/utils/synthesizer.py | 19 ++++++++++++++----- tests/zoo_tests/test_models.py | 11 ++++++++++- 4 files changed, 42 insertions(+), 6 deletions(-) diff --git a/TTS/config/__init__.py b/TTS/config/__init__.py index 8ed3578f..5c905295 100644 --- a/TTS/config/__init__.py +++ b/TTS/config/__init__.py @@ -120,3 +120,13 @@ def get_from_config_or_model_args(config, arg_name): if arg_name in config.model_args: return config.model_args[arg_name] return config[arg_name] + + +def get_from_config_or_model_args_with_default(config, arg_name, def_val): + """Get the given argument from `config.model_args` if exist or in `config`.""" + if hasattr(config, "model_args"): + if arg_name in config.model_args: + return config.model_args[arg_name] + if hasattr(config, arg_name): + return config[arg_name] + return def_val diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index d1dedbe0..7ad596f0 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -151,6 +151,8 @@ class ModelManager(object): output_stats_path = os.path.join(output_path, "scale_stats.npy") output_d_vector_file_path = os.path.join(output_path, "speakers.json") output_speaker_ids_file_path = os.path.join(output_path, "speaker_ids.json") + speaker_encoder_config_path = os.path.join(output_path, "config_se.json") + speaker_encoder_model_path = os.path.join(output_path, "model_se.pth.tar") # update the scale_path.npy file path in the model config.json self._update_path("audio.stats_path", output_stats_path, config_path) @@ -163,6 +165,12 @@ class ModelManager(object): self._update_path("speakers_file", output_speaker_ids_file_path, config_path) self._update_path("model_args.speakers_file", output_speaker_ids_file_path, config_path) + # update the speaker_encoder file path in the model config.json to the current path + self._update_path("speaker_encoder_model_path", speaker_encoder_model_path, config_path) + self._update_path("model_args.speaker_encoder_model_path", speaker_encoder_model_path, config_path) + self._update_path("speaker_encoder_config_path", speaker_encoder_config_path, config_path) + self._update_path("model_args.speaker_encoder_config_path", speaker_encoder_config_path, config_path) + @staticmethod def _update_path(field_name, new_path, config_path): """Update the path in the model config.json for the current environment after download""" diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 7a2d3097..66579a1b 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -5,7 +5,7 @@ import numpy as np import pysbd import torch -from TTS.config import check_config_and_model_args, load_config +from TTS.config import check_config_and_model_args, load_config, get_from_config_or_model_args_with_default from TTS.tts.models import setup_model as setup_tts_model from TTS.tts.utils.languages import LanguageManager from TTS.tts.utils.speakers import SpeakerManager @@ -117,6 +117,7 @@ class Synthesizer(object): 
speaker_manager = self._init_speaker_manager() language_manager = self._init_language_manager() + self._set_speaker_encoder_paths_from_tts_config() speaker_manager = self._init_speaker_encoder(speaker_manager) if language_manager is not None: @@ -131,6 +132,12 @@ class Synthesizer(object): if use_cuda: self.tts_model.cuda() + def _set_speaker_encoder_paths_from_tts_config(self): + """Set the encoder paths from the tts model config for models with speaker encoders.""" + if hasattr(self.tts_config, "model_args") and hasattr(self.tts_config.model_args, "speaker_encoder_config_path"): + self.encoder_checkpoint = self.tts_config.model_args.speaker_encoder_model_path + self.encoder_config = self.tts_config.model_args.speaker_encoder_config_path + def _is_use_speaker_embedding(self): """Check if the speaker embedding is used in the model""" # we handle here the case that some models use model_args some don't @@ -155,17 +162,19 @@ class Synthesizer(object): """Initialize the SpeakerManager""" # setup if multi-speaker settings are in the global model config speaker_manager = None + speakers_file = get_from_config_or_model_args_with_default(self.tts_config, "speakers_file", None) if self._is_use_speaker_embedding(): if self.tts_speakers_file: speaker_manager = SpeakerManager(speaker_id_file_path=self.tts_speakers_file) - if self.tts_config.get("speakers_file", None): - speaker_manager = SpeakerManager(speaker_id_file_path=self.tts_config.speakers_file) + if speakers_file: + speaker_manager = SpeakerManager(speaker_id_file_path=speakers_file) if self._is_use_d_vector_file(): + d_vector_file = get_from_config_or_model_args_with_default(self.tts_config, "d_vector_file", None) if self.tts_speakers_file: speaker_manager = SpeakerManager(d_vectors_file_path=self.tts_speakers_file) - if self.tts_config.get("d_vector_file", None): - speaker_manager = SpeakerManager(d_vectors_file_path=self.tts_config.d_vector_file) + if d_vector_file: + speaker_manager = SpeakerManager(d_vectors_file_path=d_vector_file) return speaker_manager def _init_speaker_encoder(self, speaker_manager): diff --git a/tests/zoo_tests/test_models.py b/tests/zoo_tests/test_models.py index 886d1bb6..43273572 100644 --- a/tests/zoo_tests/test_models.py +++ b/tests/zoo_tests/test_models.py @@ -2,6 +2,7 @@ import glob import os import shutil +from TTS.tts.utils.languages import LanguageManager from tests import get_tests_output_path, run_cli from TTS.tts.utils.speakers import SpeakerManager @@ -22,16 +23,24 @@ def test_run_all_models(): local_download_dir = os.path.dirname(model_path) # download and run the model speaker_files = glob.glob(local_download_dir + "/speaker*") + language_files = glob.glob(local_download_dir + "/language*") + language_id = "" if len(speaker_files) > 0: # multi-speaker model if "speaker_ids" in speaker_files[0]: speaker_manager = SpeakerManager(speaker_id_file_path=speaker_files[0]) elif "speakers" in speaker_files[0]: speaker_manager = SpeakerManager(d_vectors_file_path=speaker_files[0]) + + # multi-lingual model - Assuming multi-lingual models are also multi-speaker + if len(language_files) > 0 and "language_ids" in language_files[0]: + language_manager = LanguageManager(language_ids_file_path=language_files[0]) + language_id = language_manager.language_names[0] + speaker_id = list(speaker_manager.speaker_ids.keys())[0] run_cli( f"tts --model_name {model_name} " - f'--text "This is an example." --out_path "{output_path}" --speaker_idx "{speaker_id}"' + f'--text "This is an example." 
--out_path "{output_path}" --speaker_idx "{speaker_id}" --language_idx "{language_id}" ' ) else: # single-speaker model From 254c110ec13b4ad2ecb2914b84591c2604e1c1d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Sat, 1 Jan 2022 13:57:01 +0000 Subject: [PATCH 218/220] Print testing model --- tests/zoo_tests/test_models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/zoo_tests/test_models.py b/tests/zoo_tests/test_models.py index 43273572..e77e750b 100644 --- a/tests/zoo_tests/test_models.py +++ b/tests/zoo_tests/test_models.py @@ -18,6 +18,7 @@ def test_run_all_models(): manager = ModelManager(output_prefix=get_tests_output_path()) model_names = manager.list_models() for model_name in model_names: + print(f"\n > Run - {model_name}") model_path, _, _ = manager.download_model(model_name) if "tts_models" in model_name: local_download_dir = os.path.dirname(model_path) From 8fd1ee1926a956a146188179baee143ef11a003d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Sat, 1 Jan 2022 14:46:22 +0000 Subject: [PATCH 219/220] Print urls when BadZipError --- TTS/utils/manage.py | 10 +++++++--- TTS/utils/synthesizer.py | 6 ++++-- tests/zoo_tests/test_models.py | 2 +- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index 7ad596f0..b002da53 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -165,7 +165,7 @@ class ModelManager(object): self._update_path("speakers_file", output_speaker_ids_file_path, config_path) self._update_path("model_args.speakers_file", output_speaker_ids_file_path, config_path) - # update the speaker_encoder file path in the model config.json to the current path + # update the speaker_encoder file path in the model config.json to the current path self._update_path("speaker_encoder_model_path", speaker_encoder_model_path, config_path) self._update_path("model_args.speaker_encoder_model_path", speaker_encoder_model_path, config_path) self._update_path("speaker_encoder_config_path", speaker_encoder_config_path, config_path) @@ -197,8 +197,12 @@ class ModelManager(object): # download the file r = requests.get(file_url) # extract the file - with zipfile.ZipFile(io.BytesIO(r.content)) as z: - z.extractall(output_folder) + try: + with zipfile.ZipFile(io.BytesIO(r.content)) as z: + z.extractall(output_folder) + except zipfile.BadZipFile: + print(f" > Error: Bad zip file - {file_url}") + raise zipfile.BadZipFile # move the files to the outer path for file_path in z.namelist()[1:]: src_path = os.path.join(output_folder, file_path) diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 66579a1b..d1d978d8 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -5,7 +5,7 @@ import numpy as np import pysbd import torch -from TTS.config import check_config_and_model_args, load_config, get_from_config_or_model_args_with_default +from TTS.config import check_config_and_model_args, get_from_config_or_model_args_with_default, load_config from TTS.tts.models import setup_model as setup_tts_model from TTS.tts.utils.languages import LanguageManager from TTS.tts.utils.speakers import SpeakerManager @@ -134,7 +134,9 @@ class Synthesizer(object): def _set_speaker_encoder_paths_from_tts_config(self): """Set the encoder paths from the tts model config for models with speaker encoders.""" - if hasattr(self.tts_config, "model_args") and hasattr(self.tts_config.model_args, "speaker_encoder_config_path"): + if hasattr(self.tts_config, "model_args") and hasattr( + 
self.tts_config.model_args, "speaker_encoder_config_path" + ): self.encoder_checkpoint = self.tts_config.model_args.speaker_encoder_model_path self.encoder_config = self.tts_config.model_args.speaker_encoder_config_path diff --git a/tests/zoo_tests/test_models.py b/tests/zoo_tests/test_models.py index e77e750b..63d9e7ca 100644 --- a/tests/zoo_tests/test_models.py +++ b/tests/zoo_tests/test_models.py @@ -2,9 +2,9 @@ import glob import os import shutil -from TTS.tts.utils.languages import LanguageManager from tests import get_tests_output_path, run_cli +from TTS.tts.utils.languages import LanguageManager from TTS.tts.utils.speakers import SpeakerManager from TTS.utils.generic_utils import get_user_data_dir from TTS.utils.manage import ModelManager From 33711afa0137c7d2acdac95b5e2dd8047d8a9bc1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Sat, 1 Jan 2022 15:37:08 +0000 Subject: [PATCH 220/220] Update yourTTS url --- TTS/.models.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/.models.json b/TTS/.models.json index 7567c15a..8459893c 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -4,7 +4,7 @@ "multi-dataset":{ "your_tts":{ "description": "Your TTS model accompanying the paper https://arxiv.org/abs/2112.02418", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.4.0/tts_models--multilingual--multi-dataset--your_tts.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.5.0_models/tts_models--multilingual--multi-dataset--your_tts.zip", "default_vocoder": null, "commit": "e9a1953e", "license": "CC BY-NC-ND 4.0",