From b96e74dd4906df02a4fba23519eb947491f96716 Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 5 Aug 2020 17:35:58 +0200 Subject: [PATCH 01/56] add multi-speaker arguments to the model def --- mozilla_voice_tts/bin/synthesize.py | 2 +- mozilla_voice_tts/tts/configs/config.json | 13 ++- mozilla_voice_tts/tts/models/tacotron.py | 8 +- mozilla_voice_tts/tts/models/tacotron2.py | 102 +++++++++++------- .../tts/models/tacotron_abstract.py | 36 +++++-- mozilla_voice_tts/tts/utils/generic_utils.py | 17 ++- mozilla_voice_tts/tts/utils/synthesis.py | 25 +++-- mozilla_voice_tts/tts/utils/text/cleaners.py | 7 ++ 8 files changed, 145 insertions(+), 65 deletions(-) diff --git a/mozilla_voice_tts/bin/synthesize.py b/mozilla_voice_tts/bin/synthesize.py index 6f139433..b52db37e 100644 --- a/mozilla_voice_tts/bin/synthesize.py +++ b/mozilla_voice_tts/bin/synthesize.py @@ -20,7 +20,7 @@ from mozilla_voice_tts.vocoder.utils.generic_utils import setup_generator def tts(model, vocoder_model, text, CONFIG, use_cuda, ap, use_gl, speaker_id): t_1 = time.time() - waveform, _, _, mel_postnet_spec, _, _ = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, None, False, CONFIG.enable_eos_bos_chars, use_gl) + waveform, _, _, mel_postnet_spec, _, _ = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, CONFIG.gst['gst_style_input'], False, CONFIG.enable_eos_bos_chars, use_gl) if CONFIG.model == "Tacotron" and not use_gl: mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T if not use_gl: diff --git a/mozilla_voice_tts/tts/configs/config.json b/mozilla_voice_tts/tts/configs/config.json index cd4595b9..9068e2c4 100644 --- a/mozilla_voice_tts/tts/configs/config.json +++ b/mozilla_voice_tts/tts/configs/config.json @@ -132,8 +132,16 @@ // MULTI-SPEAKER and GST "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. - "style_wav_for_test": null, // path to style wav file to be used in TacotronGST inference. - "use_gst": false, // TACOTRON ONLY: use global style tokens + "use_gst": true, // use global style tokens + "gst": { // gst parameter if gst is enabled + "gst_style_input": null, // Condition the style input either on a + // -> wave file [path to wave] or + // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15} + // with the dictionary being len(dict) == len(gst_style_tokens). + "gst_embedding_dim": 512, + "gst_num_heads": 4, + "gst_style_tokens": 10 + }, // DATASETS "datasets": // List of datasets. They all merged and they get different speaker_ids. 
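The new "gst" block above makes the style conditioning configurable: "gst_style_input" may point to a reference wav file or carry a token-index-to-weight dictionary, and synthesize.py now forwards it straight into synthesis() as shown at the top of this patch. A minimal sketch of the two forms, assuming model, ap, CONFIG, speaker_id, use_cuda and use_gl are already set up the way synthesize.py sets them up (the path and weights below are placeholders, not values taken from this patch):

    # 1) condition on a reference wav file (placeholder path)
    style_input = "/path/to/reference.wav"
    # 2) or condition directly on the learned style tokens: keys are token
    #    indices as strings, values are the weights applied to each token
    style_input = {"0": 0.15, "1": 0.15, "5": -0.15}
    waveform, _, _, mel_postnet_spec, _, _ = synthesis(
        model, "Hello world.", CONFIG, use_cuda, ap,
        speaker_id, style_input, False,
        CONFIG.enable_eos_bos_chars, use_gl)
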
@@ -145,6 +153,5 @@ "meta_file_val": null } ] - } diff --git a/mozilla_voice_tts/tts/models/tacotron.py b/mozilla_voice_tts/tts/models/tacotron.py index 295dbeda..6e0ba09b 100644 --- a/mozilla_voice_tts/tts/models/tacotron.py +++ b/mozilla_voice_tts/tts/models/tacotron.py @@ -29,6 +29,9 @@ class Tacotron(TacotronAbstract): double_decoder_consistency=False, ddc_r=None, gst=False, + gst_embedding_dim=256, + gst_num_heads=4, + gst_style_tokens=10, memory_size=5): super(Tacotron, self).__init__(num_chars, num_speakers, r, postnet_output_dim, @@ -64,10 +67,9 @@ class Tacotron(TacotronAbstract): self.speaker_embeddings_projected = None # global style token layers if self.gst: - gst_embedding_dim = 256 self.gst_layer = GST(num_mel=80, - num_heads=4, - num_style_tokens=10, + num_heads=gst_num_heads, + num_style_tokens=gst_style_tokens, embedding_dim=gst_embedding_dim) # backward pass decoder if self.bidirectional_decoder: diff --git a/mozilla_voice_tts/tts/models/tacotron2.py b/mozilla_voice_tts/tts/models/tacotron2.py index 327e1bd9..52da2f39 100644 --- a/mozilla_voice_tts/tts/models/tacotron2.py +++ b/mozilla_voice_tts/tts/models/tacotron2.py @@ -28,7 +28,10 @@ class Tacotron2(TacotronAbstract): bidirectional_decoder=False, double_decoder_consistency=False, ddc_r=None, - gst=False): + gst=False, + gst_embedding_dim=512, + gst_num_heads=4, + gst_style_tokens=10): super(Tacotron2, self).__init__(num_chars, num_speakers, r, postnet_output_dim, decoder_output_dim, attn_type, attn_win, @@ -37,13 +40,17 @@ class Tacotron2(TacotronAbstract): location_attn, attn_K, separate_stopnet, bidirectional_decoder, double_decoder_consistency, ddc_r, gst) - decoder_in_features = 512 if num_speakers > 1 else 512 + + # init layer dims + speaker_embedding_dim = 512 if num_speakers > 1 else 0 + gst_embedding_dim = gst_embedding_dim if self.gst else 0 + decoder_in_features = 512+speaker_embedding_dim+gst_embedding_dim encoder_in_features = 512 if num_speakers > 1 else 512 proj_speaker_dim = 80 if num_speakers > 1 else 0 # base layers self.embedding = nn.Embedding(num_chars, 512, padding_idx=0) if num_speakers > 1: - self.speaker_embedding = nn.Embedding(num_speakers, 512) + self.speaker_embedding = nn.Embedding(num_speakers, speaker_embedding_dim) self.speaker_embedding.weight.data.normal_(0, 0.3) self.encoder = Encoder(encoder_in_features) self.decoder = Decoder(decoder_in_features, self.decoder_output_dim, r, attn_type, attn_win, @@ -53,10 +60,9 @@ class Tacotron2(TacotronAbstract): self.postnet = Postnet(self.postnet_output_dim) # global style token layers if self.gst: - gst_embedding_dim = encoder_in_features self.gst_layer = GST(num_mel=80, - num_heads=4, - num_style_tokens=10, + num_heads=gst_num_heads, + num_style_tokens=gst_style_tokens, embedding_dim=gst_embedding_dim) # backward pass decoder if self.bidirectional_decoder: @@ -76,7 +82,6 @@ class Tacotron2(TacotronAbstract): return mel_outputs, mel_outputs_postnet, alignments def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, speaker_ids=None): - self._init_states() # compute mask for padding # B x T_in_max (boolean) input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths) @@ -84,20 +89,24 @@ class Tacotron2(TacotronAbstract): embedded_inputs = self.embedding(text).transpose(1, 2) # B x T_in_max x D_en encoder_outputs = self.encoder(embedded_inputs, text_lengths) - # adding speaker embeddding to encoder output - # TODO: multi-speaker - # B x speaker_embed_dim - if speaker_ids is not None: - 
self.compute_speaker_embedding(speaker_ids) + if self.num_speakers > 1: - # B x T_in x embed_dim + speaker_embed_dim - encoder_outputs = self._add_speaker_embedding(encoder_outputs, - self.speaker_embeddings) + embedded_speakers = self.speaker_embedding(speaker_ids)[:, None] + embedded_speakers = embedded_speakers.repeat(1, encoder_outputs.size(1), 1) + if hasattr(self, 'gst'): + # B x gst_dim + encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, mel_specs) + encoder_outputs = torch.cat([encoder_outputs, embedded_gst, embedded_speakers], dim=-1) + else: + encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1) + else: + if hasattr(self, 'gst'): + # B x gst_dim + encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, mel_specs) + encoder_outputs = torch.cat([encoder_outputs, embedded_gst], dim=-1) + encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs) - # global style token - if self.gst: - # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, mel_specs) + # B x mel_dim x T_out -- B x T_out//r x T_in -- B x T_out//r decoder_outputs, alignments, stop_tokens = self.decoder( encoder_outputs, mel_specs, input_mask) @@ -122,14 +131,25 @@ class Tacotron2(TacotronAbstract): return decoder_outputs, postnet_outputs, alignments, stop_tokens @torch.no_grad() - def inference(self, text, speaker_ids=None): + def inference(self, text, speaker_ids=None, style_mel=None): embedded_inputs = self.embedding(text).transpose(1, 2) encoder_outputs = self.encoder.inference(embedded_inputs) - if speaker_ids is not None: - self.compute_speaker_embedding(speaker_ids) + if self.num_speakers > 1: - encoder_outputs = self._add_speaker_embedding(encoder_outputs, - self.speaker_embeddings) + embedded_speakers = self.speaker_embedding(speaker_ids)[:, None] + embedded_speakers = embedded_speakers.repeat(1, encoder_outputs.size(1), 1) + if hasattr(self, 'gst'): + # B x gst_dim + encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, style_mel) + encoder_outputs = torch.cat([encoder_outputs, embedded_gst, embedded_speakers], dim=-1) + else: + encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1) + else: + if hasattr(self, 'gst'): + # B x gst_dim + encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, style_mel) + encoder_outputs = torch.cat([encoder_outputs, embedded_gst], dim=-1) + decoder_outputs, alignments, stop_tokens = self.decoder.inference( encoder_outputs) postnet_outputs = self.postnet(decoder_outputs) @@ -138,14 +158,28 @@ class Tacotron2(TacotronAbstract): decoder_outputs, postnet_outputs, alignments) return decoder_outputs, postnet_outputs, alignments, stop_tokens - def inference_truncated(self, text, speaker_ids=None): + def inference_truncated(self, text, speaker_ids=None, style_mel=None): """ Preserve model states for continuous inference """ embedded_inputs = self.embedding(text).transpose(1, 2) encoder_outputs = self.encoder.inference_truncated(embedded_inputs) - encoder_outputs = self._add_speaker_embedding(encoder_outputs, - speaker_ids) + + if self.num_speakers > 1: + embedded_speakers = self.speaker_embedding(speaker_ids)[:, None] + embedded_speakers = embedded_speakers.repeat(1, encoder_outputs.size(1), 1) + if hasattr(self, 'gst'): + # B x gst_dim + encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, style_mel) + encoder_outputs = torch.cat([encoder_outputs, embedded_gst, embedded_speakers], dim=-1) + else: + encoder_outputs = torch.cat([encoder_outputs, 
embedded_speakers], dim=-1) + else: + if hasattr(self, 'gst'): + # B x gst_dim + encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, style_mel) + encoder_outputs = torch.cat([encoder_outputs, embedded_gst], dim=-1) + mel_outputs, alignments, stop_tokens = self.decoder.inference_truncated( encoder_outputs) mel_outputs_postnet = self.postnet(mel_outputs) @@ -153,17 +187,3 @@ class Tacotron2(TacotronAbstract): mel_outputs, mel_outputs_postnet, alignments = self.shape_outputs( mel_outputs, mel_outputs_postnet, alignments) return mel_outputs, mel_outputs_postnet, alignments, stop_tokens - - - def _speaker_embedding_pass(self, encoder_outputs, speaker_ids): - # TODO: multi-speaker - # if hasattr(self, "speaker_embedding") and speaker_ids is None: - # raise RuntimeError(" [!] Model has speaker embedding layer but speaker_id is not provided") - # if hasattr(self, "speaker_embedding") and speaker_ids is not None: - - # speaker_embeddings = speaker_embeddings.expand(encoder_outputs.size(0), - # encoder_outputs.size(1), - # -1) - # encoder_outputs = encoder_outputs + speaker_embeddings - # return encoder_outputs - pass diff --git a/mozilla_voice_tts/tts/models/tacotron_abstract.py b/mozilla_voice_tts/tts/models/tacotron_abstract.py index c9ae9b83..bc794d49 100644 --- a/mozilla_voice_tts/tts/models/tacotron_abstract.py +++ b/mozilla_voice_tts/tts/models/tacotron_abstract.py @@ -28,7 +28,10 @@ class TacotronAbstract(ABC, nn.Module): bidirectional_decoder=False, double_decoder_consistency=False, ddc_r=None, - gst=False): + gst=False, + gst_embedding_dim=512, + gst_num_heads=4, + gst_style_tokens=10): """ Abstract Tacotron class """ super().__init__() self.num_chars = num_chars @@ -36,6 +39,9 @@ class TacotronAbstract(ABC, nn.Module): self.decoder_output_dim = decoder_output_dim self.postnet_output_dim = postnet_output_dim self.gst = gst + self.gst_embedding_dim = gst_embedding_dim + self.gst_num_heads = gst_num_heads + self.gst_style_tokens = gst_style_tokens self.num_speakers = num_speakers self.bidirectional_decoder = bidirectional_decoder self.double_decoder_consistency = double_decoder_consistency @@ -158,12 +164,28 @@ class TacotronAbstract(ABC, nn.Module): self.speaker_embeddings_projected = self.speaker_project_mel( self.speaker_embeddings).squeeze(1) - def compute_gst(self, inputs, mel_specs): - """ Compute global style token """ - # pylint: disable=not-callable - gst_outputs = self.gst_layer(mel_specs) - inputs = self._add_speaker_embedding(inputs, gst_outputs) - return inputs + def compute_gst(self, inputs, style_input): + device = inputs.device + if isinstance(style_input, dict): + query = torch.zeros(1, 1, self.gst_embedding_dim//2).to(device) + _GST = torch.tanh(self.gst_layer.style_token_layer.style_tokens) + gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device) + for k_token, v_amplifier in style_input.items(): + key = _GST[int(k_token)].unsqueeze(0).expand(1, -1, -1) + gst_outputs_att = self.gst_layer.style_token_layer.attention(query, key) + gst_outputs = gst_outputs + gst_outputs_att * v_amplifier + elif style_input is None: + query = torch.zeros(1, 1, self.gst_embedding_dim//2).to(device) + _GST = torch.tanh(self.gst_layer.style_token_layer.style_tokens) + gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device) + for k_token in range(self.gst_style_tokens): + key = _GST[int(k_token)].unsqueeze(0).expand(1, -1, -1) + gst_outputs_att = self.gst_layer.style_token_layer.attention(query, key) + gst_outputs = gst_outputs + gst_outputs_att * 0 + else: + 
gst_outputs = self.gst_layer(style_input) + embedded_gst = gst_outputs.repeat(1, inputs.size(1), 1) + return inputs, embedded_gst @staticmethod def _add_speaker_embedding(outputs, speaker_embeddings): diff --git a/mozilla_voice_tts/tts/utils/generic_utils.py b/mozilla_voice_tts/tts/utils/generic_utils.py index e98c267d..212379a3 100644 --- a/mozilla_voice_tts/tts/utils/generic_utils.py +++ b/mozilla_voice_tts/tts/utils/generic_utils.py @@ -55,6 +55,9 @@ def setup_model(num_chars, num_speakers, c): postnet_output_dim=int(c.audio['fft_size'] / 2 + 1), decoder_output_dim=c.audio['num_mels'], gst=c.use_gst, + gst_embedding_dim=c.gst['gst_embedding_dim'], + gst_num_heads=c.gst['gst_num_heads'], + gst_style_tokens=c.gst['gst_style_tokens'], memory_size=c.memory_size, attn_type=c.attention_type, attn_win=c.windowing, @@ -77,6 +80,9 @@ def setup_model(num_chars, num_speakers, c): postnet_output_dim=c.audio['num_mels'], decoder_output_dim=c.audio['num_mels'], gst=c.use_gst, + gst_embedding_dim=c.gst['gst_embedding_dim'], + gst_num_heads=c.gst['gst_num_heads'], + gst_style_tokens=c.gst['gst_style_tokens'], attn_type=c.attention_type, attn_win=c.windowing, attn_norm=c.attention_norm, @@ -93,6 +99,7 @@ def setup_model(num_chars, num_speakers, c): ddc_r=c.ddc_r) return model + class KeepAverage(): def __init__(self): self.avg_values = {} @@ -239,10 +246,16 @@ def check_config(c): # paths check_argument('output_path', c, restricted=True, val_type=str) - # multi-speaker gst + # multi-speaker check_argument('use_speaker_embedding', c, restricted=True, val_type=bool) - check_argument('style_wav_for_test', c, restricted=True, val_type=str) + + # GST check_argument('use_gst', c, restricted=True, val_type=bool) + check_argument('gst_style_input', c, restricted=True, val_type=str) + check_argument('gst', c, restricted=True, val_type=dict) + check_argument('gst_embedding_dim', c['gst'], restricted=True, val_type=int, min_val=1) + check_argument('gst_num_heads', c['gst'], restricted=True, val_type=int, min_val=1) + check_argument('gst_style_tokens', c['gst'], restricted=True, val_type=int, min_val=1) # datasets - checking only the first entry check_argument('datasets', c, restricted=True, val_type=list) diff --git a/mozilla_voice_tts/tts/utils/synthesis.py b/mozilla_voice_tts/tts/utils/synthesis.py index fef2348d..6fed8f89 100644 --- a/mozilla_voice_tts/tts/utils/synthesis.py +++ b/mozilla_voice_tts/tts/utils/synthesis.py @@ -37,9 +37,11 @@ def numpy_to_tf(np_array, dtype): return tensor -def compute_style_mel(style_wav, ap): - style_mel = ap.melspectrogram( - ap.load_wav(style_wav)).expand_dims(0) +def compute_style_mel(style_wav, ap, cuda=False): + style_mel = torch.FloatTensor(ap.melspectrogram( + ap.load_wav(style_wav))).unsqueeze(0) + if cuda: + return style_mel.cuda() return style_mel @@ -129,10 +131,12 @@ def inv_spectrogram(postnet_output, ap, CONFIG): return wav -def id_to_torch(speaker_id): +def id_to_torch(speaker_id, cuda=False): if speaker_id is not None: speaker_id = np.asarray(speaker_id) speaker_id = torch.from_numpy(speaker_id).unsqueeze(0) + if cuda: + return speaker_id.cuda() return speaker_id @@ -185,14 +189,19 @@ def synthesis(model, """ # GST processing style_mel = None - if CONFIG.model == "TacotronGST" and style_wav is not None: - style_mel = compute_style_mel(style_wav, ap) + if CONFIG.use_gst and style_wav is not None: + if isinstance(style_wav, dict): + style_mel = style_wav + else: + style_mel = compute_style_mel(style_wav, ap, cuda=use_cuda) # preprocess the given text inputs = 
text_to_seqvec(text, CONFIG) # pass tensors to backend if backend == 'torch': - speaker_id = id_to_torch(speaker_id) - style_mel = numpy_to_torch(style_mel, torch.float, cuda=use_cuda) + if speaker_id is not None: + speaker_id = id_to_torch(speaker_id, cuda=use_cuda) + if not isinstance(style_mel, dict): + style_mel = numpy_to_torch(style_mel, torch.float, cuda=use_cuda) inputs = numpy_to_torch(inputs, torch.long, cuda=use_cuda) inputs = inputs.unsqueeze(0) elif backend == 'tf': diff --git a/mozilla_voice_tts/tts/utils/text/cleaners.py b/mozilla_voice_tts/tts/utils/text/cleaners.py index f0a66f57..dd329f9c 100644 --- a/mozilla_voice_tts/tts/utils/text/cleaners.py +++ b/mozilla_voice_tts/tts/utils/text/cleaners.py @@ -91,6 +91,13 @@ def transliteration_cleaners(text): return text +def basic_german_cleaners(text): + '''Pipeline for Turkish text''' + text = lowercase(text) + text = collapse_whitespace(text) + return text + + # TODO: elaborate it def basic_turkish_cleaners(text): '''Pipeline for Turkish text''' From 8d0d4919fdce201b138deb774ef5aec5cd60d04e Mon Sep 17 00:00:00 2001 From: SanjaESC Date: Fri, 10 Jul 2020 12:46:43 +0200 Subject: [PATCH 02/56] No need to query every token when none were passed --- mozilla_voice_tts/tts/configs/config.json | 2 +- mozilla_voice_tts/tts/models/tacotron_abstract.py | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/mozilla_voice_tts/tts/configs/config.json b/mozilla_voice_tts/tts/configs/config.json index 9068e2c4..8f56816e 100644 --- a/mozilla_voice_tts/tts/configs/config.json +++ b/mozilla_voice_tts/tts/configs/config.json @@ -137,7 +137,7 @@ "gst_style_input": null, // Condition the style input either on a // -> wave file [path to wave] or // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15} - // with the dictionary being len(dict) == len(gst_style_tokens). + // with the dictionary being len(dict) <= len(gst_style_tokens). 
"gst_embedding_dim": 512, "gst_num_heads": 4, "gst_style_tokens": 10 diff --git a/mozilla_voice_tts/tts/models/tacotron_abstract.py b/mozilla_voice_tts/tts/models/tacotron_abstract.py index bc794d49..13c3e948 100644 --- a/mozilla_voice_tts/tts/models/tacotron_abstract.py +++ b/mozilla_voice_tts/tts/models/tacotron_abstract.py @@ -175,13 +175,7 @@ class TacotronAbstract(ABC, nn.Module): gst_outputs_att = self.gst_layer.style_token_layer.attention(query, key) gst_outputs = gst_outputs + gst_outputs_att * v_amplifier elif style_input is None: - query = torch.zeros(1, 1, self.gst_embedding_dim//2).to(device) - _GST = torch.tanh(self.gst_layer.style_token_layer.style_tokens) gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device) - for k_token in range(self.gst_style_tokens): - key = _GST[int(k_token)].unsqueeze(0).expand(1, -1, -1) - gst_outputs_att = self.gst_layer.style_token_layer.attention(query, key) - gst_outputs = gst_outputs + gst_outputs_att * 0 else: gst_outputs = self.gst_layer(style_input) embedded_gst = gst_outputs.repeat(1, inputs.size(1), 1) From bdf69446653fe84e6fd69fb91c2bf99adfd7efff Mon Sep 17 00:00:00 2001 From: SanjaESC Date: Sun, 12 Jul 2020 10:40:33 +0200 Subject: [PATCH 03/56] fix fft_size key error --- mozilla_voice_tts/tts/models/tacotron_abstract.py | 1 + tests/inputs/test_config.json | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/mozilla_voice_tts/tts/models/tacotron_abstract.py b/mozilla_voice_tts/tts/models/tacotron_abstract.py index 13c3e948..d1148be5 100644 --- a/mozilla_voice_tts/tts/models/tacotron_abstract.py +++ b/mozilla_voice_tts/tts/models/tacotron_abstract.py @@ -177,6 +177,7 @@ class TacotronAbstract(ABC, nn.Module): elif style_input is None: gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device) else: + # pylint: disable=not-callable gst_outputs = self.gst_layer(style_input) embedded_gst = gst_outputs.repeat(1, inputs.size(1), 1) return inputs, embedded_gst diff --git a/tests/inputs/test_config.json b/tests/inputs/test_config.json index 6da13bfc..b34a53a8 100644 --- a/tests/inputs/test_config.json +++ b/tests/inputs/test_config.json @@ -2,7 +2,7 @@ "audio":{ "audio_processor": "audio", // to use dictate different audio processors, if available. "num_mels": 80, // size of the mel spec frame. - "num_freq": 513, // number of stft frequency levels. Size of the linear spectogram frame. + "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. "sample_rate": 22050, // wav sample-rate. If different than the original data, it is resampled. "frame_length_ms": null, // stft window length in ms. "frame_shift_ms": null, // stft window hop-lengh in ms. 
@@ -51,5 +51,15 @@ "output_path": "result", "min_seq_len": 0, "max_seq_len": 300, - "log_dir": "tests/outputs/" + "log_dir": "tests/outputs/", + + "use_speaker_embedding": false, + "use_gst": false, + "gst": { + "gst_style_input": null, + "gst_embedding_dim": 512, + "gst_num_heads": 4, + "gst_style_tokens": 10 + }, + } From eb51d5409af0b70f48cfb817241159d691fd81d6 Mon Sep 17 00:00:00 2001 From: SanjaESC Date: Sun, 12 Jul 2020 12:33:13 +0200 Subject: [PATCH 04/56] pylint --- mozilla_voice_tts/tts/models/tacotron_abstract.py | 3 +-- tests/inputs/test_config.json | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/mozilla_voice_tts/tts/models/tacotron_abstract.py b/mozilla_voice_tts/tts/models/tacotron_abstract.py index d1148be5..9b2ef148 100644 --- a/mozilla_voice_tts/tts/models/tacotron_abstract.py +++ b/mozilla_voice_tts/tts/models/tacotron_abstract.py @@ -177,8 +177,7 @@ class TacotronAbstract(ABC, nn.Module): elif style_input is None: gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device) else: - # pylint: disable=not-callable - gst_outputs = self.gst_layer(style_input) + gst_outputs = self.gst_layer(style_input) # pylint: disable=not-callable embedded_gst = gst_outputs.repeat(1, inputs.size(1), 1) return inputs, embedded_gst diff --git a/tests/inputs/test_config.json b/tests/inputs/test_config.json index b34a53a8..450cb23a 100644 --- a/tests/inputs/test_config.json +++ b/tests/inputs/test_config.json @@ -60,6 +60,5 @@ "gst_embedding_dim": 512, "gst_num_heads": 4, "gst_style_tokens": 10 - }, - + } } From e206ff8a28f28beed5b44be363025f5cdd4ecd87 Mon Sep 17 00:00:00 2001 From: SanjaESC Date: Mon, 13 Jul 2020 08:50:39 +0200 Subject: [PATCH 05/56] override compute_gst in tacotron2 model --- mozilla_voice_tts/tts/models/tacotron2.py | 25 ++++++++++++++++++- .../tts/models/tacotron_abstract.py | 22 +++++----------- 2 files changed, 30 insertions(+), 17 deletions(-) diff --git a/mozilla_voice_tts/tts/models/tacotron2.py b/mozilla_voice_tts/tts/models/tacotron2.py index 52da2f39..59e44fb2 100644 --- a/mozilla_voice_tts/tts/models/tacotron2.py +++ b/mozilla_voice_tts/tts/models/tacotron2.py @@ -47,17 +47,22 @@ class Tacotron2(TacotronAbstract): decoder_in_features = 512+speaker_embedding_dim+gst_embedding_dim encoder_in_features = 512 if num_speakers > 1 else 512 proj_speaker_dim = 80 if num_speakers > 1 else 0 - # base layers + + # embedding layer self.embedding = nn.Embedding(num_chars, 512, padding_idx=0) + + # speaker embedding layer if num_speakers > 1: self.speaker_embedding = nn.Embedding(num_speakers, speaker_embedding_dim) self.speaker_embedding.weight.data.normal_(0, 0.3) + self.encoder = Encoder(encoder_in_features) self.decoder = Decoder(decoder_in_features, self.decoder_output_dim, r, attn_type, attn_win, attn_norm, prenet_type, prenet_dropout, forward_attn, trans_agent, forward_attn_mask, location_attn, attn_K, separate_stopnet, proj_speaker_dim) self.postnet = Postnet(self.postnet_output_dim) + # global style token layers if self.gst: self.gst_layer = GST(num_mel=80, @@ -81,6 +86,24 @@ class Tacotron2(TacotronAbstract): mel_outputs_postnet = mel_outputs_postnet.transpose(1, 2) return mel_outputs, mel_outputs_postnet, alignments + def compute_gst(self, inputs, style_input): + """ Compute global style token """ + device = inputs.device + if isinstance(style_input, dict): + query = torch.zeros(1, 1, self.gst_embedding_dim//2).to(device) + _GST = torch.tanh(self.gst_layer.style_token_layer.style_tokens) + gst_outputs = torch.zeros(1, 1, 
self.gst_embedding_dim).to(device) + for k_token, v_amplifier in style_input.items(): + key = _GST[int(k_token)].unsqueeze(0).expand(1, -1, -1) + gst_outputs_att = self.gst_layer.style_token_layer.attention(query, key) + gst_outputs = gst_outputs + gst_outputs_att * v_amplifier + elif style_input is None: + gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device) + else: + gst_outputs = self.gst_layer(style_input) # pylint: disable=not-callable + embedded_gst = gst_outputs.repeat(1, inputs.size(1), 1) + return inputs, embedded_gst + def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, speaker_ids=None): # compute mask for padding # B x T_in_max (boolean) diff --git a/mozilla_voice_tts/tts/models/tacotron_abstract.py b/mozilla_voice_tts/tts/models/tacotron_abstract.py index 9b2ef148..a4b8c227 100644 --- a/mozilla_voice_tts/tts/models/tacotron_abstract.py +++ b/mozilla_voice_tts/tts/models/tacotron_abstract.py @@ -164,22 +164,12 @@ class TacotronAbstract(ABC, nn.Module): self.speaker_embeddings_projected = self.speaker_project_mel( self.speaker_embeddings).squeeze(1) - def compute_gst(self, inputs, style_input): - device = inputs.device - if isinstance(style_input, dict): - query = torch.zeros(1, 1, self.gst_embedding_dim//2).to(device) - _GST = torch.tanh(self.gst_layer.style_token_layer.style_tokens) - gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device) - for k_token, v_amplifier in style_input.items(): - key = _GST[int(k_token)].unsqueeze(0).expand(1, -1, -1) - gst_outputs_att = self.gst_layer.style_token_layer.attention(query, key) - gst_outputs = gst_outputs + gst_outputs_att * v_amplifier - elif style_input is None: - gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device) - else: - gst_outputs = self.gst_layer(style_input) # pylint: disable=not-callable - embedded_gst = gst_outputs.repeat(1, inputs.size(1), 1) - return inputs, embedded_gst + def compute_gst(self, inputs, mel_specs): + """ Compute global style token """ + # pylint: disable=not-callable + gst_outputs = self.gst_layer(mel_specs) + inputs = self._add_speaker_embedding(inputs, gst_outputs) + return inputs @staticmethod def _add_speaker_embedding(outputs, speaker_embeddings): From b0c4a77aea67b0ea93a4c2419f2e2c4dba518577 Mon Sep 17 00:00:00 2001 From: SanjaESC Date: Mon, 13 Jul 2020 08:51:37 +0200 Subject: [PATCH 06/56] small gst config change --- tests/test_tacotron2_model.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/test_tacotron2_model.py b/tests/test_tacotron2_model.py index 2faccd75..a0c5e59a 100644 --- a/tests/test_tacotron2_model.py +++ b/tests/test_tacotron2_model.py @@ -20,7 +20,12 @@ c = load_config(os.path.join(get_tests_input_path(), 'test_config.json')) class TacotronTrainTest(unittest.TestCase): +<<<<<<< HEAD def test_train_step(self): # pylint: disable=no-self-use +======= + @staticmethod + def test_train_step(): +>>>>>>> small gst config change input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 128, (8, )).long().to(device) input_lengths = torch.sort(input_lengths, descending=True)[0] From 571f5761c950008f343c59282455ba1c5fc292e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 13 Jul 2020 10:33:55 +0200 Subject: [PATCH 07/56] update comment --- mozilla_voice_tts/tts/utils/text/cleaners.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mozilla_voice_tts/tts/utils/text/cleaners.py b/mozilla_voice_tts/tts/utils/text/cleaners.py index 
dd329f9c..6d1ace08 100644 --- a/mozilla_voice_tts/tts/utils/text/cleaners.py +++ b/mozilla_voice_tts/tts/utils/text/cleaners.py @@ -92,7 +92,7 @@ def transliteration_cleaners(text): def basic_german_cleaners(text): - '''Pipeline for Turkish text''' + '''Pipeline for German text''' text = lowercase(text) text = collapse_whitespace(text) return text From 93a9cc4683316eafbe6d9a3e951bd13938527fec Mon Sep 17 00:00:00 2001 From: Edresson Date: Mon, 27 Jul 2020 16:59:59 -0300 Subject: [PATCH 08/56] add support fot VCTK and BRSpeech dataset --- mozilla_voice_tts/tts/datasets/preprocess.py | 35 ++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/mozilla_voice_tts/tts/datasets/preprocess.py b/mozilla_voice_tts/tts/datasets/preprocess.py index c3cf34e5..2ad414fb 100644 --- a/mozilla_voice_tts/tts/datasets/preprocess.py +++ b/mozilla_voice_tts/tts/datasets/preprocess.py @@ -205,3 +205,38 @@ def custom_turkish(root_path, meta_file): items.append([text, wav_file, speaker_name]) print(f" [!] {len(skipped_files)} files skipped. They don't exist...") return items + +# ToDo: add the dataset link when the dataset is released publicly +def brspeech(root_path, meta_file): + '''BRSpeech 3.0 beta''' + txt_file = os.path.join(root_path, meta_file) + items = [] + with open(txt_file, 'r') as ttf: + for line in ttf: + if line.startswith("wav_filename"): + continue + cols = line.split('|') + #print(cols) + wav_file = os.path.join(root_path, cols[0]) + text = cols[2] + speaker_name = cols[3] + items.append([text, wav_file, speaker_name]) + return items + +def vctk(root_path, meta_files=None, wavs_path='wav48'): + """homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz""" + test_speakers = meta_files + items = [] + meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) + for meta_file in meta_files: + txt, speaker_id, txt_file = os.path.relpath(meta_file,root_path).split(os.sep) + file_id = txt_file.split('.')[0] + if isinstance(test_speakers, list): # if is list ignore this speakers ids + if speaker_id in test_speakers: + continue + with open(meta_file) as file_text: + text = file_text.readlines()[0] + wav_file = os.path.join(root_path, wavs_path, speaker_id,file_id+'.wav') + items.append([text, wav_file, speaker_id]) + + return items \ No newline at end of file From b7504527828eb3e2377b932fe6dd5bcf1aa50a58 Mon Sep 17 00:00:00 2001 From: Edresson Date: Mon, 27 Jul 2020 17:20:51 -0300 Subject: [PATCH 09/56] add Portuguese Cleaner --- mozilla_voice_tts/tts/utils/text/cleaners.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/mozilla_voice_tts/tts/utils/text/cleaners.py b/mozilla_voice_tts/tts/utils/text/cleaners.py index 6d1ace08..227118e6 100644 --- a/mozilla_voice_tts/tts/utils/text/cleaners.py +++ b/mozilla_voice_tts/tts/utils/text/cleaners.py @@ -67,15 +67,16 @@ def remove_aux_symbols(text): text = re.sub(r'[\<\>\(\)\[\]\"]+', '', text) return text - -def replace_symbols(text): +def replace_symbols(text, lang='en'): text = text.replace(';', ',') text = text.replace('-', ' ') - text = text.replace(':', ',') - text = text.replace('&', 'and') + text = text.replace(':', ' ') + if lang == 'en': + text = text.replace('&', 'and') + elif lang == 'pt': + text = text.replace('&', ' e ') return text - def basic_cleaners(text): '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' text = lowercase(text) @@ -118,6 +119,14 @@ def english_cleaners(text): text = collapse_whitespace(text) return text +def 
portuguese_cleaners(text): + '''Basic pipeline for Portuguese text. There is no need to expand abbreviation and + numbers, phonemizer already does that''' + text = lowercase(text) + text = replace_symbols(text, lang='pt') + text = remove_aux_symbols(text) + text = collapse_whitespace(text) + return text def phoneme_cleaners(text): '''Pipeline for phonemes mode, including number and abbreviation expansion.''' From e265810e8c430be60e3775a41fff4de70e4ecb1e Mon Sep 17 00:00:00 2001 From: Edresson Date: Tue, 28 Jul 2020 17:11:32 -0300 Subject: [PATCH 10/56] bugfix in DDC now DDC work on Tacotron1 --- mozilla_voice_tts/tts/configs/config.json | 16 +++--- mozilla_voice_tts/tts/datasets/TTSDataset.py | 2 +- mozilla_voice_tts/tts/layers/tacotron.py | 7 ++- mozilla_voice_tts/tts/layers/tacotron2.py | 4 +- mozilla_voice_tts/tts/models/tacotron.py | 24 ++++----- mozilla_voice_tts/tts/models/tacotron2.py | 49 ++++--------------- .../tts/models/tacotron_abstract.py | 19 +++++-- mozilla_voice_tts/tts/utils/generic_utils.py | 8 ++- mozilla_voice_tts/tts/utils/text/cleaners.py | 1 - mozilla_voice_tts/utils/generic_utils.py | 9 +++- 10 files changed, 65 insertions(+), 74 deletions(-) diff --git a/mozilla_voice_tts/tts/configs/config.json b/mozilla_voice_tts/tts/configs/config.json index 8f56816e..154e1961 100644 --- a/mozilla_voice_tts/tts/configs/config.json +++ b/mozilla_voice_tts/tts/configs/config.json @@ -1,5 +1,5 @@ { - "model": "Tacotron2", + "model": "Tacotron", "run_name": "ljspeech-ddc-bn", "run_description": "tacotron2 with ddc and batch-normalization", @@ -114,7 +114,7 @@ "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. // DATA LOADING - "text_cleaner": "phoneme_cleaners", + "text_cleaner": "portuguese_cleaners", "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. "num_val_loader_workers": 4, // number of evaluation data loader processes. @@ -123,15 +123,15 @@ "max_seq_len": 153, // DATASET-RELATED: maximum text length // PATHS - "output_path": "/home/erogol/Models/LJSpeech/", + "output_path": "../../Mozilla-TTS/vctk-test/", // PHONEMES - "phoneme_cache_path": "/media/erogol/data_ssd2/mozilla_us_phonemes_3", // phoneme computation is slow, therefore, it caches results in the given folder. + "phoneme_cache_path": "../../Mozilla-TTS/vctk-test/", // phoneme computation is slow, therefore, it caches results in the given folder. "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation. "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages // MULTI-SPEAKER and GST - "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. + "use_speaker_embedding": true, // use speaker embedding to enable multi-speaker learning. "use_gst": true, // use global style tokens "gst": { // gst parameter if gst is enabled "gst_style_input": null, // Condition the style input either on a @@ -147,9 +147,9 @@ "datasets": // List of datasets. They all merged and they get different speaker_ids. 
[ { - "name": "ljspeech", - "path": "/home/erogol/Data/LJSpeech-1.1/", - "meta_file_train": "metadata.csv", + "name": "vctk", + "path": "../../../datasets/VCTK-Corpus-removed-silence/", + "meta_file_train": ["p225", "p234", "p238", "p245", "p248", "p261", "p294", "p302", "p326", "p335", "p347"], // for vtck if list, ignore speakers id in list for train, its useful for test cloning with new speakers "meta_file_val": null } ] diff --git a/mozilla_voice_tts/tts/datasets/TTSDataset.py b/mozilla_voice_tts/tts/datasets/TTSDataset.py index ac524e55..cc6b72b5 100644 --- a/mozilla_voice_tts/tts/datasets/TTSDataset.py +++ b/mozilla_voice_tts/tts/datasets/TTSDataset.py @@ -70,7 +70,7 @@ class MyDataset(Dataset): self.sort_items() def load_wav(self, filename): - audio = self.ap.load_wav(filename) + audio = self.ap.load_wav(filename, sr=self.sample_rate) return audio @staticmethod diff --git a/mozilla_voice_tts/tts/layers/tacotron.py b/mozilla_voice_tts/tts/layers/tacotron.py index 2fc9e86a..bbeee95f 100644 --- a/mozilla_voice_tts/tts/layers/tacotron.py +++ b/mozilla_voice_tts/tts/layers/tacotron.py @@ -303,7 +303,7 @@ class Decoder(nn.Module): self.separate_stopnet = separate_stopnet self.query_dim = 256 # memory -> |Prenet| -> processed_memory - prenet_dim = frame_channels * self.memory_size + speaker_embedding_dim if self.use_memory_queue else frame_channels + speaker_embedding_dim + prenet_dim = memory_dim * self.memory_size if self.use_memory_queue else memory_dim self.prenet = Prenet( prenet_dim, prenet_type, @@ -429,7 +429,7 @@ class Decoder(nn.Module): # assert new_memory.shape[-1] == self.r * self.frame_channels self.memory_input = new_memory[:, self.frame_channels * (self.r - 1):] - def forward(self, inputs, memory, mask, speaker_embeddings=None): + def forward(self, inputs, memory, mask): """ Args: inputs: Encoder outputs. @@ -454,8 +454,7 @@ class Decoder(nn.Module): if t > 0: new_memory = memory[t - 1] self._update_memory_input(new_memory) - if speaker_embeddings is not None: - self.memory_input = torch.cat([self.memory_input, speaker_embeddings], dim=-1) + output, stop_token, attention = self.decode(inputs, mask) outputs += [output] attentions += [attention] diff --git a/mozilla_voice_tts/tts/layers/tacotron2.py b/mozilla_voice_tts/tts/layers/tacotron2.py index 395a10ea..7c0dd443 100644 --- a/mozilla_voice_tts/tts/layers/tacotron2.py +++ b/mozilla_voice_tts/tts/layers/tacotron2.py @@ -300,7 +300,7 @@ class Decoder(nn.Module): decoder_output = decoder_output[:, :self.r * self.frame_channels] return decoder_output, self.attention.attention_weights, stop_token - def forward(self, inputs, memories, mask, speaker_embeddings=None): + def forward(self, inputs, memories, mask): r"""Train Decoder with teacher forcing. Args: inputs: Encoder outputs. 
@@ -318,8 +318,6 @@ class Decoder(nn.Module): memories = self._reshape_memory(memories) memories = torch.cat((memory, memories), dim=0) memories = self._update_memory(memories) - if speaker_embeddings is not None: - memories = torch.cat([memories, speaker_embeddings], dim=-1) memories = self.prenet(memories) self._init_states(inputs, mask=mask) diff --git a/mozilla_voice_tts/tts/models/tacotron.py b/mozilla_voice_tts/tts/models/tacotron.py index 6e0ba09b..8eda83b3 100644 --- a/mozilla_voice_tts/tts/models/tacotron.py +++ b/mozilla_voice_tts/tts/models/tacotron.py @@ -6,7 +6,6 @@ from mozilla_voice_tts.tts.layers.gst_layers import GST from mozilla_voice_tts.tts.layers.tacotron import Decoder, Encoder, PostCBHG from mozilla_voice_tts.tts.models.tacotron_abstract import TacotronAbstract - class Tacotron(TacotronAbstract): def __init__(self, num_chars, @@ -41,10 +40,19 @@ class Tacotron(TacotronAbstract): location_attn, attn_K, separate_stopnet, bidirectional_decoder, double_decoder_consistency, ddc_r, gst) - decoder_in_features = 512 if num_speakers > 1 else 256 - encoder_in_features = 512 if num_speakers > 1 else 256 + + + # init layer dims + decoder_in_features = 256 + encoder_in_features = 256 speaker_embedding_dim = 256 proj_speaker_dim = 80 if num_speakers > 1 else 0 + + if num_speakers > 1: + decoder_in_features = decoder_in_features + speaker_embedding_dim # add speaker embedding dim + if self.gst: + decoder_in_features = decoder_in_features + gst_embedding_dim # add gst embedding dim + # base model layers self.embedding = nn.Embedding(num_chars, 256, padding_idx=0) self.embedding.weight.data.normal_(0, 0.3) @@ -98,10 +106,6 @@ class Tacotron(TacotronAbstract): # B x speaker_embed_dim if speaker_ids is not None: self.compute_speaker_embedding(speaker_ids) - if self.num_speakers > 1: - # B x T_in x embed_dim + speaker_embed_dim - inputs = self._concat_speaker_embedding(inputs, - self.speaker_embeddings) # B x T_in x encoder_in_features encoder_outputs = self.encoder(inputs) # sequence masking @@ -117,8 +121,7 @@ class Tacotron(TacotronAbstract): # alignments: B x T_in x encoder_in_features # stop_tokens: B x T_in decoder_outputs, alignments, stop_tokens = self.decoder( - encoder_outputs, mel_specs, input_mask, - self.speaker_embeddings_projected) + encoder_outputs, mel_specs, input_mask) # sequence masking if output_mask is not None: decoder_outputs = decoder_outputs * output_mask.unsqueeze(1).expand_as(decoder_outputs) @@ -145,9 +148,6 @@ class Tacotron(TacotronAbstract): self._init_states() if speaker_ids is not None: self.compute_speaker_embedding(speaker_ids) - if self.num_speakers > 1: - inputs = self._concat_speaker_embedding(inputs, - self.speaker_embeddings) encoder_outputs = self.encoder(inputs) if self.gst and style_mel is not None: encoder_outputs = self.compute_gst(encoder_outputs, style_mel) diff --git a/mozilla_voice_tts/tts/models/tacotron2.py b/mozilla_voice_tts/tts/models/tacotron2.py index 59e44fb2..944138bc 100644 --- a/mozilla_voice_tts/tts/models/tacotron2.py +++ b/mozilla_voice_tts/tts/models/tacotron2.py @@ -5,7 +5,6 @@ from mozilla_voice_tts.tts.layers.gst_layers import GST from mozilla_voice_tts.tts.layers.tacotron2 import Decoder, Encoder, Postnet from mozilla_voice_tts.tts.models.tacotron_abstract import TacotronAbstract - # TODO: match function arguments with tacotron class Tacotron2(TacotronAbstract): def __init__(self, @@ -86,24 +85,6 @@ class Tacotron2(TacotronAbstract): mel_outputs_postnet = mel_outputs_postnet.transpose(1, 2) return mel_outputs, 
mel_outputs_postnet, alignments - def compute_gst(self, inputs, style_input): - """ Compute global style token """ - device = inputs.device - if isinstance(style_input, dict): - query = torch.zeros(1, 1, self.gst_embedding_dim//2).to(device) - _GST = torch.tanh(self.gst_layer.style_token_layer.style_tokens) - gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device) - for k_token, v_amplifier in style_input.items(): - key = _GST[int(k_token)].unsqueeze(0).expand(1, -1, -1) - gst_outputs_att = self.gst_layer.style_token_layer.attention(query, key) - gst_outputs = gst_outputs + gst_outputs_att * v_amplifier - elif style_input is None: - gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device) - else: - gst_outputs = self.gst_layer(style_input) # pylint: disable=not-callable - embedded_gst = gst_outputs.repeat(1, inputs.size(1), 1) - return inputs, embedded_gst - def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, speaker_ids=None): # compute mask for padding # B x T_in_max (boolean) @@ -113,20 +94,13 @@ class Tacotron2(TacotronAbstract): # B x T_in_max x D_en encoder_outputs = self.encoder(embedded_inputs, text_lengths) + if self.gst: + # B x gst_dim + encoder_outputs = self.compute_gst(encoder_outputs, mel_specs) + if self.num_speakers > 1: embedded_speakers = self.speaker_embedding(speaker_ids)[:, None] - embedded_speakers = embedded_speakers.repeat(1, encoder_outputs.size(1), 1) - if hasattr(self, 'gst'): - # B x gst_dim - encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, mel_specs) - encoder_outputs = torch.cat([encoder_outputs, embedded_gst, embedded_speakers], dim=-1) - else: - encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1) - else: - if hasattr(self, 'gst'): - # B x gst_dim - encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, mel_specs) - encoder_outputs = torch.cat([encoder_outputs, embedded_gst], dim=-1) + encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers) encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs) @@ -163,15 +137,14 @@ class Tacotron2(TacotronAbstract): embedded_speakers = embedded_speakers.repeat(1, encoder_outputs.size(1), 1) if hasattr(self, 'gst'): # B x gst_dim - encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, style_mel) - encoder_outputs = torch.cat([encoder_outputs, embedded_gst, embedded_speakers], dim=-1) + encoder_outputs = self.compute_gst(encoder_outputs, style_mel) + encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1) else: encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1) else: if hasattr(self, 'gst'): # B x gst_dim - encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, style_mel) - encoder_outputs = torch.cat([encoder_outputs, embedded_gst], dim=-1) + encoder_outputs = self.compute_gst(encoder_outputs, style_mel) decoder_outputs, alignments, stop_tokens = self.decoder.inference( encoder_outputs) @@ -193,15 +166,13 @@ class Tacotron2(TacotronAbstract): embedded_speakers = embedded_speakers.repeat(1, encoder_outputs.size(1), 1) if hasattr(self, 'gst'): # B x gst_dim - encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, style_mel) - encoder_outputs = torch.cat([encoder_outputs, embedded_gst, embedded_speakers], dim=-1) + encoder_outputs = self.compute_gst(encoder_outputs, style_mel) else: encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1) else: if hasattr(self, 'gst'): # B x gst_dim - 
encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, style_mel) - encoder_outputs = torch.cat([encoder_outputs, embedded_gst], dim=-1) + encoder_outputs = self.compute_gst(encoder_outputs, style_mel) mel_outputs, alignments, stop_tokens = self.decoder.inference_truncated( encoder_outputs) diff --git a/mozilla_voice_tts/tts/models/tacotron_abstract.py b/mozilla_voice_tts/tts/models/tacotron_abstract.py index a4b8c227..6f3d32ad 100644 --- a/mozilla_voice_tts/tts/models/tacotron_abstract.py +++ b/mozilla_voice_tts/tts/models/tacotron_abstract.py @@ -164,11 +164,22 @@ class TacotronAbstract(ABC, nn.Module): self.speaker_embeddings_projected = self.speaker_project_mel( self.speaker_embeddings).squeeze(1) - def compute_gst(self, inputs, mel_specs): + def compute_gst(self, inputs, style_input): """ Compute global style token """ - # pylint: disable=not-callable - gst_outputs = self.gst_layer(mel_specs) - inputs = self._add_speaker_embedding(inputs, gst_outputs) + device = inputs.device + if isinstance(style_input, dict): + query = torch.zeros(1, 1, self.gst_embedding_dim//2).to(device) + _GST = torch.tanh(self.gst_layer.style_token_layer.style_tokens) + gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device) + for k_token, v_amplifier in style_input.items(): + key = _GST[int(k_token)].unsqueeze(0).expand(1, -1, -1) + gst_outputs_att = self.gst_layer.style_token_layer.attention(query, key) + gst_outputs = gst_outputs + gst_outputs_att * v_amplifier + elif style_input is None: + gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device) + else: + gst_outputs = self.gst_layer(style_input) # pylint: disable=not-callable + inputs = self._concat_speaker_embedding(inputs, gst_outputs) return inputs @staticmethod diff --git a/mozilla_voice_tts/tts/utils/generic_utils.py b/mozilla_voice_tts/tts/utils/generic_utils.py index 212379a3..9c3c618d 100644 --- a/mozilla_voice_tts/tts/utils/generic_utils.py +++ b/mozilla_voice_tts/tts/utils/generic_utils.py @@ -257,10 +257,16 @@ def check_config(c): check_argument('gst_num_heads', c['gst'], restricted=True, val_type=int, min_val=1) check_argument('gst_style_tokens', c['gst'], restricted=True, val_type=int, min_val=1) + check_argument('gst', c, restricted=True, val_type=dict) + check_argument('gst_style_input', c['gst'], restricted=True, val_type=[str, dict]) + check_argument('gst_embedding_dim', c['gst'], restricted=True, val_type=int, min_val=0, max_val=1000) + check_argument('gst_num_heads', c['gst'], restricted=True, val_type=int, min_val=2, max_val=10) + check_argument('gst_style_tokens', c['gst'], restricted=True, val_type=int, min_val=1, max_val=1000) + # datasets - checking only the first entry check_argument('datasets', c, restricted=True, val_type=list) for dataset_entry in c['datasets']: check_argument('name', dataset_entry, restricted=True, val_type=str) check_argument('path', dataset_entry, restricted=True, val_type=str) - check_argument('meta_file_train', dataset_entry, restricted=True, val_type=str) + check_argument('meta_file_train', dataset_entry, restricted=True, val_type=[str, list]) check_argument('meta_file_val', dataset_entry, restricted=True, val_type=str) diff --git a/mozilla_voice_tts/tts/utils/text/cleaners.py b/mozilla_voice_tts/tts/utils/text/cleaners.py index 227118e6..b1930834 100644 --- a/mozilla_voice_tts/tts/utils/text/cleaners.py +++ b/mozilla_voice_tts/tts/utils/text/cleaners.py @@ -107,7 +107,6 @@ def basic_turkish_cleaners(text): text = collapse_whitespace(text) return text - def 
english_cleaners(text): '''Pipeline for English text, including number and abbreviation expansion.''' text = convert_to_ascii(text) diff --git a/mozilla_voice_tts/utils/generic_utils.py b/mozilla_voice_tts/utils/generic_utils.py index 478b4358..add5120d 100644 --- a/mozilla_voice_tts/utils/generic_utils.py +++ b/mozilla_voice_tts/utils/generic_utils.py @@ -146,5 +146,12 @@ def check_argument(name, c, enum_list=None, max_val=None, min_val=None, restrict assert c[name] >= min_val, f' [!] {name} is smaller than min value {min_val}' if enum_list: assert c[name].lower() in enum_list, f' [!] {name} is not a valid value' - if val_type: + if isinstance(val_type, list): + valid_types = val_type + is_valid = False + for typ in val_type: + if isinstance(c[name], typ): + is_valid = True + assert is_valid or c[name] is None, f' [!] {name} has wrong type - {type(c[name])} vs {val_type}' + elif val_type: assert isinstance(c[name], val_type) or c[name] is None, f' [!] {name} has wrong type - {type(c[name])} vs {val_type}' From 8a1c113df6403b0aae6d951fec8624643953e018 Mon Sep 17 00:00:00 2001 From: Edresson Date: Wed, 29 Jul 2020 00:49:00 -0300 Subject: [PATCH 11/56] add External Embedding per sample instead of nn.Embedding --- mozilla_voice_tts/bin/train_tts.py | 89 +++++++++++++------- mozilla_voice_tts/tts/configs/config.json | 8 +- mozilla_voice_tts/tts/datasets/TTSDataset.py | 18 +++- mozilla_voice_tts/tts/layers/tacotron.py | 11 +-- mozilla_voice_tts/tts/layers/tacotron2.py | 9 +- mozilla_voice_tts/tts/models/tacotron.py | 75 ++++++++++------- mozilla_voice_tts/tts/models/tacotron2.py | 84 ++++++++++-------- mozilla_voice_tts/tts/utils/generic_utils.py | 16 ++-- mozilla_voice_tts/tts/utils/speakers.py | 7 +- 9 files changed, 190 insertions(+), 127 deletions(-) diff --git a/mozilla_voice_tts/bin/train_tts.py b/mozilla_voice_tts/bin/train_tts.py index 719b926f..0642f290 100644 --- a/mozilla_voice_tts/bin/train_tts.py +++ b/mozilla_voice_tts/bin/train_tts.py @@ -49,7 +49,7 @@ from mozilla_voice_tts.utils.training import (NoamLR, adam_weight_decay, use_cuda, num_gpus = setup_torch_training_env(True, False) -def setup_loader(ap, r, is_val=False, verbose=False): +def setup_loader(ap, r, is_val=False, verbose=False, speaker_mapping=None): if is_val and not c.run_eval: loader = None else: @@ -68,7 +68,8 @@ def setup_loader(ap, r, is_val=False, verbose=False): use_phonemes=c.use_phonemes, phoneme_language=c.phoneme_language, enable_eos_bos=c.enable_eos_bos_chars, - verbose=verbose) + verbose=verbose, + speaker_mapping=speaker_mapping if c.use_speaker_embedding and c.use_external_speaker_embedding_file else None) sampler = DistributedSampler(dataset) if num_gpus > 1 else None loader = DataLoader( dataset, @@ -82,9 +83,8 @@ def setup_loader(ap, r, is_val=False, verbose=False): pin_memory=False) return loader - -def format_data(data): - if c.use_speaker_embedding: +def format_data(data, speaker_mapping=None): + if speaker_mapping is None and c.use_speaker_embedding and not c.use_external_speaker_embedding_file: speaker_mapping = load_speaker_mapping(OUT_PATH) # setup input data @@ -99,13 +99,20 @@ def format_data(data): avg_spec_length = torch.mean(mel_lengths.float()) if c.use_speaker_embedding: - speaker_ids = [ - speaker_mapping[speaker_name] for speaker_name in speaker_names - ] - speaker_ids = torch.LongTensor(speaker_ids) + if c.use_external_speaker_embedding_file: + speaker_embeddings = data[8] + speaker_ids = None + else: + speaker_ids = [ + speaker_mapping[speaker_name] for speaker_name in speaker_names 
+ ] + speaker_ids = torch.LongTensor(speaker_ids) + speaker_embeddings = None else: + speaker_embeddings = None speaker_ids = None + # set stop targets view, we predict a single stop token per iteration. stop_targets = stop_targets.view(text_input.shape[0], stop_targets.size(1) // c.r, -1) @@ -122,13 +129,16 @@ def format_data(data): stop_targets = stop_targets.cuda(non_blocking=True) if speaker_ids is not None: speaker_ids = speaker_ids.cuda(non_blocking=True) - return text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, avg_text_length, avg_spec_length + if speaker_embeddings is not None: + speaker_embeddings = speaker_embeddings.cuda(non_blocking=True) + + return text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, speaker_embeddings, avg_text_length, avg_spec_length def train(model, criterion, optimizer, optimizer_st, scheduler, - ap, global_step, epoch, amp): + ap, global_step, epoch, amp, speaker_mapping=None): data_loader = setup_loader(ap, model.decoder.r, is_val=False, - verbose=(epoch == 0)) + verbose=(epoch == 0), speaker_mapping=speaker_mapping) model.train() epoch_time = 0 keep_avg = KeepAverage() @@ -143,7 +153,7 @@ def train(model, criterion, optimizer, optimizer_st, scheduler, start_time = time.time() # format data - text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, avg_text_length, avg_spec_length = format_data(data) + text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, speaker_embeddings, avg_text_length, avg_spec_length = format_data(data, speaker_mapping) loader_time = time.time() - end_time global_step += 1 @@ -158,10 +168,10 @@ def train(model, criterion, optimizer, optimizer_st, scheduler, # forward pass model if c.bidirectional_decoder or c.double_decoder_consistency: decoder_output, postnet_output, alignments, stop_tokens, decoder_backward_output, alignments_backward = model( - text_input, text_lengths, mel_input, mel_lengths, speaker_ids=speaker_ids) + text_input, text_lengths, mel_input, mel_lengths, speaker_ids=speaker_ids, speaker_embeddings=speaker_embeddings) else: decoder_output, postnet_output, alignments, stop_tokens = model( - text_input, text_lengths, mel_input, mel_lengths, speaker_ids=speaker_ids) + text_input, text_lengths, mel_input, mel_lengths, speaker_ids=speaker_ids, speaker_embeddings=speaker_embeddings) decoder_backward_output = None alignments_backward = None @@ -312,8 +322,8 @@ def train(model, criterion, optimizer, optimizer_st, scheduler, @torch.no_grad() -def evaluate(model, criterion, ap, global_step, epoch): - data_loader = setup_loader(ap, model.decoder.r, is_val=True) +def evaluate(model, criterion, ap, global_step, epoch, speaker_mapping=None): + data_loader = setup_loader(ap, model.decoder.r, is_val=True, speaker_mapping=speaker_mapping) model.eval() epoch_time = 0 keep_avg = KeepAverage() @@ -323,16 +333,16 @@ def evaluate(model, criterion, ap, global_step, epoch): start_time = time.time() # format data - text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, _, _ = format_data(data) + text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, speaker_embeddings, _, _ = format_data(data, speaker_mapping) assert mel_input.shape[1] % model.decoder.r == 0 # forward pass model if c.bidirectional_decoder or c.double_decoder_consistency: decoder_output, postnet_output, alignments, stop_tokens, decoder_backward_output, 
alignments_backward = model( - text_input, text_lengths, mel_input, speaker_ids=speaker_ids) + text_input, text_lengths, mel_input, speaker_ids=speaker_ids, speaker_embeddings=speaker_embeddings) else: decoder_output, postnet_output, alignments, stop_tokens = model( - text_input, text_lengths, mel_input, speaker_ids=speaker_ids) + text_input, text_lengths, mel_input, speaker_ids=speaker_ids, speaker_embeddings=speaker_embeddings) decoder_backward_output = None alignments_backward = None @@ -494,22 +504,41 @@ def main(args): # pylint: disable=redefined-outer-name if c.use_speaker_embedding: speakers = get_speakers(meta_data_train) if args.restore_path: - prev_out_path = os.path.dirname(args.restore_path) - speaker_mapping = load_speaker_mapping(prev_out_path) - assert all([speaker in speaker_mapping - for speaker in speakers]), "As of now you, you cannot " \ - "introduce new speakers to " \ - "a previously trained model." - else: + if c.use_external_speaker_embedding_file: # if restore checkpoint and use External Embedding file + prev_out_path = os.path.dirname(args.restore_path) + speaker_mapping = load_speaker_mapping(prev_out_path) + if not speaker_mapping: + print("WARNING: speakers.json speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file") + speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file) + if not speaker_mapping: + raise RuntimeError("You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.external_speaker_embedding_file") + speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]['embedding']) + elif not c.use_external_speaker_embedding_file: # if restore checkpoint and don't use External Embedding file + prev_out_path = os.path.dirname(args.restore_path) + speaker_mapping = load_speaker_mapping(prev_out_path) + speaker_embedding_dim = None + assert all([speaker in speaker_mapping + for speaker in speakers]), "As of now you, you cannot " \ + "introduce new speakers to " \ + "a previously trained model." 
+ elif c.use_external_speaker_embedding_file and c.external_speaker_embedding_file: # if start new train using External Embedding file + speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file) + print(speaker_mapping) + speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]['embedding']) + elif c.use_external_speaker_embedding_file and not c.external_speaker_embedding_file: # if start new train using External Embedding file and don't pass external embedding file + raise "use_external_speaker_embedding_file is True, so you need pass a external speaker embedding file, run GE2E-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb or AngularPrototypical-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb notebook in notebooks/ folder" + else: # if start new train and don't use External Embedding file speaker_mapping = {name: i for i, name in enumerate(speakers)} + speaker_embedding_dim = None save_speaker_mapping(OUT_PATH, speaker_mapping) num_speakers = len(speaker_mapping) print("Training with {} speakers: {}".format(num_speakers, ", ".join(speakers))) else: num_speakers = 0 + speaker_embedding_dim = None - model = setup_model(num_chars, num_speakers, c) + model = setup_model(num_chars, num_speakers, c, speaker_embedding_dim) params = set_weight_decay(model, c.wd) optimizer = RAdam(params, lr=c.lr, weight_decay=0) @@ -530,6 +559,8 @@ def main(args): # pylint: disable=redefined-outer-name # setup criterion criterion = TacotronLoss(c, stopnet_pos_weight=10.0, ga_sigma=0.4) + for name, _ in model.named_parameters(): + print(name) if args.restore_path: checkpoint = torch.load(args.restore_path, map_location='cpu') @@ -592,7 +623,7 @@ def main(args): # pylint: disable=redefined-outer-name print("\n > Number of output frames:", model.decoder.r) train_avg_loss_dict, global_step = train(model, criterion, optimizer, optimizer_st, scheduler, ap, - global_step, epoch, amp) + global_step, epoch, amp, speaker_mapping) eval_avg_loss_dict = evaluate(model, criterion, ap, global_step, epoch) c_logger.print_epoch_end(epoch, eval_avg_loss_dict) target_loss = train_avg_loss_dict['avg_postnet_loss'] diff --git a/mozilla_voice_tts/tts/configs/config.json b/mozilla_voice_tts/tts/configs/config.json index 154e1961..090540ab 100644 --- a/mozilla_voice_tts/tts/configs/config.json +++ b/mozilla_voice_tts/tts/configs/config.json @@ -1,5 +1,5 @@ { - "model": "Tacotron", + "model": "Tacotron2", "run_name": "ljspeech-ddc-bn", "run_description": "tacotron2 with ddc and batch-normalization", @@ -114,7 +114,7 @@ "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. // DATA LOADING - "text_cleaner": "portuguese_cleaners", + "text_cleaner": "phoneme_cleaners", "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. "num_val_loader_workers": 4, // number of evaluation data loader processes. @@ -131,7 +131,9 @@ "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages // MULTI-SPEAKER and GST - "use_speaker_embedding": true, // use speaker embedding to enable multi-speaker learning. + "use_speaker_embedding": true, // use speaker embedding to enable multi-speaker learning. 
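For reference, load_speaker_mapping() above can return two different speakers.json layouts: without an external embedding file the mapping is simply speaker name to integer id (as written by save_speaker_mapping() above), while the external embedding file enabled by the config keys added just below is keyed by wav file name and carries one embedding vector per sample; only the 'embedding' field is read, and its length is what sets speaker_embedding_dim. A minimal sketch with made-up file names, speakers and values; real embedding vectors are much longer and their size depends on the speaker encoder used:

// speakers.json written by save_speaker_mapping() for nn.Embedding-based training
{"p225": 0, "p226": 1, "p227": 2}

// external speaker embedding file, one entry per wav file
{
    "p225_001.wav": {"embedding": [0.012, -0.034, 0.101, 0.087]},
    "p226_003.wav": {"embedding": [0.044, 0.007, -0.129, 0.215]}
}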
+ "use_external_speaker_embedding_file": false, // if true, forces the model to use external embedding per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 + "external_speaker_embedding_file": "../../speakers-vctk-en.json", // if not null and use_external_speaker_embedding_file is true, it is used to load a specific embedding file and thus uses these embeddings instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 "use_gst": true, // use global style tokens "gst": { // gst parameter if gst is enabled "gst_style_input": null, // Condition the style input either on a diff --git a/mozilla_voice_tts/tts/datasets/TTSDataset.py b/mozilla_voice_tts/tts/datasets/TTSDataset.py index cc6b72b5..1002a292 100644 --- a/mozilla_voice_tts/tts/datasets/TTSDataset.py +++ b/mozilla_voice_tts/tts/datasets/TTSDataset.py @@ -24,6 +24,7 @@ class MyDataset(Dataset): phoneme_cache_path=None, phoneme_language="en-us", enable_eos_bos=False, + speaker_mapping=None, verbose=False): """ Args: @@ -58,6 +59,7 @@ class MyDataset(Dataset): self.phoneme_cache_path = phoneme_cache_path self.phoneme_language = phoneme_language self.enable_eos_bos = enable_eos_bos + self.speaker_mapping = speaker_mapping self.verbose = verbose if use_phonemes and not os.path.isdir(phoneme_cache_path): os.makedirs(phoneme_cache_path, exist_ok=True) @@ -127,7 +129,8 @@ class MyDataset(Dataset): 'text': text, 'wav': wav, 'item_idx': self.items[idx][1], - 'speaker_name': speaker_name + 'speaker_name': speaker_name, + 'wav_file_name': os.path.basename(wav_file) } return sample @@ -191,9 +194,15 @@ class MyDataset(Dataset): batch[idx]['item_idx'] for idx in ids_sorted_decreasing ] text = [batch[idx]['text'] for idx in ids_sorted_decreasing] + speaker_name = [batch[idx]['speaker_name'] for idx in ids_sorted_decreasing] - + # get speaker embeddings + if self.speaker_mapping is not None: + wav_files_names = [batch[idx]['wav_file_name'] for idx in ids_sorted_decreasing] + speaker_embedding = [self.speaker_mapping[w]['embedding'] for w in wav_files_names] + else: + speaker_embedding = None # compute features mel = [self.ap.melspectrogram(w).astype('float32') for w in wav] @@ -224,6 +233,9 @@ class MyDataset(Dataset): mel_lengths = torch.LongTensor(mel_lengths) stop_targets = torch.FloatTensor(stop_targets) + if speaker_embedding is not None: + speaker_embedding = torch.FloatTensor(speaker_embedding) + # compute linear spectrogram if self.compute_linear_spec: linear = [self.ap.spectrogram(w).astype('float32') for w in wav] @@ -234,7 +246,7 @@ class MyDataset(Dataset): else: linear = None return text, text_lenghts, speaker_name, linear, mel, mel_lengths, \ - stop_targets, item_idxs + stop_targets, item_idxs, speaker_embedding raise TypeError(("batch must contain tensors, numbers, dicts or lists;\ found {}".format(type(batch[0])))) diff --git a/mozilla_voice_tts/tts/layers/tacotron.py b/mozilla_voice_tts/tts/layers/tacotron.py index bbeee95f..9dcebd0f 100644 --- a/mozilla_voice_tts/tts/layers/tacotron.py +++ b/mozilla_voice_tts/tts/layers/tacotron.py @@ -291,7 +291,7 @@ class Decoder(nn.Module): def __init__(self, in_channels, frame_channels, r, memory_size, attn_type, attn_windowing, attn_norm, prenet_type, prenet_dropout, forward_attn, trans_agent, forward_attn_mask, location_attn, attn_K, - separate_stopnet, speaker_embedding_dim): + separate_stopnet): super(Decoder, self).__init__() self.r_init = r 
self.r = r @@ -462,15 +462,12 @@ class Decoder(nn.Module): t += 1 return self._parse_outputs(outputs, attentions, stop_tokens) - def inference(self, inputs, speaker_embeddings=None): + def inference(self, inputs): """ Args: inputs: encoder outputs. - speaker_embeddings: speaker vectors. - Shapes: - - inputs: (B, T, D_out_enc) - - speaker_embeddings: (B, D_embed) + - inputs: batch x time x encoder_out_dim """ outputs = [] attentions = [] @@ -483,8 +480,6 @@ class Decoder(nn.Module): if t > 0: new_memory = outputs[-1] self._update_memory_input(new_memory) - if speaker_embeddings is not None: - self.memory_input = torch.cat([self.memory_input, speaker_embeddings], dim=-1) output, stop_token, attention = self.decode(inputs, None) stop_token = torch.sigmoid(stop_token.data) outputs += [output] diff --git a/mozilla_voice_tts/tts/layers/tacotron2.py b/mozilla_voice_tts/tts/layers/tacotron2.py index 7c0dd443..5d6ced25 100644 --- a/mozilla_voice_tts/tts/layers/tacotron2.py +++ b/mozilla_voice_tts/tts/layers/tacotron2.py @@ -147,8 +147,7 @@ class Decoder(nn.Module): #pylint: disable=attribute-defined-outside-init def __init__(self, in_channels, frame_channels, r, attn_type, attn_win, attn_norm, prenet_type, prenet_dropout, forward_attn, trans_agent, - forward_attn_mask, location_attn, attn_K, separate_stopnet, - speaker_embedding_dim): + forward_attn_mask, location_attn, attn_K, separate_stopnet): super(Decoder, self).__init__() self.frame_channels = frame_channels self.r_init = r @@ -335,16 +334,14 @@ class Decoder(nn.Module): outputs, stop_tokens, alignments) return outputs, alignments, stop_tokens - def inference(self, inputs, speaker_embeddings=None): + def inference(self, inputs): r"""Decoder inference without teacher forcing and use Stopnet to stop decoder. Args: inputs: Encoder outputs. - speaker_embeddings: speaker embedding vectors. 
Shapes: - inputs: (B, T, D_out_enc) - - speaker_embeddings: (B, D_embed) - outputs: (B, T_mel, D_mel) - alignments: (B, T_in, T_out) - stop_tokens: (B, T_out) @@ -358,8 +355,6 @@ class Decoder(nn.Module): outputs, stop_tokens, alignments, t = [], [], [], 0 while True: memory = self.prenet(memory) - if speaker_embeddings is not None: - memory = torch.cat([memory, speaker_embeddings], dim=-1) decoder_output, alignment, stop_token = self.decode(memory) stop_token = torch.sigmoid(stop_token.data) outputs += [decoder_output.squeeze(1)] diff --git a/mozilla_voice_tts/tts/models/tacotron.py b/mozilla_voice_tts/tts/models/tacotron.py index 8eda83b3..9dfdbf63 100644 --- a/mozilla_voice_tts/tts/models/tacotron.py +++ b/mozilla_voice_tts/tts/models/tacotron.py @@ -27,6 +27,7 @@ class Tacotron(TacotronAbstract): bidirectional_decoder=False, double_decoder_consistency=False, ddc_r=None, + speaker_embedding_dim=None, gst=False, gst_embedding_dim=256, gst_num_heads=4, @@ -40,39 +41,46 @@ class Tacotron(TacotronAbstract): location_attn, attn_K, separate_stopnet, bidirectional_decoder, double_decoder_consistency, ddc_r, gst) - - + # init layer dims decoder_in_features = 256 encoder_in_features = 256 - speaker_embedding_dim = 256 - proj_speaker_dim = 80 if num_speakers > 1 else 0 + if speaker_embedding_dim is None: + # if speaker_embedding_dim is None we need use the nn.Embedding, with default speaker_embedding_dim + self.embeddings_per_sample = False + speaker_embedding_dim = 256 + else: + # if speaker_embedding_dim is not None we need use speaker embedding per sample + self.embeddings_per_sample = True + + # speaker and gst embeddings is concat in decoder input if num_speakers > 1: decoder_in_features = decoder_in_features + speaker_embedding_dim # add speaker embedding dim if self.gst: decoder_in_features = decoder_in_features + gst_embedding_dim # add gst embedding dim - # base model layers + # embedding layer self.embedding = nn.Embedding(num_chars, 256, padding_idx=0) + + # speaker embedding layers + if num_speakers > 1: + if not self.embeddings_per_sample: + self.speaker_embedding = nn.Embedding(num_speakers, speaker_embedding_dim) + self.speaker_embedding.weight.data.normal_(0, 0.3) + + # base model layers self.embedding.weight.data.normal_(0, 0.3) self.encoder = Encoder(encoder_in_features) self.decoder = Decoder(decoder_in_features, decoder_output_dim, r, memory_size, attn_type, attn_win, attn_norm, prenet_type, prenet_dropout, forward_attn, trans_agent, forward_attn_mask, location_attn, - attn_K, separate_stopnet, proj_speaker_dim) + attn_K, separate_stopnet) self.postnet = PostCBHG(decoder_output_dim) self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2, postnet_output_dim) - # speaker embedding layers - if num_speakers > 1: - self.speaker_embedding = nn.Embedding(num_speakers, speaker_embedding_dim) - self.speaker_embedding.weight.data.normal_(0, 0.3) - self.speaker_project_mel = nn.Sequential( - nn.Linear(speaker_embedding_dim, proj_speaker_dim), nn.Tanh()) - self.speaker_embeddings = None - self.speaker_embeddings_projected = None + # global style token layers if self.gst: self.gst_layer = GST(num_mel=80, @@ -88,10 +96,9 @@ class Tacotron(TacotronAbstract): decoder_in_features, decoder_output_dim, ddc_r, memory_size, attn_type, attn_win, attn_norm, prenet_type, prenet_dropout, forward_attn, trans_agent, forward_attn_mask, location_attn, - attn_K, separate_stopnet, proj_speaker_dim) + attn_K, separate_stopnet) - - def forward(self, characters, text_lengths, mel_specs, 
mel_lengths=None, speaker_ids=None): + def forward(self, characters, text_lengths, mel_specs, mel_lengths=None, speaker_ids=None, speaker_embeddings=None): """ Shapes: - characters: B x T_in @@ -99,24 +106,27 @@ class Tacotron(TacotronAbstract): - mel_specs: B x T_out x D - speaker_ids: B x 1 """ - self._init_states() input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths) # B x T_in x embed_dim inputs = self.embedding(characters) - # B x speaker_embed_dim - if speaker_ids is not None: - self.compute_speaker_embedding(speaker_ids) # B x T_in x encoder_in_features encoder_outputs = self.encoder(inputs) # sequence masking encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs) + # global style token if self.gst: # B x gst_dim encoder_outputs = self.compute_gst(encoder_outputs, mel_specs) + # speaker embedding if self.num_speakers > 1: - encoder_outputs = self._concat_speaker_embedding( - encoder_outputs, self.speaker_embeddings) + if not self.embeddings_per_sample: + # B x 1 x speaker_embed_dim + speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None] + else: + # B x 1 x speaker_embed_dim + speaker_embeddings = torch.unsqueeze(speaker_embeddings, 1) + encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) # decoder_outputs: B x decoder_in_features x T_out # alignments: B x T_in x encoder_in_features # stop_tokens: B x T_in @@ -143,19 +153,22 @@ class Tacotron(TacotronAbstract): return decoder_outputs, postnet_outputs, alignments, stop_tokens @torch.no_grad() - def inference(self, characters, speaker_ids=None, style_mel=None): + def inference(self, characters, speaker_ids=None, style_mel=None, speaker_embeddings=None): inputs = self.embedding(characters) - self._init_states() - if speaker_ids is not None: - self.compute_speaker_embedding(speaker_ids) encoder_outputs = self.encoder(inputs) - if self.gst and style_mel is not None: + if self.gst: + # B x gst_dim encoder_outputs = self.compute_gst(encoder_outputs, style_mel) if self.num_speakers > 1: - encoder_outputs = self._concat_speaker_embedding( - encoder_outputs, self.speaker_embeddings) + if not self.embeddings_per_sample: + # B x 1 x speaker_embed_dim + speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None] + else: + # B x 1 x speaker_embed_dim + speaker_embeddings = torch.unsqueeze(speaker_embeddings, 1) + encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) decoder_outputs, alignments, stop_tokens = self.decoder.inference( - encoder_outputs, self.speaker_embeddings_projected) + encoder_outputs) postnet_outputs = self.postnet(decoder_outputs) postnet_outputs = self.last_linear(postnet_outputs) decoder_outputs = decoder_outputs.transpose(1, 2) diff --git a/mozilla_voice_tts/tts/models/tacotron2.py b/mozilla_voice_tts/tts/models/tacotron2.py index 944138bc..45e743e5 100644 --- a/mozilla_voice_tts/tts/models/tacotron2.py +++ b/mozilla_voice_tts/tts/models/tacotron2.py @@ -27,6 +27,7 @@ class Tacotron2(TacotronAbstract): bidirectional_decoder=False, double_decoder_consistency=False, ddc_r=None, + speaker_embedding_dim=None, gst=False, gst_embedding_dim=512, gst_num_heads=4, @@ -41,25 +42,38 @@ class Tacotron2(TacotronAbstract): ddc_r, gst) # init layer dims - speaker_embedding_dim = 512 if num_speakers > 1 else 0 - gst_embedding_dim = gst_embedding_dim if self.gst else 0 - decoder_in_features = 512+speaker_embedding_dim+gst_embedding_dim - encoder_in_features = 512 if num_speakers > 1 else 512 - 
proj_speaker_dim = 80 if num_speakers > 1 else 0 + decoder_in_features = 512 + encoder_in_features = 512 + + if speaker_embedding_dim is None: + # if speaker_embedding_dim is None we need use the nn.Embedding, with default speaker_embedding_dim + self.embeddings_per_sample = False + speaker_embedding_dim = 512 + else: + # if speaker_embedding_dim is not None we need use speaker embedding per sample + self.embeddings_per_sample = True + + # speaker and gst embeddings is concat in decoder input + if num_speakers > 1: + decoder_in_features = decoder_in_features + speaker_embedding_dim # add speaker embedding dim + if self.gst: + decoder_in_features = decoder_in_features + gst_embedding_dim # add gst embedding dim # embedding layer self.embedding = nn.Embedding(num_chars, 512, padding_idx=0) # speaker embedding layer if num_speakers > 1: - self.speaker_embedding = nn.Embedding(num_speakers, speaker_embedding_dim) - self.speaker_embedding.weight.data.normal_(0, 0.3) + if not self.embeddings_per_sample: + self.speaker_embedding = nn.Embedding(num_speakers, speaker_embedding_dim) + self.speaker_embedding.weight.data.normal_(0, 0.3) + # base model layers self.encoder = Encoder(encoder_in_features) self.decoder = Decoder(decoder_in_features, self.decoder_output_dim, r, attn_type, attn_win, attn_norm, prenet_type, prenet_dropout, forward_attn, trans_agent, forward_attn_mask, - location_attn, attn_K, separate_stopnet, proj_speaker_dim) + location_attn, attn_K, separate_stopnet) self.postnet = Postnet(self.postnet_output_dim) # global style token layers @@ -77,7 +91,7 @@ class Tacotron2(TacotronAbstract): decoder_in_features, self.decoder_output_dim, ddc_r, attn_type, attn_win, attn_norm, prenet_type, prenet_dropout, forward_attn, trans_agent, forward_attn_mask, location_attn, attn_K, - separate_stopnet, proj_speaker_dim) + separate_stopnet) @staticmethod def shape_outputs(mel_outputs, mel_outputs_postnet, alignments): @@ -85,7 +99,7 @@ class Tacotron2(TacotronAbstract): mel_outputs_postnet = mel_outputs_postnet.transpose(1, 2) return mel_outputs, mel_outputs_postnet, alignments - def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, speaker_ids=None): + def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, speaker_ids=None, speaker_embeddings=None): # compute mask for padding # B x T_in_max (boolean) input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths) @@ -99,8 +113,13 @@ class Tacotron2(TacotronAbstract): encoder_outputs = self.compute_gst(encoder_outputs, mel_specs) if self.num_speakers > 1: - embedded_speakers = self.speaker_embedding(speaker_ids)[:, None] - encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers) + if not self.embeddings_per_sample: + # B x 1 x speaker_embed_dim + speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None] + else: + # B x 1 x speaker_embed_dim + speaker_embeddings = torch.unsqueeze(speaker_embeddings, 1) + encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs) @@ -128,23 +147,18 @@ class Tacotron2(TacotronAbstract): return decoder_outputs, postnet_outputs, alignments, stop_tokens @torch.no_grad() - def inference(self, text, speaker_ids=None, style_mel=None): + def inference(self, text, speaker_ids=None, style_mel=None, speaker_embeddings=None): embedded_inputs = self.embedding(text).transpose(1, 2) encoder_outputs = self.encoder.inference(embedded_inputs) + if 
self.gst: + # B x gst_dim + encoder_outputs = self.compute_gst(encoder_outputs, style_mel) + if self.num_speakers > 1: - embedded_speakers = self.speaker_embedding(speaker_ids)[:, None] - embedded_speakers = embedded_speakers.repeat(1, encoder_outputs.size(1), 1) - if hasattr(self, 'gst'): - # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, style_mel) - encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1) - else: - encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1) - else: - if hasattr(self, 'gst'): - # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, style_mel) + if not self.embeddings_per_sample: + speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None] + encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) decoder_outputs, alignments, stop_tokens = self.decoder.inference( encoder_outputs) @@ -154,25 +168,21 @@ class Tacotron2(TacotronAbstract): decoder_outputs, postnet_outputs, alignments) return decoder_outputs, postnet_outputs, alignments, stop_tokens - def inference_truncated(self, text, speaker_ids=None, style_mel=None): + def inference_truncated(self, text, speaker_ids=None, style_mel=None, speaker_embeddings=None): """ Preserve model states for continuous inference """ embedded_inputs = self.embedding(text).transpose(1, 2) encoder_outputs = self.encoder.inference_truncated(embedded_inputs) + if self.gst: + # B x gst_dim + encoder_outputs = self.compute_gst(encoder_outputs, style_mel) + if self.num_speakers > 1: - embedded_speakers = self.speaker_embedding(speaker_ids)[:, None] - embedded_speakers = embedded_speakers.repeat(1, encoder_outputs.size(1), 1) - if hasattr(self, 'gst'): - # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, style_mel) - else: - encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1) - else: - if hasattr(self, 'gst'): - # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, style_mel) + if not self.embeddings_per_sample: + speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None] + encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) mel_outputs, alignments, stop_tokens = self.decoder.inference_truncated( encoder_outputs) diff --git a/mozilla_voice_tts/tts/utils/generic_utils.py b/mozilla_voice_tts/tts/utils/generic_utils.py index 9c3c618d..2d5044ef 100644 --- a/mozilla_voice_tts/tts/utils/generic_utils.py +++ b/mozilla_voice_tts/tts/utils/generic_utils.py @@ -44,7 +44,7 @@ def sequence_mask(sequence_length, max_len=None): return seq_range_expand < seq_length_expand -def setup_model(num_chars, num_speakers, c): +def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None): print(" > Using model: {}".format(c.model)) MyModel = importlib.import_module('mozilla_voice_tts.tts.models.' 
+ c.model.lower()) MyModel = getattr(MyModel, c.model) @@ -72,7 +72,8 @@ def setup_model(num_chars, num_speakers, c): separate_stopnet=c.separate_stopnet, bidirectional_decoder=c.bidirectional_decoder, double_decoder_consistency=c.double_decoder_consistency, - ddc_r=c.ddc_r) + ddc_r=c.ddc_r, + speaker_embedding_dim=speaker_embedding_dim) elif c.model.lower() == "tacotron2": model = MyModel(num_chars=num_chars, num_speakers=num_speakers, @@ -96,7 +97,8 @@ def setup_model(num_chars, num_speakers, c): separate_stopnet=c.separate_stopnet, bidirectional_decoder=c.bidirectional_decoder, double_decoder_consistency=c.double_decoder_consistency, - ddc_r=c.ddc_r) + ddc_r=c.ddc_r, + speaker_embedding_dim=speaker_embedding_dim) return model @@ -175,7 +177,7 @@ def check_config(c): check_argument('clip_norm', c['audio'], restricted=True, val_type=bool) check_argument('mel_fmin', c['audio'], restricted=True, val_type=float, min_val=0.0, max_val=1000) check_argument('mel_fmax', c['audio'], restricted=True, val_type=float, min_val=500.0) - check_argument('spec_gain', c['audio'], restricted=True, val_type=float, min_val=1, max_val=100) + check_argument('spec_gain', c['audio'], restricted=True, val_type=[int, float], min_val=1, max_val=100) check_argument('do_trim_silence', c['audio'], restricted=True, val_type=bool) check_argument('trim_db', c['audio'], restricted=True, val_type=int) @@ -246,10 +248,10 @@ def check_config(c): # paths check_argument('output_path', c, restricted=True, val_type=str) - # multi-speaker + # multi-speaker and gst check_argument('use_speaker_embedding', c, restricted=True, val_type=bool) - - # GST + check_argument('use_external_speaker_embedding_file', c, restricted=True, val_type=bool) + check_argument('external_speaker_embedding_file', c, restricted=True, val_type=str) check_argument('use_gst', c, restricted=True, val_type=bool) check_argument('gst_style_input', c, restricted=True, val_type=str) check_argument('gst', c, restricted=True, val_type=dict) diff --git a/mozilla_voice_tts/tts/utils/speakers.py b/mozilla_voice_tts/tts/utils/speakers.py index ff624b36..156e42af 100644 --- a/mozilla_voice_tts/tts/utils/speakers.py +++ b/mozilla_voice_tts/tts/utils/speakers.py @@ -10,12 +10,15 @@ def make_speakers_json_path(out_path): def load_speaker_mapping(out_path): """Loads speaker mapping if already present.""" try: - with open(make_speakers_json_path(out_path)) as f: + if os.path.splitext(out_path)[1] == '.json': + json_file = out_path + else: + json_file = make_speakers_json_path(out_path) + with open(json_file) as f: return json.load(f) except FileNotFoundError: return {} - def save_speaker_mapping(out_path, speaker_mapping): """Saves speaker mapping if not yet present.""" speakers_json_path = make_speakers_json_path(out_path) From 1d73566e4e2fe9a4f5e799b80f8506a0f92768e3 Mon Sep 17 00:00:00 2001 From: Edresson Date: Wed, 29 Jul 2020 18:22:13 -0300 Subject: [PATCH 12/56] bugfix in GST --- mozilla_voice_tts/bin/train_tts.py | 6 +++--- mozilla_voice_tts/tts/layers/gst_layers.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/mozilla_voice_tts/bin/train_tts.py b/mozilla_voice_tts/bin/train_tts.py index 0642f290..daa517b9 100644 --- a/mozilla_voice_tts/bin/train_tts.py +++ b/mozilla_voice_tts/bin/train_tts.py @@ -508,7 +508,7 @@ def main(args): # pylint: disable=redefined-outer-name prev_out_path = os.path.dirname(args.restore_path) speaker_mapping = load_speaker_mapping(prev_out_path) if not speaker_mapping: - print("WARNING: speakers.json speakers.json was not 
found in restore_path, trying to use CONFIG.external_speaker_embedding_file") + print("WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file") speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file) if not speaker_mapping: raise RuntimeError("You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.external_speaker_embedding_file") @@ -559,8 +559,6 @@ def main(args): # pylint: disable=redefined-outer-name # setup criterion criterion = TacotronLoss(c, stopnet_pos_weight=10.0, ga_sigma=0.4) - for name, _ in model.named_parameters(): - print(name) if args.restore_path: checkpoint = torch.load(args.restore_path, map_location='cpu') @@ -575,6 +573,8 @@ def main(args): # pylint: disable=redefined-outer-name print(" > Partial model initialization.") model_dict = model.state_dict() model_dict = set_init_dict(model_dict, checkpoint['model'], c) + # torch.save(model_dict, os.path.join(OUT_PATH, 'state_dict.pt')) + # print("State Dict saved for debug in: ", os.path.join(OUT_PATH, 'state_dict.pt')) model.load_state_dict(model_dict) del model_dict diff --git a/mozilla_voice_tts/tts/layers/gst_layers.py b/mozilla_voice_tts/tts/layers/gst_layers.py index 01f90697..a49b14a2 100644 --- a/mozilla_voice_tts/tts/layers/gst_layers.py +++ b/mozilla_voice_tts/tts/layers/gst_layers.py @@ -96,7 +96,7 @@ class StyleTokenLayer(nn.Module): self.key_dim = embedding_dim // num_heads self.style_tokens = nn.Parameter( torch.FloatTensor(num_style_tokens, self.key_dim)) - nn.init.orthogonal_(self.style_tokens) + nn.init.normal_(self.style_tokens, mean=0, std=0.5) self.attention = MultiHeadAttention( query_dim=self.query_dim, key_dim=self.key_dim, From 7c12e94ee4c922c79397b2abd496b5103f8cb83d Mon Sep 17 00:00:00 2001 From: Edresson Date: Wed, 29 Jul 2020 21:21:42 -0300 Subject: [PATCH 13/56] fix Lint check --- mozilla_voice_tts/tts/datasets/TTSDataset.py | 2 +- mozilla_voice_tts/tts/datasets/preprocess.py | 4 ++-- mozilla_voice_tts/tts/models/tacotron.py | 2 +- mozilla_voice_tts/tts/utils/text/cleaners.py | 2 +- mozilla_voice_tts/utils/generic_utils.py | 1 - 5 files changed, 5 insertions(+), 6 deletions(-) diff --git a/mozilla_voice_tts/tts/datasets/TTSDataset.py b/mozilla_voice_tts/tts/datasets/TTSDataset.py index 1002a292..2ef78e11 100644 --- a/mozilla_voice_tts/tts/datasets/TTSDataset.py +++ b/mozilla_voice_tts/tts/datasets/TTSDataset.py @@ -199,7 +199,7 @@ class MyDataset(Dataset): for idx in ids_sorted_decreasing] # get speaker embeddings if self.speaker_mapping is not None: - wav_files_names = [batch[idx]['wav_file_name'] for idx in ids_sorted_decreasing] + wav_files_names = [batch[idx]['wav_file_name'] for idx in ids_sorted_decreasing] speaker_embedding = [self.speaker_mapping[w]['embedding'] for w in wav_files_names] else: speaker_embedding = None diff --git a/mozilla_voice_tts/tts/datasets/preprocess.py b/mozilla_voice_tts/tts/datasets/preprocess.py index 2ad414fb..317673e3 100644 --- a/mozilla_voice_tts/tts/datasets/preprocess.py +++ b/mozilla_voice_tts/tts/datasets/preprocess.py @@ -229,14 +229,14 @@ def vctk(root_path, meta_files=None, wavs_path='wav48'): items = [] meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) for meta_file in meta_files: - txt, speaker_id, txt_file = os.path.relpath(meta_file,root_path).split(os.sep) + _, speaker_id, txt_file = os.path.relpath(meta_file, root_path).split(os.sep) file_id = txt_file.split('.')[0] if isinstance(test_speakers, list): # if is list 
ignore this speakers ids if speaker_id in test_speakers: continue with open(meta_file) as file_text: text = file_text.readlines()[0] - wav_file = os.path.join(root_path, wavs_path, speaker_id,file_id+'.wav') + wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id+'.wav') items.append([text, wav_file, speaker_id]) return items \ No newline at end of file diff --git a/mozilla_voice_tts/tts/models/tacotron.py b/mozilla_voice_tts/tts/models/tacotron.py index 9dfdbf63..f6bd07ed 100644 --- a/mozilla_voice_tts/tts/models/tacotron.py +++ b/mozilla_voice_tts/tts/models/tacotron.py @@ -55,7 +55,7 @@ class Tacotron(TacotronAbstract): self.embeddings_per_sample = True # speaker and gst embeddings is concat in decoder input - if num_speakers > 1: + if num_speakers > 1: decoder_in_features = decoder_in_features + speaker_embedding_dim # add speaker embedding dim if self.gst: decoder_in_features = decoder_in_features + gst_embedding_dim # add gst embedding dim diff --git a/mozilla_voice_tts/tts/utils/text/cleaners.py b/mozilla_voice_tts/tts/utils/text/cleaners.py index b1930834..a36ebe67 100644 --- a/mozilla_voice_tts/tts/utils/text/cleaners.py +++ b/mozilla_voice_tts/tts/utils/text/cleaners.py @@ -119,7 +119,7 @@ def english_cleaners(text): return text def portuguese_cleaners(text): - '''Basic pipeline for Portuguese text. There is no need to expand abbreviation and + '''Basic pipeline for Portuguese text. There is no need to expand abbreviation and numbers, phonemizer already does that''' text = lowercase(text) text = replace_symbols(text, lang='pt') diff --git a/mozilla_voice_tts/utils/generic_utils.py b/mozilla_voice_tts/utils/generic_utils.py index add5120d..dcfbbdc3 100644 --- a/mozilla_voice_tts/utils/generic_utils.py +++ b/mozilla_voice_tts/utils/generic_utils.py @@ -147,7 +147,6 @@ def check_argument(name, c, enum_list=None, max_val=None, min_val=None, restrict if enum_list: assert c[name].lower() in enum_list, f' [!] {name} is not a valid value' if isinstance(val_type, list): - valid_types = val_type is_valid = False for typ in val_type: if isinstance(c[name], typ): From def7e49f5986ed8955a0befb3daddecb0bdb4392 Mon Sep 17 00:00:00 2001 From: Edresson Date: Wed, 29 Jul 2020 23:37:51 -0300 Subject: [PATCH 14/56] travis unit tests fix and add Tacotron and Tacotron 2 GST and MultiSpeaker Tests --- tests/inputs/test_config.json | 15 +++++ tests/test_layers.py | 35 +--------- tests/test_tacotron2_model.py | 119 ++++++++++++++++++++++++++++++++-- tests/test_tacotron_model.py | 79 +++++++++++++++++++++- 4 files changed, 206 insertions(+), 42 deletions(-) diff --git a/tests/inputs/test_config.json b/tests/inputs/test_config.json index 450cb23a..b1e857c0 100644 --- a/tests/inputs/test_config.json +++ b/tests/inputs/test_config.json @@ -53,6 +53,7 @@ "max_seq_len": 300, "log_dir": "tests/outputs/", +<<<<<<< HEAD "use_speaker_embedding": false, "use_gst": false, "gst": { @@ -61,4 +62,18 @@ "gst_num_heads": 4, "gst_style_tokens": 10 } +======= + // MULTI-SPEAKER and GST + "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. + "use_gst": true, // use global style tokens + "gst": { // gst parameter if gst is enabled + "gst_style_input": null, // Condition the style input either on a + // -> wave file [path to wave] or + // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15} + // with the dictionary being len(dict) <= len(gst_style_tokens). 
+ "gst_embedding_dim": 512, + "gst_num_heads": 4, + "gst_style_tokens": 10 + } +>>>>>>> travis unit tests fix and add Tacotron and Tacotron 2 GST and MultiSpeaker Tests } diff --git a/tests/test_layers.py b/tests/test_layers.py index bf036f5c..0b5315c5 100644 --- a/tests/test_layers.py +++ b/tests/test_layers.py @@ -58,8 +58,7 @@ class DecoderTests(unittest.TestCase): trans_agent=True, forward_attn_mask=True, location_attn=True, - separate_stopnet=True, - speaker_embedding_dim=0) + separate_stopnet=True) dummy_input = T.rand(4, 8, 256) dummy_memory = T.rand(4, 2, 80) @@ -71,38 +70,6 @@ class DecoderTests(unittest.TestCase): assert output.shape[2] == 2, "size not {}".format(output.shape[2]) assert stop_tokens.shape[0] == 4 - @staticmethod - def test_in_out_multispeaker(): - layer = Decoder( - in_channels=256, - frame_channels=80, - r=2, - memory_size=4, - attn_windowing=False, - attn_norm="sigmoid", - attn_K=5, - attn_type="graves", - prenet_type='original', - prenet_dropout=True, - forward_attn=True, - trans_agent=True, - forward_attn_mask=True, - location_attn=True, - separate_stopnet=True, - speaker_embedding_dim=80) - dummy_input = T.rand(4, 8, 256) - dummy_memory = T.rand(4, 2, 80) - dummy_embed = T.rand(4, 80) - - output, alignment, stop_tokens = layer( - dummy_input, dummy_memory, mask=None, speaker_embeddings=dummy_embed) - - assert output.shape[0] == 4 - assert output.shape[1] == 80, "size not {}".format(output.shape[1]) - assert output.shape[2] == 2, "size not {}".format(output.shape[2]) - assert stop_tokens.shape[0] == 4 - - class EncoderTests(unittest.TestCase): def test_in_out(self): #pylint: disable=no-self-use layer = Encoder(128) diff --git a/tests/test_tacotron2_model.py b/tests/test_tacotron2_model.py index a0c5e59a..d4d5eb86 100644 --- a/tests/test_tacotron2_model.py +++ b/tests/test_tacotron2_model.py @@ -9,6 +9,7 @@ from torch import nn, optim from mozilla_voice_tts.tts.layers.losses import MSELossMasked from mozilla_voice_tts.tts.models.tacotron2 import Tacotron2 from mozilla_voice_tts.utils.io import load_config +from mozilla_voice_tts.utils.audio import AudioProcessor #pylint: disable=unused-variable @@ -18,14 +19,12 @@ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") c = load_config(os.path.join(get_tests_input_path(), 'test_config.json')) +ap = AudioProcessor(**c.audio) +WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") + class TacotronTrainTest(unittest.TestCase): -<<<<<<< HEAD def test_train_step(self): # pylint: disable=no-self-use -======= - @staticmethod - def test_train_step(): ->>>>>>> small gst config change input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 128, (8, )).long().to(device) input_lengths = torch.sort(input_lengths, descending=True)[0] @@ -75,3 +74,113 @@ class TacotronTrainTest(unittest.TestCase): ), "param {} with shape {} not updated!! 
\n{}\n{}".format( count, param.shape, param, param_ref) count += 1 + +class TacotronGSTTrainTest(unittest.TestCase): + @staticmethod + def test_train_step(): + # with random gst mel style + input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) + input_lengths = torch.randint(100, 128, (8, )).long().to(device) + input_lengths = torch.sort(input_lengths, descending=True)[0] + mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device) + mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device) + mel_lengths = torch.randint(20, 30, (8, )).long().to(device) + mel_lengths[0] = 30 + stop_targets = torch.zeros(8, 30, 1).float().to(device) + speaker_ids = torch.randint(0, 5, (8, )).long().to(device) + + for idx in mel_lengths: + stop_targets[:, int(idx.item()):, 0] = 1.0 + + stop_targets = stop_targets.view(input_dummy.shape[0], + stop_targets.size(1) // c.r, -1) + stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() + + criterion = MSELossMasked(seq_len_norm=False).to(device) + criterion_st = nn.BCEWithLogitsLoss().to(device) + model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, gst=True, gst_embedding_dim=c.gst['gst_embedding_dim'], gst_num_heads=c.gst['gst_num_heads'], gst_style_tokens=c.gst['gst_style_tokens']).to(device) + model.train() + model_ref = copy.deepcopy(model) + count = 0 + for param, param_ref in zip(model.parameters(), model_ref.parameters()): + assert (param - param_ref).sum() == 0, param + count += 1 + optimizer = optim.Adam(model.parameters(), lr=c.lr) + for i in range(10): + mel_out, mel_postnet_out, align, stop_tokens = model.forward( + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids) + assert torch.sigmoid(stop_tokens).data.max() <= 1.0 + assert torch.sigmoid(stop_tokens).data.min() >= 0.0 + optimizer.zero_grad() + loss = criterion(mel_out, mel_spec, mel_lengths) + stop_loss = criterion_st(stop_tokens, stop_targets) + loss = loss + criterion(mel_postnet_out, mel_postnet_spec, mel_lengths) + stop_loss + loss.backward() + optimizer.step() + # check parameter changes + count = 0 + for name_param, param_ref in zip(model.named_parameters(), model_ref.parameters()): + # ignore pre-higway layer since it works conditional + # if count not in [145, 59]: + name, param = name_param + if name == 'gst_layer.encoder.recurrence.weight_hh_l0': + #print(param.grad) + continue + assert (param != param_ref).any( + ), "param {} {} with shape {} not updated!! 
\n{}\n{}".format( + name, count, param.shape, param, param_ref) + count += 1 + + # with file gst style + mel_spec = torch.FloatTensor(ap.melspectrogram(ap.load_wav(WAV_FILE)))[:, :30].unsqueeze(0).transpose(1, 2).to(device) + mel_spec = mel_spec.repeat(8, 1, 1) + input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) + input_lengths = torch.randint(100, 128, (8, )).long().to(device) + input_lengths = torch.sort(input_lengths, descending=True)[0] + mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device) + mel_lengths = torch.randint(20, 30, (8, )).long().to(device) + mel_lengths[0] = 30 + stop_targets = torch.zeros(8, 30, 1).float().to(device) + speaker_ids = torch.randint(0, 5, (8, )).long().to(device) + + for idx in mel_lengths: + stop_targets[:, int(idx.item()):, 0] = 1.0 + + stop_targets = stop_targets.view(input_dummy.shape[0], + stop_targets.size(1) // c.r, -1) + stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() + + criterion = MSELossMasked(seq_len_norm=False).to(device) + criterion_st = nn.BCEWithLogitsLoss().to(device) + model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, gst=True, gst_embedding_dim=c.gst['gst_embedding_dim'], gst_num_heads=c.gst['gst_num_heads'], gst_style_tokens=c.gst['gst_style_tokens']).to(device) + model.train() + model_ref = copy.deepcopy(model) + count = 0 + for param, param_ref in zip(model.parameters(), model_ref.parameters()): + assert (param - param_ref).sum() == 0, param + count += 1 + optimizer = optim.Adam(model.parameters(), lr=c.lr) + for i in range(10): + mel_out, mel_postnet_out, align, stop_tokens = model.forward( + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids) + assert torch.sigmoid(stop_tokens).data.max() <= 1.0 + assert torch.sigmoid(stop_tokens).data.min() >= 0.0 + optimizer.zero_grad() + loss = criterion(mel_out, mel_spec, mel_lengths) + stop_loss = criterion_st(stop_tokens, stop_targets) + loss = loss + criterion(mel_postnet_out, mel_postnet_spec, mel_lengths) + stop_loss + loss.backward() + optimizer.step() + # check parameter changes + count = 0 + for name_param, param_ref in zip(model.named_parameters(), model_ref.parameters()): + # ignore pre-higway layer since it works conditional + # if count not in [145, 59]: + name, param = name_param + if name == 'gst_layer.encoder.recurrence.weight_hh_l0': + #print(param.grad) + continue + assert (param != param_ref).any( + ), "param {} {} with shape {} not updated!! 
\n{}\n{}".format( + name, count, param.shape, param, param_ref) + count += 1 \ No newline at end of file diff --git a/tests/test_tacotron_model.py b/tests/test_tacotron_model.py index d15a6705..42880589 100644 --- a/tests/test_tacotron_model.py +++ b/tests/test_tacotron_model.py @@ -9,6 +9,7 @@ from torch import nn, optim from mozilla_voice_tts.tts.layers.losses import L1LossMasked from mozilla_voice_tts.tts.models.tacotron import Tacotron from mozilla_voice_tts.utils.io import load_config +from mozilla_voice_tts.utils.audio import AudioProcessor #pylint: disable=unused-variable @@ -18,6 +19,9 @@ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") c = load_config(os.path.join(get_tests_input_path(), 'test_config.json')) +ap = AudioProcessor(**c.audio) +WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") + def count_parameters(model): r"""Count number of trainable parameters in a network""" @@ -85,10 +89,10 @@ class TacotronTrainTest(unittest.TestCase): count, param.shape, param, param_ref) count += 1 - class TacotronGSTTrainTest(unittest.TestCase): @staticmethod def test_train_step(): + # with random gst mel style input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 129, (8, )).long().to(device) input_lengths[-1] = 128 @@ -113,13 +117,82 @@ class TacotronGSTTrainTest(unittest.TestCase): num_chars=32, num_speakers=5, gst=True, - postnet_output_dim=c.audio['num_freq'], + gst_embedding_dim=c.gst['gst_embedding_dim'], + gst_num_heads=c.gst['gst_num_heads'], + gst_style_tokens=c.gst['gst_style_tokens'], + postnet_output_dim=c.audio['fft_size'], decoder_output_dim=c.audio['num_mels'], r=c.r, memory_size=c.memory_size ).to(device) #FIXME: missing num_speakers parameter to Tacotron ctor model.train() - print(model) + # print(model) + print(" > Num parameters for Tacotron GST model:%s" % + (count_parameters(model))) + model_ref = copy.deepcopy(model) + count = 0 + for param, param_ref in zip(model.parameters(), + model_ref.parameters()): + assert (param - param_ref).sum() == 0, param + count += 1 + optimizer = optim.Adam(model.parameters(), lr=c.lr) + for _ in range(10): + mel_out, linear_out, align, stop_tokens = model.forward( + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids) + optimizer.zero_grad() + loss = criterion(mel_out, mel_spec, mel_lengths) + stop_loss = criterion_st(stop_tokens, stop_targets) + loss = loss + criterion(linear_out, linear_spec, + mel_lengths) + stop_loss + loss.backward() + optimizer.step() + # check parameter changes + count = 0 + for param, param_ref in zip(model.parameters(), + model_ref.parameters()): + # ignore pre-higway layer since it works conditional + assert (param != param_ref).any( + ), "param {} with shape {} not updated!! 
\n{}\n{}".format( + count, param.shape, param, param_ref) + count += 1 + + # with file gst style + mel_spec = torch.FloatTensor(ap.melspectrogram(ap.load_wav(WAV_FILE)))[:, :120].unsqueeze(0).transpose(1, 2).to(device) + mel_spec = mel_spec.repeat(8, 1, 1) + + input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) + input_lengths = torch.randint(100, 129, (8, )).long().to(device) + input_lengths[-1] = 128 + linear_spec = torch.rand(8, mel_spec.size(1), c.audio['fft_size']).to(device) + mel_lengths = torch.randint(20, mel_spec.size(1), (8, )).long().to(device) + mel_lengths[-1] = mel_spec.size(1) + stop_targets = torch.zeros(8, mel_spec.size(1), 1).float().to(device) + speaker_ids = torch.randint(0, 5, (8, )).long().to(device) + + for idx in mel_lengths: + stop_targets[:, int(idx.item()):, 0] = 1.0 + + stop_targets = stop_targets.view(input_dummy.shape[0], + stop_targets.size(1) // c.r, -1) + stop_targets = (stop_targets.sum(2) > + 0.0).unsqueeze(2).float().squeeze() + + criterion = L1LossMasked(seq_len_norm=False).to(device) + criterion_st = nn.BCEWithLogitsLoss().to(device) + model = Tacotron( + num_chars=32, + num_speakers=5, + gst=True, + gst_embedding_dim=c.gst['gst_embedding_dim'], + gst_num_heads=c.gst['gst_num_heads'], + gst_style_tokens=c.gst['gst_style_tokens'], + postnet_output_dim=c.audio['fft_size'], + decoder_output_dim=c.audio['num_mels'], + r=c.r, + memory_size=c.memory_size + ).to(device) #FIXME: missing num_speakers parameter to Tacotron ctor + model.train() + # print(model) print(" > Num parameters for Tacotron GST model:%s" % (count_parameters(model))) model_ref = copy.deepcopy(model) From 6e7f33c798ce5277916efe6f0047d6ac20db48cb Mon Sep 17 00:00:00 2001 From: Edresson Date: Thu, 30 Jul 2020 03:51:20 -0300 Subject: [PATCH 15/56] add support for synthesize using variable size external embedding and add bugfix in scipy.io import --- mozilla_voice_tts/bin/synthesize.py | 39 +++++++++++++++++------- mozilla_voice_tts/bin/train_tts.py | 1 - mozilla_voice_tts/tts/utils/synthesis.py | 24 ++++++++++++--- 3 files changed, 47 insertions(+), 17 deletions(-) diff --git a/mozilla_voice_tts/bin/synthesize.py b/mozilla_voice_tts/bin/synthesize.py index b52db37e..527a3ce9 100644 --- a/mozilla_voice_tts/bin/synthesize.py +++ b/mozilla_voice_tts/bin/synthesize.py @@ -18,9 +18,9 @@ from mozilla_voice_tts.utils.io import load_config from mozilla_voice_tts.vocoder.utils.generic_utils import setup_generator -def tts(model, vocoder_model, text, CONFIG, use_cuda, ap, use_gl, speaker_id): +def tts(model, vocoder_model, text, CONFIG, use_cuda, ap, use_gl, speaker_fileid, speaker_embedding=None): t_1 = time.time() - waveform, _, _, mel_postnet_spec, _, _ = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, CONFIG.gst['gst_style_input'], False, CONFIG.enable_eos_bos_chars, use_gl) + waveform, _, _, mel_postnet_spec, _, _ = synthesis(model, text, CONFIG, use_cuda, ap, speaker_fileid, None, False, CONFIG.enable_eos_bos_chars, use_gl, speaker_embedding=speaker_embedding) if CONFIG.model == "Tacotron" and not use_gl: mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T if not use_gl: @@ -80,9 +80,9 @@ if __name__ == "__main__": help="JSON file for multi-speaker model.", default="") parser.add_argument( - '--speaker_id', - type=int, - help="target speaker_id if the model is multi-speaker.", + '--speaker_fileid', + type=str, + help="if CONFIG.use_external_speaker_embedding_file is true, name of speaker embedding reference file present in speakers.json, else target speaker_fileid if 
the model is multi-speaker.", default=None) args = parser.parse_args() @@ -97,16 +97,24 @@ if __name__ == "__main__": if 'characters' in C.keys(): symbols, phonemes = make_symbols(**C.characters) + speaker_embedding = None + speaker_embedding_dim = None + num_speakers = 0 + # load speakers if args.speakers_json != '': - speakers = json.load(open(args.speakers_json, 'r')) - num_speakers = len(speakers) - else: - num_speakers = 0 + speaker_mapping = json.load(open(args.speakers_json, 'r')) + num_speakers = len(speaker_mapping) + if C.use_external_speaker_embedding_file: + if args.speaker_fileid is not None: + speaker_embedding = speaker_mapping[args.speaker_fileid]['embedding'] + else: # if speaker_fileid is not specificated use the first sample in speakers.json + speaker_embedding = speaker_mapping[list(speaker_mapping.keys())[0]]['embedding'] + speaker_embedding_dim = len(speaker_embedding) # load the model num_chars = len(phonemes) if C.use_phonemes else len(symbols) - model = setup_model(num_chars, num_speakers, C) + model = setup_model(num_chars, num_speakers, C, speaker_embedding_dim) cp = torch.load(args.model_path, map_location=torch.device('cpu')) model.load_state_dict(cp['model']) model.eval() @@ -130,7 +138,16 @@ if __name__ == "__main__": # synthesize voice use_griffin_lim = args.vocoder_path == "" print(" > Text: {}".format(args.text)) - wav = tts(model, vocoder_model, args.text, C, args.use_cuda, ap, use_griffin_lim, args.speaker_id) + + if not C.use_external_speaker_embedding_file: + if args.speaker_fileid.isdigit(): + args.speaker_fileid = int(args.speaker_fileid) + else: + args.speaker_fileid = None + else: + args.speaker_fileid = None + + wav = tts(model, vocoder_model, args.text, C, args.use_cuda, ap, use_griffin_lim, args.speaker_fileid, speaker_embedding=speaker_embedding) # save the results file_name = args.text.replace(" ", "_") diff --git a/mozilla_voice_tts/bin/train_tts.py b/mozilla_voice_tts/bin/train_tts.py index daa517b9..1b9bc032 100644 --- a/mozilla_voice_tts/bin/train_tts.py +++ b/mozilla_voice_tts/bin/train_tts.py @@ -523,7 +523,6 @@ def main(args): # pylint: disable=redefined-outer-name "a previously trained model." 
elif c.use_external_speaker_embedding_file and c.external_speaker_embedding_file: # if start new train using External Embedding file speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file) - print(speaker_mapping) speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]['embedding']) elif c.use_external_speaker_embedding_file and not c.external_speaker_embedding_file: # if start new train using External Embedding file and don't pass external embedding file raise "use_external_speaker_embedding_file is True, so you need pass a external speaker embedding file, run GE2E-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb or AngularPrototypical-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb notebook in notebooks/ folder" diff --git a/mozilla_voice_tts/tts/utils/synthesis.py b/mozilla_voice_tts/tts/utils/synthesis.py index 6fed8f89..2f746533 100644 --- a/mozilla_voice_tts/tts/utils/synthesis.py +++ b/mozilla_voice_tts/tts/utils/synthesis.py @@ -45,17 +45,17 @@ def compute_style_mel(style_wav, ap, cuda=False): return style_mel -def run_model_torch(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None): +def run_model_torch(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None, speaker_embeddings=None): if CONFIG.use_gst: decoder_output, postnet_output, alignments, stop_tokens = model.inference( - inputs, style_mel=style_mel, speaker_ids=speaker_id) + inputs, style_mel=style_mel, speaker_ids=speaker_id, speaker_embeddings=speaker_embeddings) else: if truncated: decoder_output, postnet_output, alignments, stop_tokens = model.inference_truncated( - inputs, speaker_ids=speaker_id) + inputs, speaker_ids=speaker_id, speaker_embeddings=speaker_embeddings) else: decoder_output, postnet_output, alignments, stop_tokens = model.inference( - inputs, speaker_ids=speaker_id) + inputs, speaker_ids=speaker_id, speaker_embeddings=speaker_embeddings) return decoder_output, postnet_output, alignments, stop_tokens @@ -140,6 +140,15 @@ def id_to_torch(speaker_id, cuda=False): return speaker_id +def embedding_to_torch(speaker_embedding, cuda=False): + if speaker_embedding is not None: + speaker_embedding = np.asarray(speaker_embedding) + speaker_embedding = torch.from_numpy(speaker_embedding).unsqueeze(0).type(torch.FloatTensor) + if cuda: + return speaker_embedding.cuda() + return speaker_embedding + + # TODO: perform GL with pytorch for batching def apply_griffin_lim(inputs, input_lens, CONFIG, ap): '''Apply griffin-lim to each sample iterating throught the first dimension. @@ -169,6 +178,7 @@ def synthesis(model, enable_eos_bos_chars=False, #pylint: disable=unused-argument use_griffin_lim=False, do_trim_silence=False, + speaker_embedding=None, backend='torch'): """Synthesize voice for the given text. 
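To make the new speaker_embedding path concrete, a minimal sketch of calling synthesis() with a per-sample embedding taken from an external speakers.json; the text, file name and variable values are illustrative, the positional arguments mirror the tts() call in bin/synthesize.py above, and the raw embedding list is converted to a tensor internally by embedding_to_torch() defined above:

import json

speaker_mapping = json.load(open('speakers-vctk-en.json'))
# pick the embedding of the first reference sample in the file
speaker_embedding = speaker_mapping[list(speaker_mapping.keys())[0]]['embedding']

waveform, _, _, mel_postnet_spec, _, _ = synthesis(
    model, "Hello world.", C, use_cuda, ap,
    None,                     # speaker_id is left unset with external embeddings
    None,                     # no GST style input
    False,                    # truncated inference off
    C.enable_eos_bos_chars,
    True,                     # Griffin-Lim output instead of a neural vocoder
    speaker_embedding=speaker_embedding)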
@@ -200,6 +210,10 @@ def synthesis(model, if backend == 'torch': if speaker_id is not None: speaker_id = id_to_torch(speaker_id, cuda=use_cuda) + + if speaker_embedding is not None: + speaker_embedding = embedding_to_torch(speaker_embedding, cuda=use_cuda) + if not isinstance(style_mel, dict): style_mel = numpy_to_torch(style_mel, torch.float, cuda=use_cuda) inputs = numpy_to_torch(inputs, torch.long, cuda=use_cuda) @@ -216,7 +230,7 @@ def synthesis(model, # synthesize voice if backend == 'torch': decoder_output, postnet_output, alignments, stop_tokens = run_model_torch( - model, inputs, CONFIG, truncated, speaker_id, style_mel) + model, inputs, CONFIG, truncated, speaker_id, style_mel, speaker_embeddings=speaker_embedding) postnet_output, decoder_output, alignment, stop_tokens = parse_outputs_torch( postnet_output, decoder_output, alignments, stop_tokens) elif backend == 'tf': From 5c752799aea6b04c5c98a773a44bb971b95fccb5 Mon Sep 17 00:00:00 2001 From: erogol Date: Thu, 30 Jul 2020 11:48:12 +0200 Subject: [PATCH 16/56] linter update --- mozilla_voice_tts/tts/models/tacotron.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mozilla_voice_tts/tts/models/tacotron.py b/mozilla_voice_tts/tts/models/tacotron.py index f6bd07ed..682d2b59 100644 --- a/mozilla_voice_tts/tts/models/tacotron.py +++ b/mozilla_voice_tts/tts/models/tacotron.py @@ -113,7 +113,7 @@ class Tacotron(TacotronAbstract): encoder_outputs = self.encoder(inputs) # sequence masking encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs) - + # global style token if self.gst: # B x gst_dim From fe081d4f7c5078aef1df677680f10e79f18d6dfc Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 5 Aug 2020 18:33:22 +0200 Subject: [PATCH 17/56] fixing rebase issues --- mozilla_voice_tts/tts/layers/tacotron.py | 2 +- mozilla_voice_tts/tts/layers/tacotron2.py | 2 -- tests/inputs/test_config.json | 23 ++++++----------------- tests/outputs/dummy_model_config.json | 15 ++++++++++++++- tests/test_tacotron_model.py | 6 +++--- 5 files changed, 24 insertions(+), 24 deletions(-) diff --git a/mozilla_voice_tts/tts/layers/tacotron.py b/mozilla_voice_tts/tts/layers/tacotron.py index 9dcebd0f..807282b3 100644 --- a/mozilla_voice_tts/tts/layers/tacotron.py +++ b/mozilla_voice_tts/tts/layers/tacotron.py @@ -303,7 +303,7 @@ class Decoder(nn.Module): self.separate_stopnet = separate_stopnet self.query_dim = 256 # memory -> |Prenet| -> processed_memory - prenet_dim = memory_dim * self.memory_size if self.use_memory_queue else memory_dim + prenet_dim = frame_channels * self.memory_size if self.use_memory_queue else frame_channels self.prenet = Prenet( prenet_dim, prenet_type, diff --git a/mozilla_voice_tts/tts/layers/tacotron2.py b/mozilla_voice_tts/tts/layers/tacotron2.py index 5d6ced25..490f3728 100644 --- a/mozilla_voice_tts/tts/layers/tacotron2.py +++ b/mozilla_voice_tts/tts/layers/tacotron2.py @@ -141,7 +141,6 @@ class Decoder(nn.Module): location_attn (bool): if true, use location sensitive attention. attn_K (int): number of attention heads for GravesAttention. separate_stopnet (bool): if true, detach stopnet input to prevent gradient flow. - speaker_embedding_dim (int): size of speaker embedding vector, for multi-speaker training. 
""" # Pylint gets confused by PyTorch conventions here #pylint: disable=attribute-defined-outside-init @@ -156,7 +155,6 @@ class Decoder(nn.Module): self.separate_stopnet = separate_stopnet self.max_decoder_steps = 1000 self.stop_threshold = 0.5 - self.speaker_embedding_dim = speaker_embedding_dim # model dimensions self.query_dim = 1024 diff --git a/tests/inputs/test_config.json b/tests/inputs/test_config.json index b1e857c0..b2bba154 100644 --- a/tests/inputs/test_config.json +++ b/tests/inputs/test_config.json @@ -53,27 +53,16 @@ "max_seq_len": 300, "log_dir": "tests/outputs/", -<<<<<<< HEAD - "use_speaker_embedding": false, - "use_gst": false, - "gst": { - "gst_style_input": null, - "gst_embedding_dim": 512, - "gst_num_heads": 4, - "gst_style_tokens": 10 - } -======= // MULTI-SPEAKER and GST "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. "use_gst": true, // use global style tokens "gst": { // gst parameter if gst is enabled - "gst_style_input": null, // Condition the style input either on a - // -> wave file [path to wave] or - // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15} + "gst_style_input": null, // Condition the style input either on a + // -> wave file [path to wave] or + // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15} // with the dictionary being len(dict) <= len(gst_style_tokens). - "gst_embedding_dim": 512, + "gst_embedding_dim": 512, "gst_num_heads": 4, "gst_style_tokens": 10 - } ->>>>>>> travis unit tests fix and add Tacotron and Tacotron 2 GST and MultiSpeaker Tests - } + } +} diff --git a/tests/outputs/dummy_model_config.json b/tests/outputs/dummy_model_config.json index d2e2fca0..bf46be1c 100644 --- a/tests/outputs/dummy_model_config.json +++ b/tests/outputs/dummy_model_config.json @@ -83,6 +83,19 @@ "use_phonemes": false, // use phonemes instead of raw characters. It is suggested for better pronounciation. "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages "text_cleaner": "phoneme_cleaners", - "use_speaker_embedding": false // whether to use additional embeddings for separate speakers + "use_speaker_embedding": false, // whether to use additional embeddings for separate speakers + + // MULTI-SPEAKER and GST + "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. + "use_gst": true, // use global style tokens + "gst": { // gst parameter if gst is enabled + "gst_style_input": null, // Condition the style input either on a + // -> wave file [path to wave] or + // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15} + // with the dictionary being len(dict) <= len(gst_style_tokens). 
+ "gst_embedding_dim": 512, + "gst_num_heads": 4, + "gst_style_tokens": 10 + } } diff --git a/tests/test_tacotron_model.py b/tests/test_tacotron_model.py index 42880589..2b55cbac 100644 --- a/tests/test_tacotron_model.py +++ b/tests/test_tacotron_model.py @@ -35,7 +35,7 @@ class TacotronTrainTest(unittest.TestCase): input_lengths = torch.randint(100, 129, (8, )).long().to(device) input_lengths[-1] = 128 mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device) - linear_spec = torch.rand(8, 30, c.audio['num_freq']).to(device) + linear_spec = torch.rand(8, 30, c.audio['fft_size']).to(device) mel_lengths = torch.randint(20, 30, (8, )).long().to(device) stop_targets = torch.zeros(8, 30, 1).float().to(device) speaker_ids = torch.randint(0, 5, (8, )).long().to(device) @@ -53,7 +53,7 @@ class TacotronTrainTest(unittest.TestCase): model = Tacotron( num_chars=32, num_speakers=5, - postnet_output_dim=c.audio['num_freq'], + postnet_output_dim=c.audio['fft_size'], decoder_output_dim=c.audio['num_mels'], r=c.r, memory_size=c.memory_size @@ -97,7 +97,7 @@ class TacotronGSTTrainTest(unittest.TestCase): input_lengths = torch.randint(100, 129, (8, )).long().to(device) input_lengths[-1] = 128 mel_spec = torch.rand(8, 120, c.audio['num_mels']).to(device) - linear_spec = torch.rand(8, 120, c.audio['num_freq']).to(device) + linear_spec = torch.rand(8, 120, c.audio['fft_size']).to(device) mel_lengths = torch.randint(20, 120, (8, )).long().to(device) mel_lengths[-1] = 120 stop_targets = torch.zeros(8, 120, 1).float().to(device) From 84b7ab6ee67a214fdac26458806886c319d430ed Mon Sep 17 00:00:00 2001 From: SanjaESC Date: Fri, 10 Jul 2020 12:14:55 +0200 Subject: [PATCH 18/56] Added support for Tacotron2 GST + abbility to condition style input with wav or tokens --- mozilla_voice_tts/tts/models/tacotron2.py | 100 +++-- .../tts/models/tacotron_abstract.py | 13 +- mozilla_voice_tts/tts/utils/synthesis.py | 3 + synthesize.py | 182 +++++++++ tests/inputs/test_train_config.json | 159 ++++++++ utils/generic_utils.py | 374 ++++++++++++++++++ 6 files changed, 776 insertions(+), 55 deletions(-) create mode 100644 synthesize.py create mode 100644 utils/generic_utils.py diff --git a/mozilla_voice_tts/tts/models/tacotron2.py b/mozilla_voice_tts/tts/models/tacotron2.py index 45e743e5..7effdb3d 100644 --- a/mozilla_voice_tts/tts/models/tacotron2.py +++ b/mozilla_voice_tts/tts/models/tacotron2.py @@ -27,7 +27,6 @@ class Tacotron2(TacotronAbstract): bidirectional_decoder=False, double_decoder_consistency=False, ddc_r=None, - speaker_embedding_dim=None, gst=False, gst_embedding_dim=512, gst_num_heads=4, @@ -42,33 +41,18 @@ class Tacotron2(TacotronAbstract): ddc_r, gst) # init layer dims - decoder_in_features = 512 - encoder_in_features = 512 - - if speaker_embedding_dim is None: - # if speaker_embedding_dim is None we need use the nn.Embedding, with default speaker_embedding_dim - self.embeddings_per_sample = False - speaker_embedding_dim = 512 - else: - # if speaker_embedding_dim is not None we need use speaker embedding per sample - self.embeddings_per_sample = True - - # speaker and gst embeddings is concat in decoder input - if num_speakers > 1: - decoder_in_features = decoder_in_features + speaker_embedding_dim # add speaker embedding dim - if self.gst: - decoder_in_features = decoder_in_features + gst_embedding_dim # add gst embedding dim - - # embedding layer + speaker_embedding_dim = 512 if num_speakers > 1 else 0 + gst_embedding_dim = gst_embedding_dim if self.gst else 0 + decoder_in_features = 
512+speaker_embedding_dim+gst_embedding_dim + encoder_in_features = 512 if num_speakers > 1 else 512 + proj_speaker_dim = 80 if num_speakers > 1 else 0 + # base layers self.embedding = nn.Embedding(num_chars, 512, padding_idx=0) # speaker embedding layer if num_speakers > 1: - if not self.embeddings_per_sample: - self.speaker_embedding = nn.Embedding(num_speakers, speaker_embedding_dim) - self.speaker_embedding.weight.data.normal_(0, 0.3) - - # base model layers + self.speaker_embedding = nn.Embedding(num_speakers, speaker_embedding_dim) + self.speaker_embedding.weight.data.normal_(0, 0.3) self.encoder = Encoder(encoder_in_features) self.decoder = Decoder(decoder_in_features, self.decoder_output_dim, r, attn_type, attn_win, attn_norm, prenet_type, prenet_dropout, @@ -99,7 +83,7 @@ class Tacotron2(TacotronAbstract): mel_outputs_postnet = mel_outputs_postnet.transpose(1, 2) return mel_outputs, mel_outputs_postnet, alignments - def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, speaker_ids=None, speaker_embeddings=None): + def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, speaker_ids=None): # compute mask for padding # B x T_in_max (boolean) input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths) @@ -108,18 +92,20 @@ class Tacotron2(TacotronAbstract): # B x T_in_max x D_en encoder_outputs = self.encoder(embedded_inputs, text_lengths) - if self.gst: - # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, mel_specs) - if self.num_speakers > 1: - if not self.embeddings_per_sample: - # B x 1 x speaker_embed_dim - speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None] + embedded_speakers = self.speaker_embedding(speaker_ids)[:, None] + embedded_speakers = embedded_speakers.repeat(1, encoder_outputs.size(1), 1) + if hasattr(self, 'gst'): + # B x gst_dim + encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, mel_specs) + encoder_outputs = torch.cat([encoder_outputs, embedded_gst, embedded_speakers], dim=-1) else: - # B x 1 x speaker_embed_dim - speaker_embeddings = torch.unsqueeze(speaker_embeddings, 1) - encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) + encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1) + else: + if hasattr(self, 'gst'): + # B x gst_dim + encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, mel_specs) + encoder_outputs = torch.cat([encoder_outputs, embedded_gst], dim=-1) encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs) @@ -147,18 +133,24 @@ class Tacotron2(TacotronAbstract): return decoder_outputs, postnet_outputs, alignments, stop_tokens @torch.no_grad() - def inference(self, text, speaker_ids=None, style_mel=None, speaker_embeddings=None): + def inference(self, text, speaker_ids=None, style_mel=None): embedded_inputs = self.embedding(text).transpose(1, 2) encoder_outputs = self.encoder.inference(embedded_inputs) - if self.gst: - # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, style_mel) - if self.num_speakers > 1: - if not self.embeddings_per_sample: - speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None] - encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) + embedded_speakers = self.speaker_embedding(speaker_ids)[:, None] + embedded_speakers = embedded_speakers.repeat(1, encoder_outputs.size(1), 1) + if hasattr(self, 'gst'): + # B x gst_dim + encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, 
style_mel) + encoder_outputs = torch.cat([encoder_outputs, embedded_gst, embedded_speakers], dim=-1) + else: + encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1) + else: + if hasattr(self, 'gst'): + # B x gst_dim + encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, style_mel) + encoder_outputs = torch.cat([encoder_outputs, embedded_gst], dim=-1) decoder_outputs, alignments, stop_tokens = self.decoder.inference( encoder_outputs) @@ -168,21 +160,27 @@ class Tacotron2(TacotronAbstract): decoder_outputs, postnet_outputs, alignments) return decoder_outputs, postnet_outputs, alignments, stop_tokens - def inference_truncated(self, text, speaker_ids=None, style_mel=None, speaker_embeddings=None): + def inference_truncated(self, text, speaker_ids=None, style_mel=None): """ Preserve model states for continuous inference """ embedded_inputs = self.embedding(text).transpose(1, 2) encoder_outputs = self.encoder.inference_truncated(embedded_inputs) - if self.gst: - # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, style_mel) - if self.num_speakers > 1: - if not self.embeddings_per_sample: - speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None] - encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) + embedded_speakers = self.speaker_embedding(speaker_ids)[:, None] + embedded_speakers = embedded_speakers.repeat(1, encoder_outputs.size(1), 1) + if hasattr(self, 'gst'): + # B x gst_dim + encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, style_mel) + encoder_outputs = torch.cat([encoder_outputs, embedded_gst, embedded_speakers], dim=-1) + else: + encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1) + else: + if hasattr(self, 'gst'): + # B x gst_dim + encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, style_mel) + encoder_outputs = torch.cat([encoder_outputs, embedded_gst], dim=-1) mel_outputs, alignments, stop_tokens = self.decoder.inference_truncated( encoder_outputs) diff --git a/mozilla_voice_tts/tts/models/tacotron_abstract.py b/mozilla_voice_tts/tts/models/tacotron_abstract.py index 6f3d32ad..bc794d49 100644 --- a/mozilla_voice_tts/tts/models/tacotron_abstract.py +++ b/mozilla_voice_tts/tts/models/tacotron_abstract.py @@ -165,7 +165,6 @@ class TacotronAbstract(ABC, nn.Module): self.speaker_embeddings).squeeze(1) def compute_gst(self, inputs, style_input): - """ Compute global style token """ device = inputs.device if isinstance(style_input, dict): query = torch.zeros(1, 1, self.gst_embedding_dim//2).to(device) @@ -176,11 +175,17 @@ class TacotronAbstract(ABC, nn.Module): gst_outputs_att = self.gst_layer.style_token_layer.attention(query, key) gst_outputs = gst_outputs + gst_outputs_att * v_amplifier elif style_input is None: + query = torch.zeros(1, 1, self.gst_embedding_dim//2).to(device) + _GST = torch.tanh(self.gst_layer.style_token_layer.style_tokens) gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device) + for k_token in range(self.gst_style_tokens): + key = _GST[int(k_token)].unsqueeze(0).expand(1, -1, -1) + gst_outputs_att = self.gst_layer.style_token_layer.attention(query, key) + gst_outputs = gst_outputs + gst_outputs_att * 0 else: - gst_outputs = self.gst_layer(style_input) # pylint: disable=not-callable - inputs = self._concat_speaker_embedding(inputs, gst_outputs) - return inputs + gst_outputs = self.gst_layer(style_input) + embedded_gst = gst_outputs.repeat(1, inputs.size(1), 1) + return inputs, embedded_gst @staticmethod def 
_add_speaker_embedding(outputs, speaker_embeddings): diff --git a/mozilla_voice_tts/tts/utils/synthesis.py b/mozilla_voice_tts/tts/utils/synthesis.py index 2f746533..52b33e86 100644 --- a/mozilla_voice_tts/tts/utils/synthesis.py +++ b/mozilla_voice_tts/tts/utils/synthesis.py @@ -210,10 +210,13 @@ def synthesis(model, if backend == 'torch': if speaker_id is not None: speaker_id = id_to_torch(speaker_id, cuda=use_cuda) +<<<<<<< HEAD:mozilla_voice_tts/tts/utils/synthesis.py if speaker_embedding is not None: speaker_embedding = embedding_to_torch(speaker_embedding, cuda=use_cuda) +======= +>>>>>>> Added support for Tacotron2 GST + abbility to condition style input with wav or tokens:utils/synthesis.py if not isinstance(style_mel, dict): style_mel = numpy_to_torch(style_mel, torch.float, cuda=use_cuda) inputs = numpy_to_torch(inputs, torch.long, cuda=use_cuda) diff --git a/synthesize.py b/synthesize.py new file mode 100644 index 00000000..bd720123 --- /dev/null +++ b/synthesize.py @@ -0,0 +1,182 @@ +# pylint: disable=redefined-outer-name, unused-argument +import os +import time +import argparse +import torch +import json +import string + +from TTS.utils.synthesis import synthesis +from TTS.utils.generic_utils import setup_model +from TTS.utils.io import load_config +from TTS.utils.text.symbols import make_symbols, symbols, phonemes +from TTS.utils.audio import AudioProcessor + + +def tts(model, + vocoder_model, + C, + VC, + text, + ap, + ap_vocoder, + use_cuda, + batched_vocoder, + speaker_id=None, + figures=False): + t_1 = time.time() + use_vocoder_model = vocoder_model is not None + waveform, alignment, _, postnet_output, stop_tokens, _ = synthesis( + model, text, C, use_cuda, ap, speaker_id, style_wav=C.gst['gst_style_input'], + truncated=False, enable_eos_bos_chars=C.enable_eos_bos_chars, + use_griffin_lim=(not use_vocoder_model), do_trim_silence=True) + + if C.model == "Tacotron" and use_vocoder_model: + postnet_output = ap.out_linear_to_mel(postnet_output.T).T + # correct if there is a scale difference b/w two models + if use_vocoder_model: + postnet_output = ap._denormalize(postnet_output) + postnet_output = ap_vocoder._normalize(postnet_output) + vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0) + waveform = vocoder_model.generate( + vocoder_input.cuda() if use_cuda else vocoder_input, + batched=batched_vocoder, + target=8000, + overlap=400) + print(" > Run-time: {}".format(time.time() - t_1)) + return alignment, postnet_output, stop_tokens, waveform + + +if __name__ == "__main__": + + global symbols, phonemes + + parser = argparse.ArgumentParser() + parser.add_argument('text', type=str, help='Text to generate speech.') + parser.add_argument('config_path', + type=str, + help='Path to model config file.') + parser.add_argument( + 'model_path', + type=str, + help='Path to model file.', + ) + parser.add_argument( + 'out_path', + type=str, + help='Path to save final wav file. Wav file will be names as the text given.', + ) + parser.add_argument('--use_cuda', + type=bool, + help='Run model on CUDA.', + default=False) + parser.add_argument( + '--vocoder_path', + type=str, + help= + 'Path to vocoder model file. If it is not defined, model uses GL as vocoder. 
Please make sure that you installed vocoder library before (WaveRNN).', + default="", + ) + parser.add_argument('--vocoder_config_path', + type=str, + help='Path to vocoder model config file.', + default="") + parser.add_argument( + '--batched_vocoder', + type=bool, + help="If True, vocoder model uses faster batch processing.", + default=True) + parser.add_argument('--speakers_json', + type=str, + help="JSON file for multi-speaker model.", + default="") + parser.add_argument( + '--speaker_id', + type=int, + help="target speaker_id if the model is multi-speaker.", + default=None) + args = parser.parse_args() + + if args.vocoder_path != "": + assert args.use_cuda, " [!] Enable cuda for vocoder." + from WaveRNN.models.wavernn import Model as VocoderModel + + # load the config + C = load_config(args.config_path) + C.forward_attn_mask = True + + # load the audio processor + ap = AudioProcessor(**C.audio) + + # if the vocabulary was passed, replace the default + if 'characters' in C.keys(): + symbols, phonemes = make_symbols(**C.characters) + + # load speakers + if args.speakers_json != '': + speakers = json.load(open(args.speakers_json, 'r')) + num_speakers = len(speakers) + else: + num_speakers = 0 + + # load the model + num_chars = len(phonemes) if C.use_phonemes else len(symbols) + model = setup_model(num_chars, num_speakers, C) + cp = torch.load(args.model_path) + model.load_state_dict(cp['model']) + model.eval() + if args.use_cuda: + model.cuda() + model.decoder.set_r(cp['r']) + + # load vocoder model + if args.vocoder_path != "": + VC = load_config(args.vocoder_config_path) + ap_vocoder = AudioProcessor(**VC.audio) + bits = 10 + vocoder_model = VocoderModel(rnn_dims=512, + fc_dims=512, + mode=VC.mode, + mulaw=VC.mulaw, + pad=VC.pad, + upsample_factors=VC.upsample_factors, + feat_dims=VC.audio["num_mels"], + compute_dims=128, + res_out_dims=128, + res_blocks=10, + hop_length=ap.hop_length, + sample_rate=ap.sample_rate, + use_aux_net=True, + use_upsample_net=True) + + check = torch.load(args.vocoder_path) + vocoder_model.load_state_dict(check['model']) + vocoder_model.eval() + if args.use_cuda: + vocoder_model.cuda() + else: + vocoder_model = None + VC = None + ap_vocoder = None + + # synthesize voice + print(" > Text: {}".format(args.text)) + _, _, _, wav = tts(model, + vocoder_model, + C, + VC, + args.text, + ap, + ap_vocoder, + args.use_cuda, + args.batched_vocoder, + speaker_id=args.speaker_id, + figures=False) + + # save the results + file_name = args.text.replace(" ", "_") + file_name = file_name.translate( + str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav' + out_path = os.path.join(args.out_path, file_name) + print(" > Saving output to {}".format(out_path)) + ap.save_wav(wav, out_path) diff --git a/tests/inputs/test_train_config.json b/tests/inputs/test_train_config.json index 951fe4a3..bea4cbb7 100644 --- a/tests/inputs/test_train_config.json +++ b/tests/inputs/test_train_config.json @@ -1,3 +1,4 @@ +<<<<<<< HEAD:tests/inputs/test_train_config.json { "model": "Tacotron2", "run_name": "test_sample_dataset_run", @@ -150,3 +151,161 @@ } +======= +{ + "model": "Tacotron2", + "run_name": "ljspeech-ddc-bn", + "run_description": "tacotron2 with ddc and batch-normalization", + + // AUDIO PARAMETERS + "audio":{ + // stft parameters + "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. + "win_length": 1024, // stft window length in ms. + "hop_length": 256, // stft window hop-lengh in ms. 
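+        // note: 'win_length' and 'hop_length' above are given in samples, not ms; the '*_ms' parameters below are the millisecond alternatives.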
+ "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. + "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. + + // Audio processing parameters + "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. + "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. + "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. + + // Silence trimming + "do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (true), TWEB (false), Nancy (true) + "trim_db": 60, // threshold for timming silence. Set this according to your dataset. + + // Griffin-Lim + "power": 1.5, // value to sharpen wav signals after GL algorithm. + "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. + + // MelSpectrogram parameters + "num_mels": 80, // size of the mel spec frame. + "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! + "spec_gain": 20, + + // Normalization parameters + "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. + "min_level_db": -100, // lower bound for normalization + "symmetric_norm": true, // move normalization to range [-1, 1] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored + }, + + // VOCABULARY PARAMETERS + // if custom character set is not defined, + // default set in symbols.py is used + // "characters":{ + // "pad": "_", + // "eos": "~", + // "bos": "^", + // "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ", + // "punctuations":"!'(),-.:;? ", + // "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ" + // }, + + // DISTRIBUTED TRAINING + "distributed":{ + "backend": "nccl", + "url": "tcp:\/\/localhost:54321" + }, + + "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. + + // TRAINING + "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + "eval_batch_size":16, + "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. + "gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed. + "loss_masking": true, // enable / disable loss masking against the sequence padding. + "ga_alpha": 10.0, // weight for guided attention loss. If > 0, guided attention is enabled. + + // VALIDATION + "run_eval": true, + "test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time. 
+ "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences. + + // OPTIMIZER + "noam_schedule": false, // use noam warmup and lr schedule. + "grad_clip": 1.0, // upper limit for gradients for clipping. + "epochs": 1000, // total number of epochs to train. + "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. + "wd": 0.000001, // Weight decay weight. + "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" + "seq_len_norm": false, // Normalize eash sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths. + + // TACOTRON PRENET + "memory_size": -1, // ONLY TACOTRON - size of the memory queue used fro storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame. + "prenet_type": "bn", // "original" or "bn". + "prenet_dropout": false, // enable/disable dropout at prenet. + + // TACOTRON ATTENTION + "attention_type": "original", // 'original' or 'graves' + "attention_heads": 4, // number of attention heads (only for 'graves') + "attention_norm": "sigmoid", // softmax or sigmoid. + "windowing": false, // Enables attention windowing. Used only in eval mode. + "use_forward_attn": false, // if it uses forward attention. In general, it aligns faster. + "forward_attn_mask": false, // Additional masking forcing monotonicity only in eval mode. + "transition_agent": false, // enable/disable transition agent of forward attention. + "location_attn": true, // enable_disable location sensitive attention. It is enabled for TACOTRON by default. + "bidirectional_decoder": false, // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset. + "double_decoder_consistency": true, // use DDC explained here https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency-draft/ + "ddc_r": 7, // reduction rate for coarse decoder. + + // STOPNET + "stopnet": true, // Train stopnet predicting the end of synthesis. + "separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER. + + // TENSORBOARD and LOGGING + "print_step": 25, // Number of steps to log training on console. + "tb_plot_step:": 100, // Number of steps to plot TB training figures. + "print_eval": false, // If True, it prints intermediate loss values in evalulation. + "save_step": 10000, // Number of training steps expected to save traninpg stats and checkpoints. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + + // DATA LOADING + "text_cleaner": "phoneme_cleaners", + "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. + "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_val_loader_workers": 4, // number of evaluation data loader processes. + "batch_group_size": 0, //Number of batches to shuffle after bucketing. 
+ "min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training + "max_seq_len": 153, // DATASET-RELATED: maximum text length + + // PATHS + "output_path": "/home/erogol/Models/LJSpeech/", + + // PHONEMES + "phoneme_cache_path": "/media/erogol/data_ssd2/mozilla_us_phonemes_3", // phoneme computation is slow, therefore, it caches results in the given folder. + "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation. + "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages + + // MULTI-SPEAKER and GST + "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. + "use_gst": true, // use global style tokens + "gst": { // gst parameter if gst is enabled + "gst_style_input": null, // Condition the style input either on a + // -> wave file [path to wave] or + // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15} + // with the dictionary being len(dict) == len(gst_style_tokens). + "gst_embedding_dim": 512, + "gst_num_heads": 4, + "gst_style_tokens": 10 + }, + + // DATASETS + "datasets": // List of datasets. They all merged and they get different speaker_ids. + [ + { + "name": "ljspeech", + "path": "/home/erogol/Data/LJSpeech-1.1/", + "meta_file_train": "metadata.csv", + "meta_file_val": null + } + ] +} + +>>>>>>> Added support for Tacotron2 GST + abbility to condition style input with wav or tokens:config.json diff --git a/utils/generic_utils.py b/utils/generic_utils.py new file mode 100644 index 00000000..8b4b1f12 --- /dev/null +++ b/utils/generic_utils.py @@ -0,0 +1,374 @@ +import os +import glob +import torch +import shutil +import datetime +import subprocess +import importlib +import numpy as np +from collections import Counter + + +def get_git_branch(): + try: + out = subprocess.check_output(["git", "branch"]).decode("utf8") + current = next(line for line in out.split("\n") + if line.startswith("*")) + current.replace("* ", "") + except subprocess.CalledProcessError: + current = "inside_docker" + return current + + +def get_commit_hash(): + """https://stackoverflow.com/questions/14989858/get-the-current-git-hash-in-a-python-script""" + # try: + # subprocess.check_output(['git', 'diff-index', '--quiet', + # 'HEAD']) # Verify client is clean + # except: + # raise RuntimeError( + # " !! 
Commit before training to get the commit hash.") + try: + commit = subprocess.check_output( + ['git', 'rev-parse', '--short', 'HEAD']).decode().strip() + # Not copying .git folder into docker container + except subprocess.CalledProcessError: + commit = "0000000" + print(' > Git Hash: {}'.format(commit)) + return commit + + +def create_experiment_folder(root_path, model_name, debug): + """ Create a folder with the current date and time """ + date_str = datetime.datetime.now().strftime("%B-%d-%Y_%I+%M%p") + if debug: + commit_hash = 'debug' + else: + commit_hash = get_commit_hash() + output_folder = os.path.join( + root_path, model_name + '-' + date_str + '-' + commit_hash) + os.makedirs(output_folder, exist_ok=True) + print(" > Experiment folder: {}".format(output_folder)) + return output_folder + + +def remove_experiment_folder(experiment_path): + """Check folder if there is a checkpoint, otherwise remove the folder""" + + checkpoint_files = glob.glob(experiment_path + "/*.pth.tar") + if not checkpoint_files: + if os.path.exists(experiment_path): + shutil.rmtree(experiment_path, ignore_errors=True) + print(" ! Run is removed from {}".format(experiment_path)) + else: + print(" ! Run is kept in {}".format(experiment_path)) + + +def count_parameters(model): + r"""Count number of trainable parameters in a network""" + return sum(p.numel() for p in model.parameters() if p.requires_grad) + + +def split_dataset(items): + is_multi_speaker = False + speakers = [item[-1] for item in items] + is_multi_speaker = len(set(speakers)) > 1 + eval_split_size = 500 if len(items) * 0.01 > 500 else int( + len(items) * 0.01) + assert eval_split_size > 0, " [!] You do not have enough samples to train. You need at least 100 samples." + np.random.seed(0) + np.random.shuffle(items) + if is_multi_speaker: + items_eval = [] + # most stupid code ever -- Fix it ! + while len(items_eval) < eval_split_size: + speakers = [item[-1] for item in items] + speaker_counter = Counter(speakers) + item_idx = np.random.randint(0, len(items)) + if speaker_counter[items[item_idx][-1]] > 1: + items_eval.append(items[item_idx]) + del items[item_idx] + return items_eval, items + return items[:eval_split_size], items[eval_split_size:] + + +# from https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1 +def sequence_mask(sequence_length, max_len=None): + if max_len is None: + max_len = sequence_length.data.max() + batch_size = sequence_length.size(0) + seq_range = torch.arange(0, max_len).long() + seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len) + if sequence_length.is_cuda: + seq_range_expand = seq_range_expand.to(sequence_length.device) + seq_length_expand = ( + sequence_length.unsqueeze(1).expand_as(seq_range_expand)) + # B x T_max + return seq_range_expand < seq_length_expand + + +def set_init_dict(model_dict, checkpoint_state, c): + # Partial initialization: if there is a mismatch with new and old layer, it is skipped. + for k, v in checkpoint_state.items(): + if k not in model_dict: + print(" | > Layer missing in the model definition: {}".format(k)) + # 1. filter out unnecessary keys + pretrained_dict = { + k: v + for k, v in checkpoint_state.items() if k in model_dict + } + # 2. filter out different size layers + pretrained_dict = { + k: v + for k, v in pretrained_dict.items() + if v.numel() == model_dict[k].numel() + } + # 3. 
skip reinit layers + if c.reinit_layers is not None: + for reinit_layer_name in c.reinit_layers: + pretrained_dict = { + k: v + for k, v in pretrained_dict.items() + if reinit_layer_name not in k + } + # 4. overwrite entries in the existing state dict + model_dict.update(pretrained_dict) + print(" | > {} / {} layers are restored.".format(len(pretrained_dict), + len(model_dict))) + return model_dict + + +def setup_model(num_chars, num_speakers, c): + print(" > Using model: {}".format(c.model)) + MyModel = importlib.import_module('TTS.models.' + c.model.lower()) + MyModel = getattr(MyModel, c.model) + if c.model.lower() in "tacotron": + model = MyModel(num_chars=num_chars, + num_speakers=num_speakers, + r=c.r, + postnet_output_dim=int(c.audio['fft_size'] / 2 + 1), + decoder_output_dim=c.audio['num_mels'], + gst=c.use_gst, + gst_embedding_dim=c.gst['gst_embedding_dim'], + gst_num_heads=c.gst['gst_num_heads'], + gst_style_tokens=c.gst['gst_style_tokens'], + memory_size=c.memory_size, + attn_type=c.attention_type, + attn_win=c.windowing, + attn_norm=c.attention_norm, + prenet_type=c.prenet_type, + prenet_dropout=c.prenet_dropout, + forward_attn=c.use_forward_attn, + trans_agent=c.transition_agent, + forward_attn_mask=c.forward_attn_mask, + location_attn=c.location_attn, + attn_K=c.attention_heads, + separate_stopnet=c.separate_stopnet, + bidirectional_decoder=c.bidirectional_decoder, + double_decoder_consistency=c.double_decoder_consistency, + ddc_r=c.ddc_r) + elif c.model.lower() == "tacotron2": + model = MyModel(num_chars=num_chars, + num_speakers=num_speakers, + r=c.r, + postnet_output_dim=c.audio['num_mels'], + decoder_output_dim=c.audio['num_mels'], + gst=c.use_gst, + gst_embedding_dim=c.gst['gst_embedding_dim'], + gst_num_heads=c.gst['gst_num_heads'], + gst_style_tokens=c.gst['gst_style_tokens'], + attn_type=c.attention_type, + attn_win=c.windowing, + attn_norm=c.attention_norm, + prenet_type=c.prenet_type, + prenet_dropout=c.prenet_dropout, + forward_attn=c.use_forward_attn, + trans_agent=c.transition_agent, + forward_attn_mask=c.forward_attn_mask, + location_attn=c.location_attn, + attn_K=c.attention_heads, + separate_stopnet=c.separate_stopnet, + bidirectional_decoder=c.bidirectional_decoder, + double_decoder_consistency=c.double_decoder_consistency, + ddc_r=c.ddc_r) + return model + +class KeepAverage(): + def __init__(self): + self.avg_values = {} + self.iters = {} + + def __getitem__(self, key): + return self.avg_values[key] + + def items(self): + return self.avg_values.items() + + def add_value(self, name, init_val=0, init_iter=0): + self.avg_values[name] = init_val + self.iters[name] = init_iter + + def update_value(self, name, value, weighted_avg=False): + if name not in self.avg_values: + # add value if not exist before + self.add_value(name, init_val=value) + else: + # else update existing value + if weighted_avg: + self.avg_values[name] = 0.99 * self.avg_values[name] + 0.01 * value + self.iters[name] += 1 + else: + self.avg_values[name] = self.avg_values[name] * \ + self.iters[name] + value + self.iters[name] += 1 + self.avg_values[name] /= self.iters[name] + + def add_values(self, name_dict): + for key, value in name_dict.items(): + self.add_value(key, init_val=value) + + def update_values(self, value_dict): + for key, value in value_dict.items(): + self.update_value(key, value) + + +def _check_argument(name, c, enum_list=None, max_val=None, min_val=None, restricted=False, val_type=None, alternative=None): + if alternative in c.keys() and c[alternative] is not None: + return 
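+    # 'restricted' arguments must be present in the config; the remaining checks only run when the key exists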
+ if restricted: + assert name in c.keys(), f' [!] {name} not defined in config.json' + if name in c.keys(): + if max_val: + assert c[name] <= max_val, f' [!] {name} is larger than max value {max_val}' + if min_val: + assert c[name] >= min_val, f' [!] {name} is smaller than min value {min_val}' + if enum_list: + assert c[name].lower() in enum_list, f' [!] {name} is not a valid value' + if val_type: + assert isinstance(c[name], val_type) or c[name] is None, f' [!] {name} has wrong type - {type(c[name])} vs {val_type}' + + +def check_config(c): + _check_argument('model', c, enum_list=['tacotron', 'tacotron2'], restricted=True, val_type=str) + _check_argument('run_name', c, restricted=True, val_type=str) + _check_argument('run_description', c, val_type=str) + + # AUDIO + _check_argument('audio', c, restricted=True, val_type=dict) + + # audio processing parameters + _check_argument('num_mels', c['audio'], restricted=True, val_type=int, min_val=10, max_val=2056) + _check_argument('fft_size', c['audio'], restricted=True, val_type=int, min_val=128, max_val=4058) + _check_argument('sample_rate', c['audio'], restricted=True, val_type=int, min_val=512, max_val=100000) + _check_argument('frame_length_ms', c['audio'], restricted=True, val_type=float, min_val=10, max_val=1000, alternative='win_length') + _check_argument('frame_shift_ms', c['audio'], restricted=True, val_type=float, min_val=1, max_val=1000, alternative='hop_length') + _check_argument('preemphasis', c['audio'], restricted=True, val_type=float, min_val=0, max_val=1) + _check_argument('min_level_db', c['audio'], restricted=True, val_type=int, min_val=-1000, max_val=10) + _check_argument('ref_level_db', c['audio'], restricted=True, val_type=int, min_val=0, max_val=1000) + _check_argument('power', c['audio'], restricted=True, val_type=float, min_val=1, max_val=5) + _check_argument('griffin_lim_iters', c['audio'], restricted=True, val_type=int, min_val=10, max_val=1000) + + # vocabulary parameters + _check_argument('characters', c, restricted=False, val_type=dict) + _check_argument('pad', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) + _check_argument('eos', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) + _check_argument('bos', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) + _check_argument('characters', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) + _check_argument('phonemes', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) + _check_argument('punctuations', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) + + # normalization parameters + _check_argument('signal_norm', c['audio'], restricted=True, val_type=bool) + _check_argument('symmetric_norm', c['audio'], restricted=True, val_type=bool) + _check_argument('max_norm', c['audio'], restricted=True, val_type=float, min_val=0.1, max_val=1000) + _check_argument('clip_norm', c['audio'], restricted=True, val_type=bool) + _check_argument('mel_fmin', c['audio'], restricted=True, val_type=float, min_val=0.0, max_val=1000) + _check_argument('mel_fmax', c['audio'], restricted=True, val_type=float, min_val=500.0) + _check_argument('spec_gain', c['audio'], restricted=True, val_type=float, min_val=1, max_val=100) + _check_argument('do_trim_silence', c['audio'], 
restricted=True, val_type=bool) + _check_argument('trim_db', c['audio'], restricted=True, val_type=int) + + # training parameters + _check_argument('batch_size', c, restricted=True, val_type=int, min_val=1) + _check_argument('eval_batch_size', c, restricted=True, val_type=int, min_val=1) + _check_argument('r', c, restricted=True, val_type=int, min_val=1) + _check_argument('gradual_training', c, restricted=False, val_type=list) + _check_argument('loss_masking', c, restricted=True, val_type=bool) + # _check_argument('grad_accum', c, restricted=True, val_type=int, min_val=1, max_val=100) + + # validation parameters + _check_argument('run_eval', c, restricted=True, val_type=bool) + _check_argument('test_delay_epochs', c, restricted=True, val_type=int, min_val=0) + _check_argument('test_sentences_file', c, restricted=False, val_type=str) + + # optimizer + _check_argument('noam_schedule', c, restricted=False, val_type=bool) + _check_argument('grad_clip', c, restricted=True, val_type=float, min_val=0.0) + _check_argument('epochs', c, restricted=True, val_type=int, min_val=1) + _check_argument('lr', c, restricted=True, val_type=float, min_val=0) + _check_argument('wd', c, restricted=True, val_type=float, min_val=0) + _check_argument('warmup_steps', c, restricted=True, val_type=int, min_val=0) + _check_argument('seq_len_norm', c, restricted=True, val_type=bool) + + # tacotron prenet + _check_argument('memory_size', c, restricted=True, val_type=int, min_val=-1) + _check_argument('prenet_type', c, restricted=True, val_type=str, enum_list=['original', 'bn']) + _check_argument('prenet_dropout', c, restricted=True, val_type=bool) + + # attention + _check_argument('attention_type', c, restricted=True, val_type=str, enum_list=['graves', 'original']) + _check_argument('attention_heads', c, restricted=True, val_type=int) + _check_argument('attention_norm', c, restricted=True, val_type=str, enum_list=['sigmoid', 'softmax']) + _check_argument('windowing', c, restricted=True, val_type=bool) + _check_argument('use_forward_attn', c, restricted=True, val_type=bool) + _check_argument('forward_attn_mask', c, restricted=True, val_type=bool) + _check_argument('transition_agent', c, restricted=True, val_type=bool) + _check_argument('transition_agent', c, restricted=True, val_type=bool) + _check_argument('location_attn', c, restricted=True, val_type=bool) + _check_argument('bidirectional_decoder', c, restricted=True, val_type=bool) + _check_argument('double_decoder_consistency', c, restricted=True, val_type=bool) + _check_argument('ddc_r', c, restricted='double_decoder_consistency' in c.keys(), min_val=1, max_val=7, val_type=int) + + # stopnet + _check_argument('stopnet', c, restricted=True, val_type=bool) + _check_argument('separate_stopnet', c, restricted=True, val_type=bool) + + # tensorboard + _check_argument('print_step', c, restricted=True, val_type=int, min_val=1) + _check_argument('tb_plot_step', c, restricted=True, val_type=int, min_val=1) + _check_argument('save_step', c, restricted=True, val_type=int, min_val=1) + _check_argument('checkpoint', c, restricted=True, val_type=bool) + _check_argument('tb_model_param_stats', c, restricted=True, val_type=bool) + + # dataloading + # pylint: disable=import-outside-toplevel + from TTS.utils.text import cleaners + _check_argument('text_cleaner', c, restricted=True, val_type=str, enum_list=dir(cleaners)) + _check_argument('enable_eos_bos_chars', c, restricted=True, val_type=bool) + _check_argument('num_loader_workers', c, restricted=True, val_type=int, min_val=0) + 
_check_argument('num_val_loader_workers', c, restricted=True, val_type=int, min_val=0) + _check_argument('batch_group_size', c, restricted=True, val_type=int, min_val=0) + _check_argument('min_seq_len', c, restricted=True, val_type=int, min_val=0) + _check_argument('max_seq_len', c, restricted=True, val_type=int, min_val=10) + + # paths + _check_argument('output_path', c, restricted=True, val_type=str) + + # multi-speaker + _check_argument('use_speaker_embedding', c, restricted=True, val_type=bool) + + # GST + _check_argument('use_gst', c, restricted=True, val_type=bool) + _check_argument('gst_style_input', c, restricted=True, val_type=str) + _check_argument('gst', c, restricted=True, val_type=dict) + _check_argument('gst_embedding_dim', c['gst'], restricted=True, val_type=int, min_val=1) + _check_argument('gst_num_heads', c['gst'], restricted=True, val_type=int, min_val=1) + _check_argument('gst_style_tokens', c['gst'], restricted=True, val_type=int, min_val=1) + + # datasets - checking only the first entry + _check_argument('datasets', c, restricted=True, val_type=list) + for dataset_entry in c['datasets']: + _check_argument('name', dataset_entry, restricted=True, val_type=str) + _check_argument('path', dataset_entry, restricted=True, val_type=str) + _check_argument('meta_file_train', dataset_entry, restricted=True, val_type=str) + _check_argument('meta_file_val', dataset_entry, restricted=True, val_type=str) From c2d8a338a1254faade08a5b17645f8fa83d0b24c Mon Sep 17 00:00:00 2001 From: SanjaESC Date: Fri, 10 Jul 2020 12:46:43 +0200 Subject: [PATCH 19/56] No need to query every token when none were passed --- mozilla_voice_tts/tts/models/tacotron_abstract.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/mozilla_voice_tts/tts/models/tacotron_abstract.py b/mozilla_voice_tts/tts/models/tacotron_abstract.py index bc794d49..13c3e948 100644 --- a/mozilla_voice_tts/tts/models/tacotron_abstract.py +++ b/mozilla_voice_tts/tts/models/tacotron_abstract.py @@ -175,13 +175,7 @@ class TacotronAbstract(ABC, nn.Module): gst_outputs_att = self.gst_layer.style_token_layer.attention(query, key) gst_outputs = gst_outputs + gst_outputs_att * v_amplifier elif style_input is None: - query = torch.zeros(1, 1, self.gst_embedding_dim//2).to(device) - _GST = torch.tanh(self.gst_layer.style_token_layer.style_tokens) gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device) - for k_token in range(self.gst_style_tokens): - key = _GST[int(k_token)].unsqueeze(0).expand(1, -1, -1) - gst_outputs_att = self.gst_layer.style_token_layer.attention(query, key) - gst_outputs = gst_outputs + gst_outputs_att * 0 else: gst_outputs = self.gst_layer(style_input) embedded_gst = gst_outputs.repeat(1, inputs.size(1), 1) From 447176258c5a91518e9a76858b6e8bb1f3dcc262 Mon Sep 17 00:00:00 2001 From: SanjaESC Date: Sun, 12 Jul 2020 10:40:33 +0200 Subject: [PATCH 20/56] fix fft_size key error --- mozilla_voice_tts/tts/models/tacotron_abstract.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mozilla_voice_tts/tts/models/tacotron_abstract.py b/mozilla_voice_tts/tts/models/tacotron_abstract.py index 13c3e948..d1148be5 100644 --- a/mozilla_voice_tts/tts/models/tacotron_abstract.py +++ b/mozilla_voice_tts/tts/models/tacotron_abstract.py @@ -177,6 +177,7 @@ class TacotronAbstract(ABC, nn.Module): elif style_input is None: gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device) else: + # pylint: disable=not-callable gst_outputs = self.gst_layer(style_input) embedded_gst = gst_outputs.repeat(1, inputs.size(1), 1) 
return inputs, embedded_gst From 89918c6e5351506395fad0545ca5af1cfda08d59 Mon Sep 17 00:00:00 2001 From: SanjaESC Date: Sun, 12 Jul 2020 12:33:13 +0200 Subject: [PATCH 21/56] pylint --- mozilla_voice_tts/tts/models/tacotron_abstract.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mozilla_voice_tts/tts/models/tacotron_abstract.py b/mozilla_voice_tts/tts/models/tacotron_abstract.py index d1148be5..9b2ef148 100644 --- a/mozilla_voice_tts/tts/models/tacotron_abstract.py +++ b/mozilla_voice_tts/tts/models/tacotron_abstract.py @@ -177,8 +177,7 @@ class TacotronAbstract(ABC, nn.Module): elif style_input is None: gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device) else: - # pylint: disable=not-callable - gst_outputs = self.gst_layer(style_input) + gst_outputs = self.gst_layer(style_input) # pylint: disable=not-callable embedded_gst = gst_outputs.repeat(1, inputs.size(1), 1) return inputs, embedded_gst From 77bfb881d74e753e25e2b40f8fa389620d169eca Mon Sep 17 00:00:00 2001 From: SanjaESC Date: Sun, 12 Jul 2020 14:07:44 +0200 Subject: [PATCH 22/56] tacotrongst test + test fixes --- mozilla_voice_tts/tts/models/tacotron2.py | 12 ++--- tests/outputs/dummy_model_config.json | 1 + tests/test_tacotron2_model.py | 56 +++++++++++++++++++++++ 3 files changed, 63 insertions(+), 6 deletions(-) diff --git a/mozilla_voice_tts/tts/models/tacotron2.py b/mozilla_voice_tts/tts/models/tacotron2.py index 7effdb3d..18eb17d1 100644 --- a/mozilla_voice_tts/tts/models/tacotron2.py +++ b/mozilla_voice_tts/tts/models/tacotron2.py @@ -95,14 +95,14 @@ class Tacotron2(TacotronAbstract): if self.num_speakers > 1: embedded_speakers = self.speaker_embedding(speaker_ids)[:, None] embedded_speakers = embedded_speakers.repeat(1, encoder_outputs.size(1), 1) - if hasattr(self, 'gst'): + if self.gst: # B x gst_dim encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, mel_specs) encoder_outputs = torch.cat([encoder_outputs, embedded_gst, embedded_speakers], dim=-1) else: encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1) else: - if hasattr(self, 'gst'): + if self.gst: # B x gst_dim encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, mel_specs) encoder_outputs = torch.cat([encoder_outputs, embedded_gst], dim=-1) @@ -140,14 +140,14 @@ class Tacotron2(TacotronAbstract): if self.num_speakers > 1: embedded_speakers = self.speaker_embedding(speaker_ids)[:, None] embedded_speakers = embedded_speakers.repeat(1, encoder_outputs.size(1), 1) - if hasattr(self, 'gst'): + if self.gst: # B x gst_dim encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, style_mel) encoder_outputs = torch.cat([encoder_outputs, embedded_gst, embedded_speakers], dim=-1) else: encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1) else: - if hasattr(self, 'gst'): + if self.gst: # B x gst_dim encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, style_mel) encoder_outputs = torch.cat([encoder_outputs, embedded_gst], dim=-1) @@ -170,14 +170,14 @@ class Tacotron2(TacotronAbstract): if self.num_speakers > 1: embedded_speakers = self.speaker_embedding(speaker_ids)[:, None] embedded_speakers = embedded_speakers.repeat(1, encoder_outputs.size(1), 1) - if hasattr(self, 'gst'): + if self.gst: # B x gst_dim encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, style_mel) encoder_outputs = torch.cat([encoder_outputs, embedded_gst, embedded_speakers], dim=-1) else: encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1) else: 
- if hasattr(self, 'gst'): + if self.gst: # B x gst_dim encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, style_mel) encoder_outputs = torch.cat([encoder_outputs, embedded_gst], dim=-1) diff --git a/tests/outputs/dummy_model_config.json b/tests/outputs/dummy_model_config.json index bf46be1c..b032f191 100644 --- a/tests/outputs/dummy_model_config.json +++ b/tests/outputs/dummy_model_config.json @@ -99,3 +99,4 @@ } } + diff --git a/tests/test_tacotron2_model.py b/tests/test_tacotron2_model.py index d4d5eb86..c6d08160 100644 --- a/tests/test_tacotron2_model.py +++ b/tests/test_tacotron2_model.py @@ -75,6 +75,62 @@ class TacotronTrainTest(unittest.TestCase): count, param.shape, param, param_ref) count += 1 + +class TacotronGSTTrainTest(unittest.TestCase): + def test_train_step(self): + input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) + input_lengths = torch.randint(100, 128, (8, )).long().to(device) + input_lengths = torch.sort(input_lengths, descending=True)[0] + mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device) + mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device) + mel_lengths = torch.randint(20, 30, (8, )).long().to(device) + mel_lengths[0] = 30 + stop_targets = torch.zeros(8, 30, 1).float().to(device) + speaker_ids = torch.randint(0, 5, (8, )).long().to(device) + + for idx in mel_lengths: + stop_targets[:, int(idx.item()):, 0] = 1.0 + + stop_targets = stop_targets.view(input_dummy.shape[0], + stop_targets.size(1) // c.r, -1) + stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() + + criterion = MSELossMasked(seq_len_norm=False).to(device) + criterion_st = nn.BCEWithLogitsLoss().to(device) + model = Tacotron2(num_chars=24, + gst=True, + r=c.r, + num_speakers=5).to(device) + model.train() + model_ref = copy.deepcopy(model) + count = 0 + for param, param_ref in zip(model.parameters(), + model_ref.parameters()): + assert (param - param_ref).sum() == 0, param + count += 1 + optimizer = optim.Adam(model.parameters(), lr=c.lr) + for i in range(5): + mel_out, mel_postnet_out, align, stop_tokens = model.forward( + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids) + assert torch.sigmoid(stop_tokens).data.max() <= 1.0 + assert torch.sigmoid(stop_tokens).data.min() >= 0.0 + optimizer.zero_grad() + loss = criterion(mel_out, mel_spec, mel_lengths) + stop_loss = criterion_st(stop_tokens, stop_targets) + loss = loss + criterion(mel_postnet_out, mel_postnet_spec, mel_lengths) + stop_loss + loss.backward() + optimizer.step() + # check parameter changes + count = 0 + for param, param_ref in zip(model.parameters(), + model_ref.parameters()): + # ignore pre-higway layer since it works conditional + # if count not in [145, 59]: + assert (param != param_ref).any( + ), "param {} with shape {} not updated!! 
\n{}\n{}".format( + count, param.shape, param, param_ref) + count += 1 + class TacotronGSTTrainTest(unittest.TestCase): @staticmethod def test_train_step(): From 1436206224630253c42c153a00e20c7067eb7573 Mon Sep 17 00:00:00 2001 From: SanjaESC Date: Mon, 13 Jul 2020 08:50:39 +0200 Subject: [PATCH 23/56] override compute_gst in tacotron2 model --- mozilla_voice_tts/tts/models/tacotron2.py | 22 ++++++++++++++++++- .../tts/models/tacotron_abstract.py | 22 +++++-------------- 2 files changed, 27 insertions(+), 17 deletions(-) diff --git a/mozilla_voice_tts/tts/models/tacotron2.py b/mozilla_voice_tts/tts/models/tacotron2.py index 18eb17d1..0aa237ff 100644 --- a/mozilla_voice_tts/tts/models/tacotron2.py +++ b/mozilla_voice_tts/tts/models/tacotron2.py @@ -46,13 +46,15 @@ class Tacotron2(TacotronAbstract): decoder_in_features = 512+speaker_embedding_dim+gst_embedding_dim encoder_in_features = 512 if num_speakers > 1 else 512 proj_speaker_dim = 80 if num_speakers > 1 else 0 - # base layers + + # embedding layer self.embedding = nn.Embedding(num_chars, 512, padding_idx=0) # speaker embedding layer if num_speakers > 1: self.speaker_embedding = nn.Embedding(num_speakers, speaker_embedding_dim) self.speaker_embedding.weight.data.normal_(0, 0.3) + self.encoder = Encoder(encoder_in_features) self.decoder = Decoder(decoder_in_features, self.decoder_output_dim, r, attn_type, attn_win, attn_norm, prenet_type, prenet_dropout, @@ -83,6 +85,24 @@ class Tacotron2(TacotronAbstract): mel_outputs_postnet = mel_outputs_postnet.transpose(1, 2) return mel_outputs, mel_outputs_postnet, alignments + def compute_gst(self, inputs, style_input): + """ Compute global style token """ + device = inputs.device + if isinstance(style_input, dict): + query = torch.zeros(1, 1, self.gst_embedding_dim//2).to(device) + _GST = torch.tanh(self.gst_layer.style_token_layer.style_tokens) + gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device) + for k_token, v_amplifier in style_input.items(): + key = _GST[int(k_token)].unsqueeze(0).expand(1, -1, -1) + gst_outputs_att = self.gst_layer.style_token_layer.attention(query, key) + gst_outputs = gst_outputs + gst_outputs_att * v_amplifier + elif style_input is None: + gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device) + else: + gst_outputs = self.gst_layer(style_input) # pylint: disable=not-callable + embedded_gst = gst_outputs.repeat(1, inputs.size(1), 1) + return inputs, embedded_gst + def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, speaker_ids=None): # compute mask for padding # B x T_in_max (boolean) diff --git a/mozilla_voice_tts/tts/models/tacotron_abstract.py b/mozilla_voice_tts/tts/models/tacotron_abstract.py index 9b2ef148..a4b8c227 100644 --- a/mozilla_voice_tts/tts/models/tacotron_abstract.py +++ b/mozilla_voice_tts/tts/models/tacotron_abstract.py @@ -164,22 +164,12 @@ class TacotronAbstract(ABC, nn.Module): self.speaker_embeddings_projected = self.speaker_project_mel( self.speaker_embeddings).squeeze(1) - def compute_gst(self, inputs, style_input): - device = inputs.device - if isinstance(style_input, dict): - query = torch.zeros(1, 1, self.gst_embedding_dim//2).to(device) - _GST = torch.tanh(self.gst_layer.style_token_layer.style_tokens) - gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device) - for k_token, v_amplifier in style_input.items(): - key = _GST[int(k_token)].unsqueeze(0).expand(1, -1, -1) - gst_outputs_att = self.gst_layer.style_token_layer.attention(query, key) - gst_outputs = gst_outputs + gst_outputs_att 
* v_amplifier - elif style_input is None: - gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device) - else: - gst_outputs = self.gst_layer(style_input) # pylint: disable=not-callable - embedded_gst = gst_outputs.repeat(1, inputs.size(1), 1) - return inputs, embedded_gst + def compute_gst(self, inputs, mel_specs): + """ Compute global style token """ + # pylint: disable=not-callable + gst_outputs = self.gst_layer(mel_specs) + inputs = self._add_speaker_embedding(inputs, gst_outputs) + return inputs @staticmethod def _add_speaker_embedding(outputs, speaker_embeddings): From c4828b2b9e94fa70e5f8097aae405077fbb24aca Mon Sep 17 00:00:00 2001 From: SanjaESC Date: Mon, 13 Jul 2020 08:51:37 +0200 Subject: [PATCH 24/56] small gst config change --- tests/test_tacotron2_model.py | 2 +- utils/generic_utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_tacotron2_model.py b/tests/test_tacotron2_model.py index c6d08160..92ffb9aa 100644 --- a/tests/test_tacotron2_model.py +++ b/tests/test_tacotron2_model.py @@ -239,4 +239,4 @@ class TacotronGSTTrainTest(unittest.TestCase): assert (param != param_ref).any( ), "param {} {} with shape {} not updated!! \n{}\n{}".format( name, count, param.shape, param, param_ref) - count += 1 \ No newline at end of file + count += 1 diff --git a/utils/generic_utils.py b/utils/generic_utils.py index 8b4b1f12..3bb99e08 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -359,8 +359,8 @@ def check_config(c): # GST _check_argument('use_gst', c, restricted=True, val_type=bool) - _check_argument('gst_style_input', c, restricted=True, val_type=str) _check_argument('gst', c, restricted=True, val_type=dict) + _check_argument('gst_style_input', c['gst'], restricted=True, val_type=str) _check_argument('gst_embedding_dim', c['gst'], restricted=True, val_type=int, min_val=1) _check_argument('gst_num_heads', c['gst'], restricted=True, val_type=int, min_val=1) _check_argument('gst_style_tokens', c['gst'], restricted=True, val_type=int, min_val=1) From 70c665b9c4c3716059b1bb774f93f775771eead4 Mon Sep 17 00:00:00 2001 From: Edresson Date: Mon, 27 Jul 2020 16:59:59 -0300 Subject: [PATCH 25/56] add support fot VCTK and BRSpeech dataset --- mozilla_voice_tts/tts/datasets/preprocess.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mozilla_voice_tts/tts/datasets/preprocess.py b/mozilla_voice_tts/tts/datasets/preprocess.py index 317673e3..0509677c 100644 --- a/mozilla_voice_tts/tts/datasets/preprocess.py +++ b/mozilla_voice_tts/tts/datasets/preprocess.py @@ -229,14 +229,14 @@ def vctk(root_path, meta_files=None, wavs_path='wav48'): items = [] meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) for meta_file in meta_files: - _, speaker_id, txt_file = os.path.relpath(meta_file, root_path).split(os.sep) + txt, speaker_id, txt_file = os.path.relpath(meta_file,root_path).split(os.sep) file_id = txt_file.split('.')[0] if isinstance(test_speakers, list): # if is list ignore this speakers ids if speaker_id in test_speakers: continue with open(meta_file) as file_text: text = file_text.readlines()[0] - wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id+'.wav') + wav_file = os.path.join(root_path, wavs_path, speaker_id,file_id+'.wav') items.append([text, wav_file, speaker_id]) - + return items \ No newline at end of file From be77e24a39dcc65c842096d9c289b4091811ddcf Mon Sep 17 00:00:00 2001 From: Edresson Date: Tue, 28 Jul 2020 17:11:32 -0300 Subject: [PATCH 26/56] bugfix in DDC 
now DDC work on Tacotron1 --- mozilla_voice_tts/tts/configs/config.json | 8 ++- mozilla_voice_tts/tts/models/tacotron.py | 18 +++---- mozilla_voice_tts/tts/models/tacotron2.py | 54 ++++++------------- .../tts/models/tacotron_abstract.py | 19 +++++-- mozilla_voice_tts/tts/utils/generic_utils.py | 6 +++ 5 files changed, 52 insertions(+), 53 deletions(-) diff --git a/mozilla_voice_tts/tts/configs/config.json b/mozilla_voice_tts/tts/configs/config.json index 090540ab..70529fea 100644 --- a/mozilla_voice_tts/tts/configs/config.json +++ b/mozilla_voice_tts/tts/configs/config.json @@ -1,5 +1,5 @@ { - "model": "Tacotron2", + "model": "Tacotron", "run_name": "ljspeech-ddc-bn", "run_description": "tacotron2 with ddc and batch-normalization", @@ -114,7 +114,7 @@ "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. // DATA LOADING - "text_cleaner": "phoneme_cleaners", + "text_cleaner": "portuguese_cleaners", "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. "num_val_loader_workers": 4, // number of evaluation data loader processes. @@ -131,9 +131,13 @@ "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages // MULTI-SPEAKER and GST +<<<<<<< HEAD:mozilla_voice_tts/tts/configs/config.json "use_speaker_embedding": true, // use speaker embedding to enable multi-speaker learning. "use_external_speaker_embedding_file": false, // if true, forces the model to use external embedding per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 "external_speaker_embedding_file": "../../speakers-vctk-en.json", // if not null and use_external_speaker_embedding_file is true, it is used to load a specific embedding file and thus uses these embeddings instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 +======= + "use_speaker_embedding": true, // use speaker embedding to enable multi-speaker learning. 
+>>>>>>> bugfix in DDC now DDC work on Tacotron1:TTS/tts/configs/config.json "use_gst": true, // use global style tokens "gst": { // gst parameter if gst is enabled "gst_style_input": null, // Condition the style input either on a diff --git a/mozilla_voice_tts/tts/models/tacotron.py b/mozilla_voice_tts/tts/models/tacotron.py index 682d2b59..1395de97 100644 --- a/mozilla_voice_tts/tts/models/tacotron.py +++ b/mozilla_voice_tts/tts/models/tacotron.py @@ -42,19 +42,13 @@ class Tacotron(TacotronAbstract): bidirectional_decoder, double_decoder_consistency, ddc_r, gst) + # init layer dims decoder_in_features = 256 encoder_in_features = 256 + speaker_embedding_dim = 256 + proj_speaker_dim = 80 if num_speakers > 1 else 0 - if speaker_embedding_dim is None: - # if speaker_embedding_dim is None we need use the nn.Embedding, with default speaker_embedding_dim - self.embeddings_per_sample = False - speaker_embedding_dim = 256 - else: - # if speaker_embedding_dim is not None we need use speaker embedding per sample - self.embeddings_per_sample = True - - # speaker and gst embeddings is concat in decoder input if num_speakers > 1: decoder_in_features = decoder_in_features + speaker_embedding_dim # add speaker embedding dim if self.gst: @@ -109,6 +103,9 @@ class Tacotron(TacotronAbstract): input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths) # B x T_in x embed_dim inputs = self.embedding(characters) + # B x speaker_embed_dim + if speaker_ids is not None: + self.compute_speaker_embedding(speaker_ids) # B x T_in x encoder_in_features encoder_outputs = self.encoder(inputs) # sequence masking @@ -155,6 +152,9 @@ class Tacotron(TacotronAbstract): @torch.no_grad() def inference(self, characters, speaker_ids=None, style_mel=None, speaker_embeddings=None): inputs = self.embedding(characters) + self._init_states() + if speaker_ids is not None: + self.compute_speaker_embedding(speaker_ids) encoder_outputs = self.encoder(inputs) if self.gst: # B x gst_dim diff --git a/mozilla_voice_tts/tts/models/tacotron2.py b/mozilla_voice_tts/tts/models/tacotron2.py index 0aa237ff..47057c56 100644 --- a/mozilla_voice_tts/tts/models/tacotron2.py +++ b/mozilla_voice_tts/tts/models/tacotron2.py @@ -1,9 +1,15 @@ import torch from torch import nn +<<<<<<< HEAD:mozilla_voice_tts/tts/models/tacotron2.py from mozilla_voice_tts.tts.layers.gst_layers import GST from mozilla_voice_tts.tts.layers.tacotron2 import Decoder, Encoder, Postnet from mozilla_voice_tts.tts.models.tacotron_abstract import TacotronAbstract +======= +from TTS.tts.layers.gst_layers import GST +from TTS.tts.layers.tacotron2 import Decoder, Encoder, Postnet +from TTS.tts.models.tacotron_abstract import TacotronAbstract +>>>>>>> bugfix in DDC now DDC work on Tacotron1:TTS/tts/models/tacotron2.py # TODO: match function arguments with tacotron class Tacotron2(TacotronAbstract): @@ -85,24 +91,6 @@ class Tacotron2(TacotronAbstract): mel_outputs_postnet = mel_outputs_postnet.transpose(1, 2) return mel_outputs, mel_outputs_postnet, alignments - def compute_gst(self, inputs, style_input): - """ Compute global style token """ - device = inputs.device - if isinstance(style_input, dict): - query = torch.zeros(1, 1, self.gst_embedding_dim//2).to(device) - _GST = torch.tanh(self.gst_layer.style_token_layer.style_tokens) - gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device) - for k_token, v_amplifier in style_input.items(): - key = _GST[int(k_token)].unsqueeze(0).expand(1, -1, -1) - gst_outputs_att = self.gst_layer.style_token_layer.attention(query, 
key) - gst_outputs = gst_outputs + gst_outputs_att * v_amplifier - elif style_input is None: - gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device) - else: - gst_outputs = self.gst_layer(style_input) # pylint: disable=not-callable - embedded_gst = gst_outputs.repeat(1, inputs.size(1), 1) - return inputs, embedded_gst - def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, speaker_ids=None): # compute mask for padding # B x T_in_max (boolean) @@ -112,20 +100,13 @@ class Tacotron2(TacotronAbstract): # B x T_in_max x D_en encoder_outputs = self.encoder(embedded_inputs, text_lengths) + if self.gst: + # B x gst_dim + encoder_outputs = self.compute_gst(encoder_outputs, mel_specs) + if self.num_speakers > 1: embedded_speakers = self.speaker_embedding(speaker_ids)[:, None] - embedded_speakers = embedded_speakers.repeat(1, encoder_outputs.size(1), 1) - if self.gst: - # B x gst_dim - encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, mel_specs) - encoder_outputs = torch.cat([encoder_outputs, embedded_gst, embedded_speakers], dim=-1) - else: - encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1) - else: - if self.gst: - # B x gst_dim - encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, mel_specs) - encoder_outputs = torch.cat([encoder_outputs, embedded_gst], dim=-1) + encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers) encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs) @@ -162,15 +143,14 @@ class Tacotron2(TacotronAbstract): embedded_speakers = embedded_speakers.repeat(1, encoder_outputs.size(1), 1) if self.gst: # B x gst_dim - encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, style_mel) - encoder_outputs = torch.cat([encoder_outputs, embedded_gst, embedded_speakers], dim=-1) + encoder_outputs = self.compute_gst(encoder_outputs, style_mel) + encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1) else: encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1) else: if self.gst: # B x gst_dim - encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, style_mel) - encoder_outputs = torch.cat([encoder_outputs, embedded_gst], dim=-1) + encoder_outputs = self.compute_gst(encoder_outputs, style_mel) decoder_outputs, alignments, stop_tokens = self.decoder.inference( encoder_outputs) @@ -192,15 +172,13 @@ class Tacotron2(TacotronAbstract): embedded_speakers = embedded_speakers.repeat(1, encoder_outputs.size(1), 1) if self.gst: # B x gst_dim - encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, style_mel) - encoder_outputs = torch.cat([encoder_outputs, embedded_gst, embedded_speakers], dim=-1) + encoder_outputs = self.compute_gst(encoder_outputs, style_mel) else: encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1) else: if self.gst: # B x gst_dim - encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, style_mel) - encoder_outputs = torch.cat([encoder_outputs, embedded_gst], dim=-1) + encoder_outputs = self.compute_gst(encoder_outputs, style_mel) mel_outputs, alignments, stop_tokens = self.decoder.inference_truncated( encoder_outputs) diff --git a/mozilla_voice_tts/tts/models/tacotron_abstract.py b/mozilla_voice_tts/tts/models/tacotron_abstract.py index a4b8c227..6f3d32ad 100644 --- a/mozilla_voice_tts/tts/models/tacotron_abstract.py +++ b/mozilla_voice_tts/tts/models/tacotron_abstract.py @@ -164,11 +164,22 @@ class TacotronAbstract(ABC, nn.Module): 
self.speaker_embeddings_projected = self.speaker_project_mel( self.speaker_embeddings).squeeze(1) - def compute_gst(self, inputs, mel_specs): + def compute_gst(self, inputs, style_input): """ Compute global style token """ - # pylint: disable=not-callable - gst_outputs = self.gst_layer(mel_specs) - inputs = self._add_speaker_embedding(inputs, gst_outputs) + device = inputs.device + if isinstance(style_input, dict): + query = torch.zeros(1, 1, self.gst_embedding_dim//2).to(device) + _GST = torch.tanh(self.gst_layer.style_token_layer.style_tokens) + gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device) + for k_token, v_amplifier in style_input.items(): + key = _GST[int(k_token)].unsqueeze(0).expand(1, -1, -1) + gst_outputs_att = self.gst_layer.style_token_layer.attention(query, key) + gst_outputs = gst_outputs + gst_outputs_att * v_amplifier + elif style_input is None: + gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device) + else: + gst_outputs = self.gst_layer(style_input) # pylint: disable=not-callable + inputs = self._concat_speaker_embedding(inputs, gst_outputs) return inputs @staticmethod diff --git a/mozilla_voice_tts/tts/utils/generic_utils.py b/mozilla_voice_tts/tts/utils/generic_utils.py index 2d5044ef..fc35840d 100644 --- a/mozilla_voice_tts/tts/utils/generic_utils.py +++ b/mozilla_voice_tts/tts/utils/generic_utils.py @@ -265,6 +265,12 @@ def check_config(c): check_argument('gst_num_heads', c['gst'], restricted=True, val_type=int, min_val=2, max_val=10) check_argument('gst_style_tokens', c['gst'], restricted=True, val_type=int, min_val=1, max_val=1000) + check_argument('gst', c, restricted=True, val_type=dict) + check_argument('gst_style_input', c['gst'], restricted=True, val_type=[str, dict]) + check_argument('gst_embedding_dim', c['gst'], restricted=True, val_type=int, min_val=0, max_val=1000) + check_argument('gst_num_heads', c['gst'], restricted=True, val_type=int, min_val=2, max_val=10) + check_argument('gst_style_tokens', c['gst'], restricted=True, val_type=int, min_val=1, max_val=1000) + # datasets - checking only the first entry check_argument('datasets', c, restricted=True, val_type=list) for dataset_entry in c['datasets']: From 89d338358e5c698876bb5c4a3ffeace98b98c0e4 Mon Sep 17 00:00:00 2001 From: Edresson Date: Wed, 29 Jul 2020 00:49:00 -0300 Subject: [PATCH 27/56] add External Embedding per sample instead of nn.Embedding --- mozilla_voice_tts/bin/train_tts.py | 3 + mozilla_voice_tts/tts/configs/config.json | 18 ++--- mozilla_voice_tts/tts/models/tacotron.py | 19 +++--- mozilla_voice_tts/tts/models/tacotron2.py | 82 +++++++++++++---------- 4 files changed, 65 insertions(+), 57 deletions(-) diff --git a/mozilla_voice_tts/bin/train_tts.py b/mozilla_voice_tts/bin/train_tts.py index 1b9bc032..f7bb0b60 100644 --- a/mozilla_voice_tts/bin/train_tts.py +++ b/mozilla_voice_tts/bin/train_tts.py @@ -523,6 +523,7 @@ def main(args): # pylint: disable=redefined-outer-name "a previously trained model." 
elif c.use_external_speaker_embedding_file and c.external_speaker_embedding_file: # if start new train using External Embedding file speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file) + print(speaker_mapping) speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]['embedding']) elif c.use_external_speaker_embedding_file and not c.external_speaker_embedding_file: # if start new train using External Embedding file and don't pass external embedding file raise "use_external_speaker_embedding_file is True, so you need pass a external speaker embedding file, run GE2E-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb or AngularPrototypical-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb notebook in notebooks/ folder" @@ -558,6 +559,8 @@ def main(args): # pylint: disable=redefined-outer-name # setup criterion criterion = TacotronLoss(c, stopnet_pos_weight=10.0, ga_sigma=0.4) + for name, _ in model.named_parameters(): + print(name) if args.restore_path: checkpoint = torch.load(args.restore_path, map_location='cpu') diff --git a/mozilla_voice_tts/tts/configs/config.json b/mozilla_voice_tts/tts/configs/config.json index 70529fea..2a61ba03 100644 --- a/mozilla_voice_tts/tts/configs/config.json +++ b/mozilla_voice_tts/tts/configs/config.json @@ -1,5 +1,5 @@ { - "model": "Tacotron", + "model": "Tacotron2", "run_name": "ljspeech-ddc-bn", "run_description": "tacotron2 with ddc and batch-normalization", @@ -114,7 +114,7 @@ "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. // DATA LOADING - "text_cleaner": "portuguese_cleaners", + "text_cleaner": "phoneme_cleaners", "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. "num_val_loader_workers": 4, // number of evaluation data loader processes. @@ -131,23 +131,19 @@ "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages // MULTI-SPEAKER and GST -<<<<<<< HEAD:mozilla_voice_tts/tts/configs/config.json "use_speaker_embedding": true, // use speaker embedding to enable multi-speaker learning. "use_external_speaker_embedding_file": false, // if true, forces the model to use external embedding per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 "external_speaker_embedding_file": "../../speakers-vctk-en.json", // if not null and use_external_speaker_embedding_file is true, it is used to load a specific embedding file and thus uses these embeddings instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 -======= - "use_speaker_embedding": true, // use speaker embedding to enable multi-speaker learning. 
->>>>>>> bugfix in DDC now DDC work on Tacotron1:TTS/tts/configs/config.json "use_gst": true, // use global style tokens "gst": { // gst parameter if gst is enabled - "gst_style_input": null, // Condition the style input either on a - // -> wave file [path to wave] or - // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15} + "gst_style_input": null, // Condition the style input either on a + // -> wave file [path to wave] or + // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15} // with the dictionary being len(dict) <= len(gst_style_tokens). - "gst_embedding_dim": 512, + "gst_embedding_dim": 512, "gst_num_heads": 4, "gst_style_tokens": 10 - }, + }, // DATASETS "datasets": // List of datasets. They all merged and they get different speaker_ids. diff --git a/mozilla_voice_tts/tts/models/tacotron.py b/mozilla_voice_tts/tts/models/tacotron.py index 1395de97..bcc4a2a6 100644 --- a/mozilla_voice_tts/tts/models/tacotron.py +++ b/mozilla_voice_tts/tts/models/tacotron.py @@ -42,13 +42,19 @@ class Tacotron(TacotronAbstract): bidirectional_decoder, double_decoder_consistency, ddc_r, gst) - # init layer dims decoder_in_features = 256 encoder_in_features = 256 - speaker_embedding_dim = 256 - proj_speaker_dim = 80 if num_speakers > 1 else 0 + if speaker_embedding_dim is None: + # if speaker_embedding_dim is None we need use the nn.Embedding, with default speaker_embedding_dim + self.embeddings_per_sample = False + speaker_embedding_dim = 256 + else: + # if speaker_embedding_dim is not None we need use speaker embedding per sample + self.embeddings_per_sample = True + + # speaker and gst embeddings is concat in decoder input if num_speakers > 1: decoder_in_features = decoder_in_features + speaker_embedding_dim # add speaker embedding dim if self.gst: @@ -103,14 +109,10 @@ class Tacotron(TacotronAbstract): input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths) # B x T_in x embed_dim inputs = self.embedding(characters) - # B x speaker_embed_dim - if speaker_ids is not None: - self.compute_speaker_embedding(speaker_ids) # B x T_in x encoder_in_features encoder_outputs = self.encoder(inputs) # sequence masking encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs) - # global style token if self.gst: # B x gst_dim @@ -152,9 +154,6 @@ class Tacotron(TacotronAbstract): @torch.no_grad() def inference(self, characters, speaker_ids=None, style_mel=None, speaker_embeddings=None): inputs = self.embedding(characters) - self._init_states() - if speaker_ids is not None: - self.compute_speaker_embedding(speaker_ids) encoder_outputs = self.encoder(inputs) if self.gst: # B x gst_dim diff --git a/mozilla_voice_tts/tts/models/tacotron2.py b/mozilla_voice_tts/tts/models/tacotron2.py index 47057c56..c2fc8a32 100644 --- a/mozilla_voice_tts/tts/models/tacotron2.py +++ b/mozilla_voice_tts/tts/models/tacotron2.py @@ -33,6 +33,7 @@ class Tacotron2(TacotronAbstract): bidirectional_decoder=False, double_decoder_consistency=False, ddc_r=None, + speaker_embedding_dim=None, gst=False, gst_embedding_dim=512, gst_num_heads=4, @@ -47,20 +48,33 @@ class Tacotron2(TacotronAbstract): ddc_r, gst) # init layer dims - speaker_embedding_dim = 512 if num_speakers > 1 else 0 - gst_embedding_dim = gst_embedding_dim if self.gst else 0 - decoder_in_features = 512+speaker_embedding_dim+gst_embedding_dim - encoder_in_features = 512 if num_speakers > 1 else 512 - 
proj_speaker_dim = 80 if num_speakers > 1 else 0 + decoder_in_features = 512 + encoder_in_features = 512 + + if speaker_embedding_dim is None: + # if speaker_embedding_dim is None we need use the nn.Embedding, with default speaker_embedding_dim + self.embeddings_per_sample = False + speaker_embedding_dim = 512 + else: + # if speaker_embedding_dim is not None we need use speaker embedding per sample + self.embeddings_per_sample = True + + # speaker and gst embeddings is concat in decoder input + if num_speakers > 1: + decoder_in_features = decoder_in_features + speaker_embedding_dim # add speaker embedding dim + if self.gst: + decoder_in_features = decoder_in_features + gst_embedding_dim # add gst embedding dim # embedding layer self.embedding = nn.Embedding(num_chars, 512, padding_idx=0) # speaker embedding layer if num_speakers > 1: - self.speaker_embedding = nn.Embedding(num_speakers, speaker_embedding_dim) - self.speaker_embedding.weight.data.normal_(0, 0.3) - + if not self.embeddings_per_sample: + self.speaker_embedding = nn.Embedding(num_speakers, speaker_embedding_dim) + self.speaker_embedding.weight.data.normal_(0, 0.3) + + # base model layers self.encoder = Encoder(encoder_in_features) self.decoder = Decoder(decoder_in_features, self.decoder_output_dim, r, attn_type, attn_win, attn_norm, prenet_type, prenet_dropout, @@ -91,7 +105,7 @@ class Tacotron2(TacotronAbstract): mel_outputs_postnet = mel_outputs_postnet.transpose(1, 2) return mel_outputs, mel_outputs_postnet, alignments - def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, speaker_ids=None): + def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, speaker_ids=None, speaker_embeddings=None): # compute mask for padding # B x T_in_max (boolean) input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths) @@ -105,8 +119,13 @@ class Tacotron2(TacotronAbstract): encoder_outputs = self.compute_gst(encoder_outputs, mel_specs) if self.num_speakers > 1: - embedded_speakers = self.speaker_embedding(speaker_ids)[:, None] - encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers) + if not self.embeddings_per_sample: + # B x 1 x speaker_embed_dim + speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None] + else: + # B x 1 x speaker_embed_dim + speaker_embeddings = torch.unsqueeze(speaker_embeddings, 1) + encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs) @@ -134,23 +153,18 @@ class Tacotron2(TacotronAbstract): return decoder_outputs, postnet_outputs, alignments, stop_tokens @torch.no_grad() - def inference(self, text, speaker_ids=None, style_mel=None): + def inference(self, text, speaker_ids=None, style_mel=None, speaker_embeddings=None): embedded_inputs = self.embedding(text).transpose(1, 2) encoder_outputs = self.encoder.inference(embedded_inputs) + if self.gst: + # B x gst_dim + encoder_outputs = self.compute_gst(encoder_outputs, style_mel) + if self.num_speakers > 1: - embedded_speakers = self.speaker_embedding(speaker_ids)[:, None] - embedded_speakers = embedded_speakers.repeat(1, encoder_outputs.size(1), 1) - if self.gst: - # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, style_mel) - encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1) - else: - encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1) - else: - if self.gst: - # B x gst_dim - encoder_outputs = 
self.compute_gst(encoder_outputs, style_mel) + if not self.embeddings_per_sample: + speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None] + encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) decoder_outputs, alignments, stop_tokens = self.decoder.inference( encoder_outputs) @@ -160,25 +174,21 @@ class Tacotron2(TacotronAbstract): decoder_outputs, postnet_outputs, alignments) return decoder_outputs, postnet_outputs, alignments, stop_tokens - def inference_truncated(self, text, speaker_ids=None, style_mel=None): + def inference_truncated(self, text, speaker_ids=None, style_mel=None, speaker_embeddings=None): """ Preserve model states for continuous inference """ embedded_inputs = self.embedding(text).transpose(1, 2) encoder_outputs = self.encoder.inference_truncated(embedded_inputs) + if self.gst: + # B x gst_dim + encoder_outputs = self.compute_gst(encoder_outputs, style_mel) + if self.num_speakers > 1: - embedded_speakers = self.speaker_embedding(speaker_ids)[:, None] - embedded_speakers = embedded_speakers.repeat(1, encoder_outputs.size(1), 1) - if self.gst: - # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, style_mel) - else: - encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1) - else: - if self.gst: - # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, style_mel) + if not self.embeddings_per_sample: + speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None] + encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) mel_outputs, alignments, stop_tokens = self.decoder.inference_truncated( encoder_outputs) From a2ee48c28e6c4cf9d00063218154e3c3cd2747a4 Mon Sep 17 00:00:00 2001 From: Edresson Date: Wed, 29 Jul 2020 18:22:13 -0300 Subject: [PATCH 28/56] bugfix in GST --- mozilla_voice_tts/bin/train_tts.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/mozilla_voice_tts/bin/train_tts.py b/mozilla_voice_tts/bin/train_tts.py index f7bb0b60..daa517b9 100644 --- a/mozilla_voice_tts/bin/train_tts.py +++ b/mozilla_voice_tts/bin/train_tts.py @@ -559,8 +559,6 @@ def main(args): # pylint: disable=redefined-outer-name # setup criterion criterion = TacotronLoss(c, stopnet_pos_weight=10.0, ga_sigma=0.4) - for name, _ in model.named_parameters(): - print(name) if args.restore_path: checkpoint = torch.load(args.restore_path, map_location='cpu') From f91b9eeda1ab104c8ba6888c2599d7f0fa2e3a35 Mon Sep 17 00:00:00 2001 From: Edresson Date: Wed, 29 Jul 2020 21:21:42 -0300 Subject: [PATCH 29/56] fix Lint check --- mozilla_voice_tts/tts/datasets/preprocess.py | 4 ++-- mozilla_voice_tts/tts/models/tacotron2.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/mozilla_voice_tts/tts/datasets/preprocess.py b/mozilla_voice_tts/tts/datasets/preprocess.py index 0509677c..7865652a 100644 --- a/mozilla_voice_tts/tts/datasets/preprocess.py +++ b/mozilla_voice_tts/tts/datasets/preprocess.py @@ -229,14 +229,14 @@ def vctk(root_path, meta_files=None, wavs_path='wav48'): items = [] meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) for meta_file in meta_files: - txt, speaker_id, txt_file = os.path.relpath(meta_file,root_path).split(os.sep) + _, speaker_id, txt_file = os.path.relpath(meta_file, root_path).split(os.sep) file_id = txt_file.split('.')[0] if isinstance(test_speakers, list): # if is list ignore this speakers ids if speaker_id in test_speakers: continue with open(meta_file) as file_text: text = file_text.readlines()[0] - 
wav_file = os.path.join(root_path, wavs_path, speaker_id,file_id+'.wav') + wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id+'.wav') items.append([text, wav_file, speaker_id]) return items \ No newline at end of file diff --git a/mozilla_voice_tts/tts/models/tacotron2.py b/mozilla_voice_tts/tts/models/tacotron2.py index c2fc8a32..2839cea7 100644 --- a/mozilla_voice_tts/tts/models/tacotron2.py +++ b/mozilla_voice_tts/tts/models/tacotron2.py @@ -58,7 +58,7 @@ class Tacotron2(TacotronAbstract): else: # if speaker_embedding_dim is not None we need use speaker embedding per sample self.embeddings_per_sample = True - + # speaker and gst embeddings is concat in decoder input if num_speakers > 1: decoder_in_features = decoder_in_features + speaker_embedding_dim # add speaker embedding dim @@ -73,7 +73,7 @@ class Tacotron2(TacotronAbstract): if not self.embeddings_per_sample: self.speaker_embedding = nn.Embedding(num_speakers, speaker_embedding_dim) self.speaker_embedding.weight.data.normal_(0, 0.3) - + # base model layers self.encoder = Encoder(encoder_in_features) self.decoder = Decoder(decoder_in_features, self.decoder_output_dim, r, attn_type, attn_win, From 496a4be3e35e127b6df09033ff3b57a0a00c970e Mon Sep 17 00:00:00 2001 From: Edresson Date: Thu, 30 Jul 2020 03:51:20 -0300 Subject: [PATCH 30/56] add support for synthesize using variable size external embedding and add bugfix in scipy.io import --- mozilla_voice_tts/bin/train_tts.py | 1 - mozilla_voice_tts/tts/utils/synthesis.py | 3 --- 2 files changed, 4 deletions(-) diff --git a/mozilla_voice_tts/bin/train_tts.py b/mozilla_voice_tts/bin/train_tts.py index daa517b9..1b9bc032 100644 --- a/mozilla_voice_tts/bin/train_tts.py +++ b/mozilla_voice_tts/bin/train_tts.py @@ -523,7 +523,6 @@ def main(args): # pylint: disable=redefined-outer-name "a previously trained model." 
elif c.use_external_speaker_embedding_file and c.external_speaker_embedding_file: # if start new train using External Embedding file speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file) - print(speaker_mapping) speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]['embedding']) elif c.use_external_speaker_embedding_file and not c.external_speaker_embedding_file: # if start new train using External Embedding file and don't pass external embedding file raise "use_external_speaker_embedding_file is True, so you need pass a external speaker embedding file, run GE2E-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb or AngularPrototypical-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb notebook in notebooks/ folder" diff --git a/mozilla_voice_tts/tts/utils/synthesis.py b/mozilla_voice_tts/tts/utils/synthesis.py index 52b33e86..2f746533 100644 --- a/mozilla_voice_tts/tts/utils/synthesis.py +++ b/mozilla_voice_tts/tts/utils/synthesis.py @@ -210,13 +210,10 @@ def synthesis(model, if backend == 'torch': if speaker_id is not None: speaker_id = id_to_torch(speaker_id, cuda=use_cuda) -<<<<<<< HEAD:mozilla_voice_tts/tts/utils/synthesis.py if speaker_embedding is not None: speaker_embedding = embedding_to_torch(speaker_embedding, cuda=use_cuda) -======= ->>>>>>> Added support for Tacotron2 GST + abbility to condition style input with wav or tokens:utils/synthesis.py if not isinstance(style_mel, dict): style_mel = numpy_to_torch(style_mel, torch.float, cuda=use_cuda) inputs = numpy_to_torch(inputs, torch.long, cuda=use_cuda) From ae62efd9eb0820fd430514ca2fde3f8b80c0aad2 Mon Sep 17 00:00:00 2001 From: Edresson Date: Thu, 30 Jul 2020 19:48:27 -0300 Subject: [PATCH 31/56] Force the loading of the wav in synthesis using the AP sample rate to avoid breaking the demo --- mozilla_voice_tts/tts/utils/synthesis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mozilla_voice_tts/tts/utils/synthesis.py b/mozilla_voice_tts/tts/utils/synthesis.py index 2f746533..0952c936 100644 --- a/mozilla_voice_tts/tts/utils/synthesis.py +++ b/mozilla_voice_tts/tts/utils/synthesis.py @@ -39,7 +39,7 @@ def numpy_to_tf(np_array, dtype): def compute_style_mel(style_wav, ap, cuda=False): style_mel = torch.FloatTensor(ap.melspectrogram( - ap.load_wav(style_wav))).unsqueeze(0) + ap.load_wav(style_wav, sr=ap.sample_rate))).unsqueeze(0) if cuda: return style_mel.cuda() return style_mel From 27ddef141dc38d38475dd543c2a630768d58e283 Mon Sep 17 00:00:00 2001 From: Edresson Date: Thu, 30 Jul 2020 21:18:46 -0300 Subject: [PATCH 32/56] add gst suport in synthesize --- mozilla_voice_tts/bin/synthesize.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/mozilla_voice_tts/bin/synthesize.py b/mozilla_voice_tts/bin/synthesize.py index 527a3ce9..3de97555 100644 --- a/mozilla_voice_tts/bin/synthesize.py +++ b/mozilla_voice_tts/bin/synthesize.py @@ -18,9 +18,9 @@ from mozilla_voice_tts.utils.io import load_config from mozilla_voice_tts.vocoder.utils.generic_utils import setup_generator -def tts(model, vocoder_model, text, CONFIG, use_cuda, ap, use_gl, speaker_fileid, speaker_embedding=None): +def tts(model, vocoder_model, text, CONFIG, use_cuda, ap, use_gl, speaker_fileid, speaker_embedding=None, gst_style=None): t_1 = time.time() - waveform, _, _, mel_postnet_spec, _, _ = synthesis(model, text, CONFIG, use_cuda, ap, speaker_fileid, None, False, CONFIG.enable_eos_bos_chars, use_gl, speaker_embedding=speaker_embedding) + waveform, _, _, 
mel_postnet_spec, _, _ = synthesis(model, text, CONFIG, use_cuda, ap, speaker_fileid, gst_style, False, CONFIG.enable_eos_bos_chars, use_gl, speaker_embedding=speaker_embedding) if CONFIG.model == "Tacotron" and not use_gl: mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T if not use_gl: @@ -84,6 +84,11 @@ if __name__ == "__main__": type=str, help="if CONFIG.use_external_speaker_embedding_file is true, name of speaker embedding reference file present in speakers.json, else target speaker_fileid if the model is multi-speaker.", default=None) + parser.add_argument( + '--gst_style', + help="Wav path file for GST stylereference.", + default=None) + args = parser.parse_args() # load the config @@ -147,7 +152,12 @@ if __name__ == "__main__": else: args.speaker_fileid = None - wav = tts(model, vocoder_model, args.text, C, args.use_cuda, ap, use_griffin_lim, args.speaker_fileid, speaker_embedding=speaker_embedding) + if args.gst_style is None: + gst_style = C.gst['gst_style_input'] + else: + gst_style = args.gst_style + + wav = tts(model, vocoder_model, args.text, C, args.use_cuda, ap, use_griffin_lim, args.speaker_fileid, speaker_embedding=speaker_embedding, gst_style=gst_style) # save the results file_name = args.text.replace(" ", "_") From d038a006020fbab70a0568835e7f2a7c646e5e36 Mon Sep 17 00:00:00 2001 From: Edresson Date: Thu, 30 Jul 2020 21:51:43 -0300 Subject: [PATCH 33/56] add gst style dict suport in synthesize --- mozilla_voice_tts/bin/synthesize.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mozilla_voice_tts/bin/synthesize.py b/mozilla_voice_tts/bin/synthesize.py index 3de97555..1e2886b8 100644 --- a/mozilla_voice_tts/bin/synthesize.py +++ b/mozilla_voice_tts/bin/synthesize.py @@ -155,7 +155,11 @@ if __name__ == "__main__": if args.gst_style is None: gst_style = C.gst['gst_style_input'] else: - gst_style = args.gst_style + # check if gst_style string is a dict, if is dict convert else use string + try: + gst_style = json.loads(args.gst_style) + except: + gst_style = args.gst_style wav = tts(model, vocoder_model, args.text, C, args.use_cuda, ap, use_griffin_lim, args.speaker_fileid, speaker_embedding=speaker_embedding, gst_style=gst_style) From 0fa2544fa5a5c3b9021dcd6e17927379b11fea48 Mon Sep 17 00:00:00 2001 From: Edresson Date: Thu, 30 Jul 2020 22:04:32 -0300 Subject: [PATCH 34/56] bugfix in tacotron and tacotron 2 gst inference --- mozilla_voice_tts/bin/synthesize.py | 2 +- mozilla_voice_tts/tts/models/tacotron.py | 2 +- mozilla_voice_tts/tts/models/tacotron2.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mozilla_voice_tts/bin/synthesize.py b/mozilla_voice_tts/bin/synthesize.py index 1e2886b8..8fa27bf0 100644 --- a/mozilla_voice_tts/bin/synthesize.py +++ b/mozilla_voice_tts/bin/synthesize.py @@ -158,7 +158,7 @@ if __name__ == "__main__": # check if gst_style string is a dict, if is dict convert else use string try: gst_style = json.loads(args.gst_style) - except: + except ValueError: gst_style = args.gst_style wav = tts(model, vocoder_model, args.text, C, args.use_cuda, ap, use_griffin_lim, args.speaker_fileid, speaker_embedding=speaker_embedding, gst_style=gst_style) diff --git a/mozilla_voice_tts/tts/models/tacotron.py b/mozilla_voice_tts/tts/models/tacotron.py index bcc4a2a6..3837e63c 100644 --- a/mozilla_voice_tts/tts/models/tacotron.py +++ b/mozilla_voice_tts/tts/models/tacotron.py @@ -40,7 +40,7 @@ class Tacotron(TacotronAbstract): forward_attn, trans_agent, forward_attn_mask, location_attn, attn_K, 
separate_stopnet, bidirectional_decoder, double_decoder_consistency, - ddc_r, gst) + ddc_r, gst, gst_embedding_dim, gst_num_heads, gst_style_tokens) # init layer dims decoder_in_features = 256 diff --git a/mozilla_voice_tts/tts/models/tacotron2.py b/mozilla_voice_tts/tts/models/tacotron2.py index 2839cea7..9aeeb3d2 100644 --- a/mozilla_voice_tts/tts/models/tacotron2.py +++ b/mozilla_voice_tts/tts/models/tacotron2.py @@ -45,7 +45,7 @@ class Tacotron2(TacotronAbstract): forward_attn, trans_agent, forward_attn_mask, location_attn, attn_K, separate_stopnet, bidirectional_decoder, double_decoder_consistency, - ddc_r, gst) + ddc_r, gst, gst_embedding_dim, gst_num_heads, gst_style_tokens) # init layer dims decoder_in_features = 512 From ec2de5c88d1012b6d8ddd5bf84b1d8e1aeeb9748 Mon Sep 17 00:00:00 2001 From: Edresson Date: Thu, 30 Jul 2020 22:27:26 -0300 Subject: [PATCH 35/56] added integrity test for GST dictionary in synthesis --- mozilla_voice_tts/bin/synthesize.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mozilla_voice_tts/bin/synthesize.py b/mozilla_voice_tts/bin/synthesize.py index 8fa27bf0..7d68aef3 100644 --- a/mozilla_voice_tts/bin/synthesize.py +++ b/mozilla_voice_tts/bin/synthesize.py @@ -158,6 +158,8 @@ if __name__ == "__main__": # check if gst_style string is a dict, if is dict convert else use string try: gst_style = json.loads(args.gst_style) + if max(map(int, gst_style.keys())) >= C.gst['gst_style_tokens']: + raise RuntimeError("The highest value of the gst_style dictionary key must be less than the number of GST Tokens, \n Highest dictionary key value: {} \n Number of GST tokens: {}".format(max(map(int, gst_style.keys())), C.gst['gst_style_tokens'])) except ValueError: gst_style = args.gst_style From d87c7227ddf16f5072cbde3da61b043a51756d67 Mon Sep 17 00:00:00 2001 From: Edresson Date: Thu, 30 Jul 2020 22:29:06 -0300 Subject: [PATCH 36/56] add Colab Notebook from TTS Multi-Speaker --- ...illa_TTS_MultiSpeaker_jia_et_al_2018.ipynb | 1758 +++++++++++++++++ 1 file changed, 1758 insertions(+) create mode 100755 notebooks/Demo_Mozilla_TTS_MultiSpeaker_jia_et_al_2018.ipynb diff --git a/notebooks/Demo_Mozilla_TTS_MultiSpeaker_jia_et_al_2018.ipynb b/notebooks/Demo_Mozilla_TTS_MultiSpeaker_jia_et_al_2018.ipynb new file mode 100755 index 00000000..3581b6ab --- /dev/null +++ b/notebooks/Demo_Mozilla_TTS_MultiSpeaker_jia_et_al_2018.ipynb @@ -0,0 +1,1758 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "Demo-Mozilla-TTS-MultiSpeaker-jia-et-al-2018.ipynb", + "provenance": [], + "collapsed_sections": [ + "vnV-FigfvsS2", + "hkvv7gRcx4WV", + "QJ6VgT2a4vHW" + ] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "yZK6UdwSFnOO", + "colab_type": "text" + }, + "source": [ + "# **Download and install Mozilla TTS**" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "yvb0pX3WY6MN", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 139 + }, + "outputId": "5c5016ac-4f92-4b5b-bcf3-473363b2846d" + }, + "source": [ + "import os \n", + "!git clone https://github.com/Edresson/TTS -b dev-gst-embeddings" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Cloning into 'TTS'...\n", + "remote: Enumerating objects: 15, done.\u001b[K\n", + "remote: Counting objects: 6% (1/15)\u001b[K\rremote: Counting objects: 13% (2/15)\u001b[K\rremote: Counting objects: 20% 
(3/15)\u001b[K\rremote: Counting objects: 26% (4/15)\u001b[K\rremote: Counting objects: 33% (5/15)\u001b[K\rremote: Counting objects: 40% (6/15)\u001b[K\rremote: Counting objects: 46% (7/15)\u001b[K\rremote: Counting objects: 53% (8/15)\u001b[K\rremote: Counting objects: 60% (9/15)\u001b[K\rremote: Counting objects: 66% (10/15)\u001b[K\rremote: Counting objects: 73% (11/15)\u001b[K\rremote: Counting objects: 80% (12/15)\u001b[K\rremote: Counting objects: 86% (13/15)\u001b[K\rremote: Counting objects: 93% (14/15)\u001b[K\rremote: Counting objects: 100% (15/15)\u001b[K\rremote: Counting objects: 100% (15/15), done.\u001b[K\n", + "remote: Compressing objects: 100% (14/14), done.\u001b[K\n", + "remote: Total 10370 (delta 1), reused 3 (delta 1), pack-reused 10355\u001b[K\n", + "Receiving objects: 100% (10370/10370), 120.77 MiB | 33.68 MiB/s, done.\n", + "Resolving deltas: 100% (6183/6183), done.\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "iB9nl2UEG3SY", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "outputId": "36bd08c3-6f29-4c03-9dc6-3cc3cc538e30" + }, + "source": [ + "!apt-get install espeak\n", + "os.chdir('TTS')\n", + "!pip install -r requirements.txt\n", + "!python setup.py develop\n", + "os.chdir('..')" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Reading package lists... Done\n", + "Building dependency tree \n", + "Reading state information... Done\n", + "The following package was automatically installed and is no longer required:\n", + " libnvidia-common-440\n", + "Use 'apt autoremove' to remove it.\n", + "The following additional packages will be installed:\n", + " espeak-data libespeak1 libportaudio2 libsonic0\n", + "The following NEW packages will be installed:\n", + " espeak espeak-data libespeak1 libportaudio2 libsonic0\n", + "0 upgraded, 5 newly installed, 0 to remove and 35 not upgraded.\n", + "Need to get 1,219 kB of archives.\n", + "After this operation, 3,031 kB of additional disk space will be used.\n", + "Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libportaudio2 amd64 19.6.0-1 [64.6 kB]\n", + "Get:2 http://archive.ubuntu.com/ubuntu bionic/main amd64 libsonic0 amd64 0.2.0-6 [13.4 kB]\n", + "Get:3 http://archive.ubuntu.com/ubuntu bionic/universe amd64 espeak-data amd64 1.48.04+dfsg-5 [934 kB]\n", + "Get:4 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libespeak1 amd64 1.48.04+dfsg-5 [145 kB]\n", + "Get:5 http://archive.ubuntu.com/ubuntu bionic/universe amd64 espeak amd64 1.48.04+dfsg-5 [61.6 kB]\n", + "Fetched 1,219 kB in 1s (820 kB/s)\n", + "Selecting previously unselected package libportaudio2:amd64.\n", + "(Reading database ... 
144465 files and directories currently installed.)\n",
+        " [... remaining apt-get / pip install and `python setup.py develop` log output stored in the notebook trimmed for brevity ...]\n",
+        "Using 
/usr/local/lib/python3.6/dist-packages\n", + "Searching for phonemizer==2.2.1\n", + "Best match: phonemizer 2.2.1\n", + "Adding phonemizer 2.2.1 to easy-install.pth file\n", + "Installing phonemize script to /usr/local/bin\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for SoundFile==0.10.3.post1\n", + "Best match: SoundFile 0.10.3.post1\n", + "Adding SoundFile 0.10.3.post1 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for bokeh==1.4.0\n", + "Best match: bokeh 1.4.0\n", + "Adding bokeh 1.4.0 to easy-install.pth file\n", + "Installing bokeh script to /usr/local/bin\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for pysbd==0.2.3\n", + "Best match: pysbd 0.2.3\n", + "Adding pysbd 0.2.3 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for inflect==2.1.0\n", + "Best match: inflect 2.1.0\n", + "Adding inflect 2.1.0 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for tqdm==4.41.1\n", + "Best match: tqdm 4.41.1\n", + "Adding tqdm 4.41.1 to easy-install.pth file\n", + "Installing tqdm script to /usr/local/bin\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for Flask==1.1.2\n", + "Best match: Flask 1.1.2\n", + "Adding Flask 1.1.2 to easy-install.pth file\n", + "Installing flask script to /usr/local/bin\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for Pillow==7.0.0\n", + "Best match: Pillow 7.0.0\n", + "Adding Pillow 7.0.0 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for matplotlib==3.2.2\n", + "Best match: matplotlib 3.2.2\n", + "Adding matplotlib 3.2.2 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for tensorboardX==2.1\n", + "Best match: tensorboardX 2.1\n", + "Adding tensorboardX 2.1 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for attrdict==2.0.1\n", + "Best match: attrdict 2.0.1\n", + "Adding attrdict 2.0.1 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for Unidecode==0.4.20\n", + "Best match: Unidecode 0.4.20\n", + "Adding Unidecode 0.4.20 to easy-install.pth file\n", + "Installing unidecode script to /usr/local/bin\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for librosa==0.7.2\n", + "Best match: librosa 0.7.2\n", + "Adding librosa 0.7.2 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for scipy==1.4.1\n", + "Best match: scipy 1.4.1\n", + "Adding scipy 1.4.1 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for numba==0.48.0\n", + "Best match: numba 0.48.0\n", + "Adding numba 0.48.0 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for numpy==1.18.5\n", + "Best match: numpy 1.18.5\n", + "Adding numpy 1.18.5 to easy-install.pth file\n", + "Installing f2py script to /usr/local/bin\n", + "Installing f2py3 script to /usr/local/bin\n", + "Installing f2py3.6 script to /usr/local/bin\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for torch==1.5.1+cu101\n", + "Best match: torch 1.5.1+cu101\n", + "Adding torch 1.5.1+cu101 to easy-install.pth file\n", + "Installing convert-caffe2-to-onnx 
script to /usr/local/bin\n", + "Installing convert-onnx-to-caffe2 script to /usr/local/bin\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for requests==2.23.0\n", + "Best match: requests 2.23.0\n", + "Adding requests 2.23.0 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for six==1.15.0\n", + "Best match: six 1.15.0\n", + "Adding six 1.15.0 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for toml==0.10.1\n", + "Best match: toml 0.10.1\n", + "Adding toml 0.10.1 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for isort==4.3.21\n", + "Best match: isort 4.3.21\n", + "Adding isort 4.3.21 to easy-install.pth file\n", + "Installing isort script to /usr/local/bin\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for mccabe==0.6.1\n", + "Best match: mccabe 0.6.1\n", + "Adding mccabe 0.6.1 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for astroid==2.4.2\n", + "Best match: astroid 2.4.2\n", + "Adding astroid 2.4.2 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for PyYAML==3.13\n", + "Best match: PyYAML 3.13\n", + "Adding PyYAML 3.13 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for joblib==0.16.0\n", + "Best match: joblib 0.16.0\n", + "Adding joblib 0.16.0 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for attrs==19.3.0\n", + "Best match: attrs 19.3.0\n", + "Adding attrs 19.3.0 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for segments==2.1.3\n", + "Best match: segments 2.1.3\n", + "Adding segments 2.1.3 to easy-install.pth file\n", + "Installing segments script to /usr/local/bin\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for cffi==1.14.0\n", + "Best match: cffi 1.14.0\n", + "Adding cffi 1.14.0 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for packaging==20.4\n", + "Best match: packaging 20.4\n", + "Adding packaging 20.4 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for tornado==5.1.1\n", + "Best match: tornado 5.1.1\n", + "Adding tornado 5.1.1 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for Jinja2==2.11.2\n", + "Best match: Jinja2 2.11.2\n", + "Adding Jinja2 2.11.2 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for python-dateutil==2.8.1\n", + "Best match: python-dateutil 2.8.1\n", + "Adding python-dateutil 2.8.1 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for click==7.1.2\n", + "Best match: click 7.1.2\n", + "Adding click 7.1.2 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for itsdangerous==1.1.0\n", + "Best match: itsdangerous 1.1.0\n", + "Adding itsdangerous 1.1.0 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for Werkzeug==1.0.1\n", + "Best match: Werkzeug 1.0.1\n", + "Adding Werkzeug 1.0.1 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + 
"Searching for cycler==0.10.0\n", + "Best match: cycler 0.10.0\n", + "Adding cycler 0.10.0 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for kiwisolver==1.2.0\n", + "Best match: kiwisolver 1.2.0\n", + "Adding kiwisolver 1.2.0 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for pyparsing==2.4.7\n", + "Best match: pyparsing 2.4.7\n", + "Adding pyparsing 2.4.7 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for protobuf==3.12.2\n", + "Best match: protobuf 3.12.2\n", + "Adding protobuf 3.12.2 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for scikit-learn==0.22.2.post1\n", + "Best match: scikit-learn 0.22.2.post1\n", + "Adding scikit-learn 0.22.2.post1 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for audioread==2.1.8\n", + "Best match: audioread 2.1.8\n", + "Adding audioread 2.1.8 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for resampy==0.2.2\n", + "Best match: resampy 0.2.2\n", + "Adding resampy 0.2.2 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for decorator==4.4.2\n", + "Best match: decorator 4.4.2\n", + "Adding decorator 4.4.2 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for llvmlite==0.31.0\n", + "Best match: llvmlite 0.31.0\n", + "Adding llvmlite 0.31.0 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for setuptools==49.1.0\n", + "Best match: setuptools 49.1.0\n", + "Adding setuptools 49.1.0 to easy-install.pth file\n", + "Installing easy_install script to /usr/local/bin\n", + "Installing easy_install-3.8 script to /usr/local/bin\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for future==0.16.0\n", + "Best match: future 0.16.0\n", + "Adding future 0.16.0 to easy-install.pth file\n", + "Installing futurize script to /usr/local/bin\n", + "Installing pasteurize script to /usr/local/bin\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for chardet==3.0.4\n", + "Best match: chardet 3.0.4\n", + "Adding chardet 3.0.4 to easy-install.pth file\n", + "Installing chardetect script to /usr/local/bin\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for urllib3==1.24.3\n", + "Best match: urllib3 1.24.3\n", + "Adding urllib3 1.24.3 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for certifi==2020.6.20\n", + "Best match: certifi 2020.6.20\n", + "Adding certifi 2020.6.20 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for idna==2.10\n", + "Best match: idna 2.10\n", + "Adding idna 2.10 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for wrapt==1.12.1\n", + "Best match: wrapt 1.12.1\n", + "Adding wrapt 1.12.1 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for typed-ast==1.4.1\n", + "Best match: typed-ast 1.4.1\n", + "Adding typed-ast 1.4.1 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for lazy-object-proxy==1.4.3\n", + "Best match: lazy-object-proxy 1.4.3\n", + 
"Adding lazy-object-proxy 1.4.3 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for csvw==1.7.0\n", + "Best match: csvw 1.7.0\n", + "Adding csvw 1.7.0 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for clldutils==3.5.2\n", + "Best match: clldutils 3.5.2\n", + "Adding clldutils 3.5.2 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for regex==2019.12.20\n", + "Best match: regex 2019.12.20\n", + "Adding regex 2019.12.20 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for pycparser==2.20\n", + "Best match: pycparser 2.20\n", + "Adding pycparser 2.20 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for MarkupSafe==1.1.1\n", + "Best match: MarkupSafe 1.1.1\n", + "Adding MarkupSafe 1.1.1 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for isodate==0.6.0\n", + "Best match: isodate 0.6.0\n", + "Adding isodate 0.6.0 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for rfc3986==1.4.0\n", + "Best match: rfc3986 1.4.0\n", + "Adding rfc3986 1.4.0 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for uritemplate==3.0.1\n", + "Best match: uritemplate 3.0.1\n", + "Adding uritemplate 3.0.1 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for tabulate==0.8.7\n", + "Best match: tabulate 0.8.7\n", + "Adding tabulate 0.8.7 to easy-install.pth file\n", + "Installing tabulate script to /usr/local/bin\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Searching for colorlog==4.2.1\n", + "Best match: colorlog 4.2.1\n", + "Adding colorlog 4.2.1 to easy-install.pth file\n", + "\n", + "Using /usr/local/lib/python3.6/dist-packages\n", + "Finished processing dependencies for TTS==0.0.4+f0284e8\n", + "Collecting tensorflow==2.3.0rc0\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/8b/68/7c6c8e2b65ad4a3ff5ef658c04a6c2802ff7fe55fc7eecacb6efee1abc40/tensorflow-2.3.0rc0-cp36-cp36m-manylinux2010_x86_64.whl (320.3MB)\n", + "\u001b[K |████████████████████████████████| 320.3MB 53kB/s \n", + "\u001b[?25hRequirement already satisfied: scipy==1.4.1 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.3.0rc0) (1.4.1)\n", + "Requirement already satisfied: protobuf>=3.9.2 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.3.0rc0) (3.12.2)\n", + "Requirement already satisfied: gast==0.3.3 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.3.0rc0) (0.3.3)\n", + "Collecting tf-estimator-nightly<2.3.0.dev2020062302,>=2.3.0.dev2020062301\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/17/3b/fb9aafd734da258411bff2a600cabff65c7d201782318791b72422bd973d/tf_estimator_nightly-2.3.0.dev2020062301-py2.py3-none-any.whl (459kB)\n", + "\u001b[K |████████████████████████████████| 460kB 58.0MB/s \n", + "\u001b[?25hRequirement already satisfied: h5py<2.11.0,>=2.10.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.3.0rc0) (2.10.0)\n", + "Requirement already satisfied: six>=1.12.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.3.0rc0) (1.15.0)\n", + "Requirement already satisfied: numpy<1.19.0,>=1.16.0 in /usr/local/lib/python3.6/dist-packages (from 
tensorflow==2.3.0rc0) (1.18.5)\n", + "Requirement already satisfied: opt-einsum>=2.3.2 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.3.0rc0) (3.3.0)\n", + "Requirement already satisfied: wheel>=0.26 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.3.0rc0) (0.34.2)\n", + "Requirement already satisfied: tensorboard<2.3.0,>=2.2.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.3.0rc0) (2.2.2)\n", + "Requirement already satisfied: termcolor>=1.1.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.3.0rc0) (1.1.0)\n", + "Requirement already satisfied: grpcio>=1.8.6 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.3.0rc0) (1.30.0)\n", + "Requirement already satisfied: astunparse==1.6.3 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.3.0rc0) (1.6.3)\n", + "Requirement already satisfied: keras-preprocessing<1.2,>=1.1.1 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.3.0rc0) (1.1.2)\n", + "Requirement already satisfied: wrapt>=1.11.1 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.3.0rc0) (1.12.1)\n", + "Requirement already satisfied: google-pasta>=0.1.8 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.3.0rc0) (0.2.0)\n", + "Requirement already satisfied: absl-py>=0.7.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow==2.3.0rc0) (0.9.0)\n", + "Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from protobuf>=3.9.2->tensorflow==2.3.0rc0) (49.1.0)\n", + "Requirement already satisfied: google-auth-oauthlib<0.5,>=0.4.1 in /usr/local/lib/python3.6/dist-packages (from tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (0.4.1)\n", + "Requirement already satisfied: tensorboard-plugin-wit>=1.6.0 in /usr/local/lib/python3.6/dist-packages (from tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (1.7.0)\n", + "Requirement already satisfied: requests<3,>=2.21.0 in /usr/local/lib/python3.6/dist-packages (from tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (2.23.0)\n", + "Requirement already satisfied: google-auth<2,>=1.6.3 in /usr/local/lib/python3.6/dist-packages (from tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (1.17.2)\n", + "Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.6/dist-packages (from tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (3.2.2)\n", + "Requirement already satisfied: werkzeug>=0.11.15 in /usr/local/lib/python3.6/dist-packages (from tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (1.0.1)\n", + "Requirement already satisfied: requests-oauthlib>=0.7.0 in /usr/local/lib/python3.6/dist-packages (from google-auth-oauthlib<0.5,>=0.4.1->tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (1.3.0)\n", + "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests<3,>=2.21.0->tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (2.10)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests<3,>=2.21.0->tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (2020.6.20)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests<3,>=2.21.0->tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (1.24.3)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests<3,>=2.21.0->tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (3.0.4)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in 
/usr/local/lib/python3.6/dist-packages (from google-auth<2,>=1.6.3->tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (0.2.8)\n", + "Requirement already satisfied: rsa<5,>=3.1.4; python_version >= \"3\" in /usr/local/lib/python3.6/dist-packages (from google-auth<2,>=1.6.3->tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (4.6)\n", + "Requirement already satisfied: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from google-auth<2,>=1.6.3->tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (4.1.1)\n", + "Requirement already satisfied: importlib-metadata; python_version < \"3.8\" in /usr/local/lib/python3.6/dist-packages (from markdown>=2.6.8->tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (1.7.0)\n", + "Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<0.5,>=0.4.1->tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (3.1.0)\n", + "Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /usr/local/lib/python3.6/dist-packages (from pyasn1-modules>=0.2.1->google-auth<2,>=1.6.3->tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (0.4.8)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata; python_version < \"3.8\"->markdown>=2.6.8->tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (3.1.0)\n", + "Installing collected packages: tf-estimator-nightly, tensorflow\n", + " Found existing installation: tensorflow 2.2.0\n", + " Uninstalling tensorflow-2.2.0:\n", + " Successfully uninstalled tensorflow-2.2.0\n", + "Successfully installed tensorflow-2.3.0rc0 tf-estimator-nightly-2.3.0.dev2020062301\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "w6Krn8k1inC_", + "colab_type": "text" + }, + "source": [ + "\n", + "\n", + "**Download Checkpoint**\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "PiYHf3lKhi9z", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 104 + }, + "outputId": "75f7ecd0-3ee2-4ed2-8150-49f71a5ef4d5" + }, + "source": [ + "!wget -c -q --show-progress -O ./TTS-checkpoint.zip https://github.com/Edresson/TTS/releases/download/v1.0.0/Checkpoints-TTS-MultiSpeaker-Jia-et-al-2018.zip\n", + "!unzip ./TTS-checkpoint.zip\n" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "./TTS-checkpoint.zi 100%[===================>] 367.68M 31.9MB/s in 12s \n", + "Archive: ./TTS-checkpoint.zip\n", + " inflating: best_model.pth.tar \n", + " inflating: speakers.json \n", + " inflating: config.json \n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MpYNgqrZcJKn", + "colab_type": "text" + }, + "source": [ + "**Utils Functions**" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "4KZA4b_CbMqx", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 52 + }, + "outputId": "c5d33fac-4682-4e72-8fc2-c1aa83408ef4" + }, + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "import argparse\n", + "import json\n", + "# pylint: disable=redefined-outer-name, unused-argument\n", + "import os\n", + "import string\n", + "import time\n", + "import sys\n", + "import numpy as np\n", + "\n", + "TTS_PATH = \"../content/TTS\"\n", + "# add libraries into environment\n", + "sys.path.append(TTS_PATH) # set this if TTS is not installed globally\n", + "\n", + "import torch\n", + "\n", + "from TTS.tts.utils.generic_utils 
import setup_model\n", + "from TTS.tts.utils.synthesis import synthesis\n", + "from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols\n", + "from TTS.utils.audio import AudioProcessor\n", + "from TTS.utils.io import load_config\n", + "from TTS.vocoder.utils.generic_utils import setup_generator\n", + "\n", + "\n", + "def tts(model, vocoder_model, text, CONFIG, use_cuda, ap, use_gl, speaker_fileid, speaker_embedding=None):\n", + " t_1 = time.time()\n", + " waveform, _, _, mel_postnet_spec, _, _ = synthesis(model, text, CONFIG, use_cuda, ap, speaker_fileid, None, False, CONFIG.enable_eos_bos_chars, use_gl, speaker_embedding=speaker_embedding)\n", + " if CONFIG.model == \"Tacotron\" and not use_gl:\n", + " mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n", + " if not use_gl:\n", + " waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n", + " if use_cuda and not use_gl:\n", + " waveform = waveform.cpu()\n", + " if not use_gl:\n", + " waveform = waveform.numpy()\n", + " waveform = waveform.squeeze()\n", + " rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n", + " tps = (time.time() - t_1) / len(waveform)\n", + " print(\" > Run-time: {}\".format(time.time() - t_1))\n", + " print(\" > Real-time factor: {}\".format(rtf))\n", + " print(\" > Time per step: {}\".format(tps))\n", + " return waveform\n", + "\n" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ENA2OumIVeMA", + "colab_type": "text" + }, + "source": [ + "# **Vars definitions**\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "jPD0d_XpVXmY", + "colab_type": "code", + "colab": {} + }, + "source": [ + "TEXT = ''\n", + "OUT_PATH = 'tests-audios/'\n", + "# create output path\n", + "os.makedirs(OUT_PATH, exist_ok=True)\n", + "\n", + "SPEAKER_FILEID = None # if None use the first embedding from speakers.json\n", + "\n", + "# model vars \n", + "MODEL_PATH = 'best_model.pth.tar'\n", + "CONFIG_PATH = 'config.json'\n", + "SPEAKER_JSON = 'speakers.json'\n", + "\n", + "# vocoder vars\n", + "VOCODER_PATH = ''\n", + "VOCODER_CONFIG_PATH = ''\n", + "\n", + "USE_CUDA = True" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dV6cXXlfi72r", + "colab_type": "text" + }, + "source": [ + "# **Restore TTS Model**" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "x1WgLFauWUPe", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 486 + }, + "outputId": "7cc2d0bd-f3c8-41dd-9f17-438f77c1f2d2" + }, + "source": [ + "# load the config\n", + "C = load_config(CONFIG_PATH)\n", + "C.forward_attn_mask = True\n", + "\n", + "# load the audio processor\n", + "ap = AudioProcessor(**C.audio)\n", + "\n", + "# if the vocabulary was passed, replace the default\n", + "if 'characters' in C.keys():\n", + " symbols, phonemes = make_symbols(**C.characters)\n", + "\n", + "speaker_embedding = None\n", + "speaker_embedding_dim = None\n", + "num_speakers = 0\n", + "# load speakers\n", + "if SPEAKER_JSON != '':\n", + " speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n", + " num_speakers = len(speaker_mapping)\n", + " if C.use_external_speaker_embedding_file:\n", + " if SPEAKER_FILEID is not None:\n", + " speaker_embedding = 
speaker_mapping[SPEAKER_FILEID]['embedding']\n", + " else: # if speaker_fileid is not specificated use the first sample in speakers.json\n", + " choise_speaker = list(speaker_mapping.keys())[0]\n", + " print(\" Speaker: \",choise_speaker.split('_')[0],'was chosen automatically', \"(this speaker seen in training)\")\n", + " speaker_embedding = speaker_mapping[choise_speaker]['embedding']\n", + " speaker_embedding_dim = len(speaker_embedding)\n", + "\n", + "# load the model\n", + "num_chars = len(phonemes) if C.use_phonemes else len(symbols)\n", + "model = setup_model(num_chars, num_speakers, C, speaker_embedding_dim)\n", + "cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))\n", + "model.load_state_dict(cp['model'])\n", + "model.eval()\n", + "\n", + "if USE_CUDA:\n", + " model.cuda()\n", + "\n", + "model.decoder.set_r(cp['r'])\n", + "\n", + "# load vocoder model\n", + "if VOCODER_PATH!= \"\":\n", + " VC = load_config(VOCODER_CONFIG_PATH)\n", + " vocoder_model = setup_generator(VC)\n", + " vocoder_model.load_state_dict(torch.load(VOCODER_PATH, map_location=\"cpu\")[\"model\"])\n", + " vocoder_model.remove_weight_norm()\n", + " if USE_CUDA:\n", + " vocoder_model.cuda()\n", + " vocoder_model.eval()\n", + "else:\n", + " vocoder_model = None\n", + " VC = None\n", + "\n", + "# synthesize voice\n", + "use_griffin_lim = VOCODER_PATH== \"\"\n", + "\n", + "if not C.use_external_speaker_embedding_file:\n", + " if SPEAKER_FILEID.isdigit():\n", + " SPEAKER_FILEID = int(SPEAKER_FILEID)\n", + " else:\n", + " SPEAKER_FILEID = None\n", + "else:\n", + " SPEAKER_FILEID = None\n" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + " > Setting up Audio Processor...\n", + " | > sample_rate:22050\n", + " | > num_mels:80\n", + " | > min_level_db:-100\n", + " | > frame_shift_ms:None\n", + " | > frame_length_ms:None\n", + " | > ref_level_db:20\n", + " | > fft_size:1024\n", + " | > power:1.5\n", + " | > preemphasis:0.98\n", + " | > griffin_lim_iters:60\n", + " | > signal_norm:True\n", + " | > symmetric_norm:True\n", + " | > mel_fmin:0\n", + " | > mel_fmax:8000.0\n", + " | > spec_gain:20.0\n", + " | > stft_pad_mode:constant\n", + " | > max_norm:4.0\n", + " | > clip_norm:True\n", + " | > do_trim_silence:False\n", + " | > trim_db:60\n", + " | > do_sound_norm:False\n", + " | > stats_path:None\n", + " | > hop_length:256\n", + " | > win_length:1024\n", + " Speaker: p262 was chosen automatically (this speaker seen in training)\n", + " > Using model: Tacotron2\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tNvVEoE30qY6", + "colab_type": "text" + }, + "source": [ + "Synthesize sentence with Speaker\n", + "\n", + "> Stop running the cell to leave!\n", + "\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "2o8fXkVSyXOa", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 704 + }, + "outputId": "b093ab6c-57a9-4595-8ff6-edee226cb0fd" + }, + "source": [ + "import IPython\n", + "from IPython.display import Audio\n", + "print(\"Synthesize sentence with Speaker: \",choise_speaker.split('_')[0], \"(this speaker seen in training)\")\n", + "while True:\n", + " TEXT = input(\"Enter sentence: \")\n", + " print(\" > Text: {}\".format(TEXT))\n", + " wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding)\n", + " IPython.display.display(Audio(wav, rate=ap.sample_rate))\n", + " # save the results\n", + " file_name = 
TEXT.replace(\" \", \"_\")\n", + " file_name = file_name.translate(\n", + " str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n", + " out_path = os.path.join(OUT_PATH, file_name)\n", + " print(\" > Saving output to {}\".format(out_path))\n", + " ap.save_wav(wav, out_path)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Synthesize sentence with Speaker: p262 (this speaker seen in training)\n", + "Enter sentence: teste this demonstration :)\n", + " > Text: teste this demonstration :)\n", + " > Run-time: 2.143589496612549\n", + " > Real-time factor: 1.6784820231524382\n", + " > Time per step: 7.61217437684536e-05\n" + ], + "name": "stdout" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "stream", + "text": [ + " > Saving output to tests-audios/teste_this_demonstration_.wav\n" + ], + "name": "stdout" + }, + { + "output_type": "error", + "ename": "KeyboardInterrupt", + "evalue": "ignored", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py\u001b[0m in \u001b[0;36m_input_request\u001b[0;34m(self, prompt, ident, parent, password)\u001b[0m\n\u001b[1;32m 728\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 729\u001b[0;31m \u001b[0mident\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreply\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msession\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstdin_socket\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 730\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/jupyter_client/session.py\u001b[0m in \u001b[0;36mrecv\u001b[0;34m(self, socket, mode, content, copy)\u001b[0m\n\u001b[1;32m 802\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 803\u001b[0;31m \u001b[0mmsg_list\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msocket\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv_multipart\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 804\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mzmq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mZMQError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/zmq/sugar/socket.py\u001b[0m in \u001b[0;36mrecv_multipart\u001b[0;34m(self, flags, copy, track)\u001b[0m\n\u001b[1;32m 474\u001b[0m \"\"\"\n\u001b[0;32m--> 475\u001b[0;31m \u001b[0mparts\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mflags\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrack\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtrack\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 476\u001b[0m \u001b[0;31m# have first part already, only loop while more to receive\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32mzmq/backend/cython/socket.pyx\u001b[0m in \u001b[0;36mzmq.backend.cython.socket.Socket.recv\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32mzmq/backend/cython/socket.pyx\u001b[0m in \u001b[0;36mzmq.backend.cython.socket.Socket.recv\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32mzmq/backend/cython/socket.pyx\u001b[0m in \u001b[0;36mzmq.backend.cython.socket._recv_copy\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/zmq/backend/cython/checkrc.pxd\u001b[0m in \u001b[0;36mzmq.backend.cython.checkrc._check_rc\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: ", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Synthesize sentence with Speaker: \"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mchoise_speaker\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'_'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"(this speaker seen in training)\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mTEXT\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Enter sentence: \"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\" > Text: {}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mTEXT\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mwav\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvocoder_model\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mTEXT\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mC\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mUSE_CUDA\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0map\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_griffin_lim\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mSPEAKER_FILEID\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspeaker_embedding\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mspeaker_embedding\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py\u001b[0m in \u001b[0;36mraw_input\u001b[0;34m(self, prompt)\u001b[0m\n\u001b[1;32m 702\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_parent_ident\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 703\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_parent_header\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 704\u001b[0;31m \u001b[0mpassword\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 705\u001b[0m )\n\u001b[1;32m 706\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py\u001b[0m in \u001b[0;36m_input_request\u001b[0;34m(self, prompt, ident, parent, password)\u001b[0m\n\u001b[1;32m 732\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyboardInterrupt\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 733\u001b[0m \u001b[0;31m# re-raise KeyboardInterrupt, to truncate traceback\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 734\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mKeyboardInterrupt\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 735\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 736\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vnV-FigfvsS2", + "colab_type": "text" + }, + "source": [ + "# **Select Speaker**\n", + "\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "RuCGOnJ_fgDV", + "colab_type": "code", + "colab": {} + }, + "source": [ + "\n", + "# VCTK speakers not seen in training (new speakers)\n", + "VCTK_test_Speakers = [\"p225\", \"p234\", \"p238\", \"p245\", \"p248\", \"p261\", \"p294\", \"p302\", \"p326\", \"p335\", \"p347\"]\n", + "\n", + "# VCTK speakers seen in training\n", + "VCTK_train_Speakers = ['p244', 'p300', 'p303', 'p273', 'p292', 'p252', 'p254', 'p269', 'p345', 'p274', 'p363', 'p285', 'p351', 'p361', 'p295', 'p266', 'p307', 'p230', 'p339', 'p253', 'p310', 'p241', 'p256', 'p323', 'p237', 'p229', 'p298', 'p336', 'p276', 'p305', 'p255', 'p278', 'p299', 'p265', 'p267', 'p280', 'p260', 'p272', 'p262', 'p334', 'p283', 'p247', 'p246', 'p374', 'p297', 'p249', 'p250', 'p304', 'p240', 'p236', 'p312', 'p286', 'p263', 'p258', 'p313', 'p376', 'p279', 'p340', 'p362', 'p284', 'p231', 'p308', 'p277', 'p275', 'p333', 'p314', 'p330', 'p264', 'p226', 'p288', 'p343', 'p239', 'p232', 'p268', 'p270', 'p329', 'p227', 'p271', 'p228', 'p311', 'p301', 'p293', 'p364', 'p251', 'p317', 'p360', 'p281', 'p243', 'p287', 'p233', 'p259', 'p316', 'p257', 'p282', 'p306', 'p341', 'p318']\n", + "\n", + "\n", + "num_samples_speaker = 2 # In theory the more samples of the speaker the more similar to the real voice it will be!\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hkvv7gRcx4WV", + "colab_type": "text" + }, + "source": [ + "## **Example select a VCTK seen speaker in training**" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "BviNMI9UyCYz", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# get embedding\n", + "Speaker_choise = VCTK_train_Speakers[0] # choise one of training speakers\n", + "# load speakers\n", + "if SPEAKER_JSON 
!= '':\n", + " speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n", + " if C.use_external_speaker_embedding_file:\n", + " speaker_embeddings = []\n", + " for key in list(speaker_mapping.keys()):\n", + " if Speaker_choise in key:\n", + " if len(speaker_embeddings) < num_samples_speaker:\n", + " speaker_embeddings.append(speaker_mapping[key]['embedding'])\n", + " # takes the average of the embedings samples of the announcers\n", + " speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()\n", + " " + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "5e5_XnLsx3jg", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 704 + }, + "outputId": "3c82a3cd-f4b9-4493-aa4c-275c34d3294d" + }, + "source": [ + "import IPython\n", + "from IPython.display import Audio\n", + "print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker seen in training)\")\n", + "while True:\n", + " TEXT = input(\"Enter sentence: \")\n", + " print(\" > Text: {}\".format(TEXT))\n", + " wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding)\n", + " IPython.display.display(Audio(wav, rate=ap.sample_rate))\n", + " # save the results\n", + " file_name = TEXT.replace(\" \", \"_\")\n", + " file_name = file_name.translate(\n", + " str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n", + " out_path = os.path.join(OUT_PATH, file_name)\n", + " print(\" > Saving output to {}\".format(out_path))\n", + " ap.save_wav(wav, out_path)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Synthesize sentence with Speaker: p244 (this speaker seen in training)\n", + "Enter sentence: Test this demonstration \n", + " > Text: Test this demonstration \n", + " > Run-time: 1.06947922706604\n", + " > Real-time factor: 0.8609054588373298\n", + " > Time per step: 3.904344461788641e-05\n" + ], + "name": "stdout" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "stream", + "text": [ + " > Saving output to tests-audios/Test_this_demonstration_.wav\n" + ], + "name": "stdout" + }, + { + "output_type": "error", + "ename": "KeyboardInterrupt", + "evalue": "ignored", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py\u001b[0m in \u001b[0;36m_input_request\u001b[0;34m(self, prompt, ident, parent, password)\u001b[0m\n\u001b[1;32m 728\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 729\u001b[0;31m \u001b[0mident\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreply\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msession\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstdin_socket\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 730\u001b[0m \u001b[0;32mexcept\u001b[0m 
\u001b[0mException\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/jupyter_client/session.py\u001b[0m in \u001b[0;36mrecv\u001b[0;34m(self, socket, mode, content, copy)\u001b[0m\n\u001b[1;32m 802\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 803\u001b[0;31m \u001b[0mmsg_list\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msocket\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv_multipart\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 804\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mzmq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mZMQError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/zmq/sugar/socket.py\u001b[0m in \u001b[0;36mrecv_multipart\u001b[0;34m(self, flags, copy, track)\u001b[0m\n\u001b[1;32m 474\u001b[0m \"\"\"\n\u001b[0;32m--> 475\u001b[0;31m \u001b[0mparts\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mflags\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrack\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtrack\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 476\u001b[0m \u001b[0;31m# have first part already, only loop while more to receive\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32mzmq/backend/cython/socket.pyx\u001b[0m in \u001b[0;36mzmq.backend.cython.socket.Socket.recv\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32mzmq/backend/cython/socket.pyx\u001b[0m in \u001b[0;36mzmq.backend.cython.socket.Socket.recv\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32mzmq/backend/cython/socket.pyx\u001b[0m in \u001b[0;36mzmq.backend.cython.socket._recv_copy\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/zmq/backend/cython/checkrc.pxd\u001b[0m in \u001b[0;36mzmq.backend.cython.checkrc._check_rc\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: ", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Synthesize sentence with Speaker: \"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mSpeaker_choise\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'_'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"(this speaker seen in training)\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mTEXT\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Enter sentence: 
\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\" > Text: {}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mTEXT\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mwav\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvocoder_model\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mTEXT\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mC\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mUSE_CUDA\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0map\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_griffin_lim\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mSPEAKER_FILEID\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspeaker_embedding\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mspeaker_embedding\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py\u001b[0m in \u001b[0;36mraw_input\u001b[0;34m(self, prompt)\u001b[0m\n\u001b[1;32m 702\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_parent_ident\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 703\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_parent_header\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 704\u001b[0;31m \u001b[0mpassword\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 705\u001b[0m )\n\u001b[1;32m 706\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py\u001b[0m in \u001b[0;36m_input_request\u001b[0;34m(self, prompt, ident, parent, password)\u001b[0m\n\u001b[1;32m 732\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyboardInterrupt\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 733\u001b[0m \u001b[0;31m# re-raise KeyboardInterrupt, to truncate traceback\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 734\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mKeyboardInterrupt\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 735\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 736\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "QJ6VgT2a4vHW" + }, + "source": [ + "## **Example select a VCTK not seen speaker in training (new Speakers)**\n", + "\n", + "\n", + "> Fitting new Speakers :)\n", + "\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab_type": "code", + "id": "SZS57ZK-4vHa", + "colab": {} + }, + "source": [ + "# get embedding\n", + "Speaker_choise = VCTK_test_Speakers[0] # choise one of training speakers\n", + "# load speakers\n", + "if SPEAKER_JSON != '':\n", + " speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n", + " if C.use_external_speaker_embedding_file:\n", + " speaker_embeddings = []\n", + " for key in list(speaker_mapping.keys()):\n", + " if 
Speaker_choise in key:\n", + " if len(speaker_embeddings) < num_samples_speaker:\n", + " speaker_embeddings.append(speaker_mapping[key]['embedding'])\n", + " # takes the average of the embedings samples of the announcers\n", + " speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()\n", + " " + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab_type": "code", + "id": "bbs85vzz4vHo", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 704 + }, + "outputId": "3c82a3cd-f4b9-4493-aa4c-275c34d3294d" + }, + "source": [ + "import IPython\n", + "from IPython.display import Audio\n", + "print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker not seen in training (new speaker))\")\n", + "while True:\n", + " TEXT = input(\"Enter sentence: \")\n", + " print(\" > Text: {}\".format(TEXT))\n", + " wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding)\n", + " IPython.display.display(Audio(wav, rate=ap.sample_rate))\n", + " # save the results\n", + " file_name = TEXT.replace(\" \", \"_\")\n", + " file_name = file_name.translate(\n", + " str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n", + " out_path = os.path.join(OUT_PATH, file_name)\n", + " print(\" > Saving output to {}\".format(out_path))\n", + " ap.save_wav(wav, out_path)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Synthesize sentence with Speaker: p244 (this speaker seen in training)\n", + "Enter sentence: Test this demonstration \n", + " > Text: Test this demonstration \n", + " > Run-time: 1.06947922706604\n", + " > Real-time factor: 0.8609054588373298\n", + " > Time per step: 3.904344461788641e-05\n" + ], + "name": "stdout" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "stream", + "text": [ + " > Saving output to tests-audios/Test_this_demonstration_.wav\n" + ], + "name": "stdout" + }, + { + "output_type": "error", + "ename": "KeyboardInterrupt", + "evalue": "ignored", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py\u001b[0m in \u001b[0;36m_input_request\u001b[0;34m(self, prompt, ident, parent, password)\u001b[0m\n\u001b[1;32m 728\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 729\u001b[0;31m \u001b[0mident\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreply\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msession\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstdin_socket\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 730\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/jupyter_client/session.py\u001b[0m in \u001b[0;36mrecv\u001b[0;34m(self, socket, mode, content, copy)\u001b[0m\n\u001b[1;32m 
802\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 803\u001b[0;31m \u001b[0mmsg_list\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msocket\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv_multipart\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 804\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mzmq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mZMQError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/zmq/sugar/socket.py\u001b[0m in \u001b[0;36mrecv_multipart\u001b[0;34m(self, flags, copy, track)\u001b[0m\n\u001b[1;32m 474\u001b[0m \"\"\"\n\u001b[0;32m--> 475\u001b[0;31m \u001b[0mparts\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mflags\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrack\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtrack\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 476\u001b[0m \u001b[0;31m# have first part already, only loop while more to receive\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32mzmq/backend/cython/socket.pyx\u001b[0m in \u001b[0;36mzmq.backend.cython.socket.Socket.recv\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32mzmq/backend/cython/socket.pyx\u001b[0m in \u001b[0;36mzmq.backend.cython.socket.Socket.recv\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32mzmq/backend/cython/socket.pyx\u001b[0m in \u001b[0;36mzmq.backend.cython.socket._recv_copy\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/zmq/backend/cython/checkrc.pxd\u001b[0m in \u001b[0;36mzmq.backend.cython.checkrc._check_rc\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: ", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Synthesize sentence with Speaker: \"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mSpeaker_choise\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'_'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"(this speaker seen in training)\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mTEXT\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Enter sentence: \"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\" > Text: 
{}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mTEXT\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mwav\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvocoder_model\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mTEXT\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mC\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mUSE_CUDA\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0map\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_griffin_lim\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mSPEAKER_FILEID\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspeaker_embedding\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mspeaker_embedding\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py\u001b[0m in \u001b[0;36mraw_input\u001b[0;34m(self, prompt)\u001b[0m\n\u001b[1;32m 702\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_parent_ident\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 703\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_parent_header\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 704\u001b[0;31m \u001b[0mpassword\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 705\u001b[0m )\n\u001b[1;32m 706\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py\u001b[0m in \u001b[0;36m_input_request\u001b[0;34m(self, prompt, ident, parent, password)\u001b[0m\n\u001b[1;32m 732\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyboardInterrupt\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 733\u001b[0m \u001b[0;31m# re-raise KeyboardInterrupt, to truncate traceback\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 734\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mKeyboardInterrupt\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 735\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 736\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "LEE6mQLh5Who" + }, + "source": [ + "# **Example Synthesizing with your own voice :)**\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "La70gSB65nrs", + "colab_type": "text" + }, + "source": [ + " Download and load GE2E Speaker Encoder " + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "r0IEFZ0B5vQg", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 121 + }, + "outputId": "93ee404d-3fdc-48ac-f867-9ef7f7ecaa0c" + }, + "source": [ + "!wget -c -q --show-progress -O ./SpeakerEncoder-checkpoint.zip https://github.com/Edresson/TTS/releases/download/v1.0.0/GE2E-SpeakerEncoder-iter25k.zip\n", + "!unzip ./SpeakerEncoder-checkpoint.zip" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "./SpeakerEncoder-ch 100%[===================>] 
25.35M 25.1MB/s in 1.0s \n", + "Archive: ./SpeakerEncoder-checkpoint.zip\n", + " creating: GE2E-SpeakerEncoder/\n", + " inflating: GE2E-SpeakerEncoder/best_model.pth.tar \n", + " inflating: GE2E-SpeakerEncoder/config.json \n", + " inflating: GE2E-SpeakerEncoder/README.MD \n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "jEH8HCTh5mF6", + "colab_type": "code", + "colab": {} + }, + "source": [ + "SE_MODEL_RUN_PATH = \"GE2E-SpeakerEncoder/\"\n", + "SE_MODEL_PATH = os.path.join(SE_MODEL_RUN_PATH, \"best_model.pth.tar\")\n", + "SE_CONFIG_PATH =os.path.join(SE_MODEL_RUN_PATH, \"config.json\")\n", + "USE_CUDA = True" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "tOwkfQqT6-Qo", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 451 + }, + "outputId": "091c9020-302a-429e-bc7e-7fd2bb6f68f9" + }, + "source": [ + "from TTS.utils.audio import AudioProcessor\n", + "from TTS.speaker_encoder.model import SpeakerEncoder\n", + "se_config = load_config(SE_CONFIG_PATH)\n", + "se_ap = AudioProcessor(**se_config['audio'])\n", + "\n", + "se_model = SpeakerEncoder(**se_config.model)\n", + "se_model.load_state_dict(torch.load(SE_MODEL_PATH)['model'])\n", + "se_model.eval()\n", + "if USE_CUDA:\n", + " se_model.cuda()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + " > Setting up Audio Processor...\n", + " | > sample_rate:16000\n", + " | > num_mels:40\n", + " | > min_level_db:-100\n", + " | > frame_shift_ms:12.5\n", + " | > frame_length_ms:50\n", + " | > ref_level_db:20\n", + " | > fft_size:1024\n", + " | > power:None\n", + " | > preemphasis:0.98\n", + " | > griffin_lim_iters:None\n", + " | > signal_norm:True\n", + " | > symmetric_norm:True\n", + " | > mel_fmin:0\n", + " | > mel_fmax:8000.0\n", + " | > spec_gain:20.0\n", + " | > stft_pad_mode:reflect\n", + " | > max_norm:4.0\n", + " | > clip_norm:True\n", + " | > do_trim_silence:False\n", + " | > trim_db:60\n", + " | > do_sound_norm:False\n", + " | > stats_path:None\n", + " | > hop_length:200\n", + " | > win_length:800\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0TLlbUFG8O36", + "colab_type": "text" + }, + "source": [ + "Upload a wav audio file in your voice.\n", + "\n", + "\n", + "> We recommend files longer than 3 seconds, the bigger the file the closer to your voice :)\n", + "\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "_FWwHPjJ8NXl", + "colab_type": "code", + "colab": { + "resources": { + "http://localhost:8080/nbextensions/google.colab/files.js": { + "data": 
"Ly8gQ29weXJpZ2h0IDIwMTcgR29vZ2xlIExMQwovLwovLyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKLy8geW91IG1heSBub3QgdXNlIHRoaXMgZmlsZSBleGNlcHQgaW4gY29tcGxpYW5jZSB3aXRoIHRoZSBMaWNlbnNlLgovLyBZb3UgbWF5IG9idGFpbiBhIGNvcHkgb2YgdGhlIExpY2Vuc2UgYXQKLy8KLy8gICAgICBodHRwOi8vd3d3LmFwYWNoZS5vcmcvbGljZW5zZXMvTElDRU5TRS0yLjAKLy8KLy8gVW5sZXNzIHJlcXVpcmVkIGJ5IGFwcGxpY2FibGUgbGF3IG9yIGFncmVlZCB0byBpbiB3cml0aW5nLCBzb2Z0d2FyZQovLyBkaXN0cmlidXRlZCB1bmRlciB0aGUgTGljZW5zZSBpcyBkaXN0cmlidXRlZCBvbiBhbiAiQVMgSVMiIEJBU0lTLAovLyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KLy8gU2VlIHRoZSBMaWNlbnNlIGZvciB0aGUgc3BlY2lmaWMgbGFuZ3VhZ2UgZ292ZXJuaW5nIHBlcm1pc3Npb25zIGFuZAovLyBsaW1pdGF0aW9ucyB1bmRlciB0aGUgTGljZW5zZS4KCi8qKgogKiBAZmlsZW92ZXJ2aWV3IEhlbHBlcnMgZm9yIGdvb2dsZS5jb2xhYiBQeXRob24gbW9kdWxlLgogKi8KKGZ1bmN0aW9uKHNjb3BlKSB7CmZ1bmN0aW9uIHNwYW4odGV4dCwgc3R5bGVBdHRyaWJ1dGVzID0ge30pIHsKICBjb25zdCBlbGVtZW50ID0gZG9jdW1lbnQuY3JlYXRlRWxlbWVudCgnc3BhbicpOwogIGVsZW1lbnQudGV4dENvbnRlbnQgPSB0ZXh0OwogIGZvciAoY29uc3Qga2V5IG9mIE9iamVjdC5rZXlzKHN0eWxlQXR0cmlidXRlcykpIHsKICAgIGVsZW1lbnQuc3R5bGVba2V5XSA9IHN0eWxlQXR0cmlidXRlc1trZXldOwogIH0KICByZXR1cm4gZWxlbWVudDsKfQoKLy8gTWF4IG51bWJlciBvZiBieXRlcyB3aGljaCB3aWxsIGJlIHVwbG9hZGVkIGF0IGEgdGltZS4KY29uc3QgTUFYX1BBWUxPQURfU0laRSA9IDEwMCAqIDEwMjQ7CgpmdW5jdGlvbiBfdXBsb2FkRmlsZXMoaW5wdXRJZCwgb3V0cHV0SWQpIHsKICBjb25zdCBzdGVwcyA9IHVwbG9hZEZpbGVzU3RlcChpbnB1dElkLCBvdXRwdXRJZCk7CiAgY29uc3Qgb3V0cHV0RWxlbWVudCA9IGRvY3VtZW50LmdldEVsZW1lbnRCeUlkKG91dHB1dElkKTsKICAvLyBDYWNoZSBzdGVwcyBvbiB0aGUgb3V0cHV0RWxlbWVudCB0byBtYWtlIGl0IGF2YWlsYWJsZSBmb3IgdGhlIG5leHQgY2FsbAogIC8vIHRvIHVwbG9hZEZpbGVzQ29udGludWUgZnJvbSBQeXRob24uCiAgb3V0cHV0RWxlbWVudC5zdGVwcyA9IHN0ZXBzOwoKICByZXR1cm4gX3VwbG9hZEZpbGVzQ29udGludWUob3V0cHV0SWQpOwp9CgovLyBUaGlzIGlzIHJvdWdobHkgYW4gYXN5bmMgZ2VuZXJhdG9yIChub3Qgc3VwcG9ydGVkIGluIHRoZSBicm93c2VyIHlldCksCi8vIHdoZXJlIHRoZXJlIGFyZSBtdWx0aXBsZSBhc3luY2hyb25vdXMgc3RlcHMgYW5kIHRoZSBQeXRob24gc2lkZSBpcyBnb2luZwovLyB0byBwb2xsIGZvciBjb21wbGV0aW9uIG9mIGVhY2ggc3RlcC4KLy8gVGhpcyB1c2VzIGEgUHJvbWlzZSB0byBibG9jayB0aGUgcHl0aG9uIHNpZGUgb24gY29tcGxldGlvbiBvZiBlYWNoIHN0ZXAsCi8vIHRoZW4gcGFzc2VzIHRoZSByZXN1bHQgb2YgdGhlIHByZXZpb3VzIHN0ZXAgYXMgdGhlIGlucHV0IHRvIHRoZSBuZXh0IHN0ZXAuCmZ1bmN0aW9uIF91cGxvYWRGaWxlc0NvbnRpbnVlKG91dHB1dElkKSB7CiAgY29uc3Qgb3V0cHV0RWxlbWVudCA9IGRvY3VtZW50LmdldEVsZW1lbnRCeUlkKG91dHB1dElkKTsKICBjb25zdCBzdGVwcyA9IG91dHB1dEVsZW1lbnQuc3RlcHM7CgogIGNvbnN0IG5leHQgPSBzdGVwcy5uZXh0KG91dHB1dEVsZW1lbnQubGFzdFByb21pc2VWYWx1ZSk7CiAgcmV0dXJuIFByb21pc2UucmVzb2x2ZShuZXh0LnZhbHVlLnByb21pc2UpLnRoZW4oKHZhbHVlKSA9PiB7CiAgICAvLyBDYWNoZSB0aGUgbGFzdCBwcm9taXNlIHZhbHVlIHRvIG1ha2UgaXQgYXZhaWxhYmxlIHRvIHRoZSBuZXh0CiAgICAvLyBzdGVwIG9mIHRoZSBnZW5lcmF0b3IuCiAgICBvdXRwdXRFbGVtZW50Lmxhc3RQcm9taXNlVmFsdWUgPSB2YWx1ZTsKICAgIHJldHVybiBuZXh0LnZhbHVlLnJlc3BvbnNlOwogIH0pOwp9CgovKioKICogR2VuZXJhdG9yIGZ1bmN0aW9uIHdoaWNoIGlzIGNhbGxlZCBiZXR3ZWVuIGVhY2ggYXN5bmMgc3RlcCBvZiB0aGUgdXBsb2FkCiAqIHByb2Nlc3MuCiAqIEBwYXJhbSB7c3RyaW5nfSBpbnB1dElkIEVsZW1lbnQgSUQgb2YgdGhlIGlucHV0IGZpbGUgcGlja2VyIGVsZW1lbnQuCiAqIEBwYXJhbSB7c3RyaW5nfSBvdXRwdXRJZCBFbGVtZW50IElEIG9mIHRoZSBvdXRwdXQgZGlzcGxheS4KICogQHJldHVybiB7IUl0ZXJhYmxlPCFPYmplY3Q+fSBJdGVyYWJsZSBvZiBuZXh0IHN0ZXBzLgogKi8KZnVuY3Rpb24qIHVwbG9hZEZpbGVzU3RlcChpbnB1dElkLCBvdXRwdXRJZCkgewogIGNvbnN0IGlucHV0RWxlbWVudCA9IGRvY3VtZW50LmdldEVsZW1lbnRCeUlkKGlucHV0SWQpOwogIGlucHV0RWxlbWVudC5kaXNhYmxlZCA9IGZhbHNlOwoKICBjb25zdCBvdXRwdXRFbGVtZW50ID0gZG9jdW1lbnQuZ2V0RWxlbWVudEJ5SWQob3V0cHV0SWQpOwogIG91dHB1dEVsZW1lbnQuaW5uZXJIVE1MID0gJy
c7CgogIGNvbnN0IHBpY2tlZFByb21pc2UgPSBuZXcgUHJvbWlzZSgocmVzb2x2ZSkgPT4gewogICAgaW5wdXRFbGVtZW50LmFkZEV2ZW50TGlzdGVuZXIoJ2NoYW5nZScsIChlKSA9PiB7CiAgICAgIHJlc29sdmUoZS50YXJnZXQuZmlsZXMpOwogICAgfSk7CiAgfSk7CgogIGNvbnN0IGNhbmNlbCA9IGRvY3VtZW50LmNyZWF0ZUVsZW1lbnQoJ2J1dHRvbicpOwogIGlucHV0RWxlbWVudC5wYXJlbnRFbGVtZW50LmFwcGVuZENoaWxkKGNhbmNlbCk7CiAgY2FuY2VsLnRleHRDb250ZW50ID0gJ0NhbmNlbCB1cGxvYWQnOwogIGNvbnN0IGNhbmNlbFByb21pc2UgPSBuZXcgUHJvbWlzZSgocmVzb2x2ZSkgPT4gewogICAgY2FuY2VsLm9uY2xpY2sgPSAoKSA9PiB7CiAgICAgIHJlc29sdmUobnVsbCk7CiAgICB9OwogIH0pOwoKICAvLyBXYWl0IGZvciB0aGUgdXNlciB0byBwaWNrIHRoZSBmaWxlcy4KICBjb25zdCBmaWxlcyA9IHlpZWxkIHsKICAgIHByb21pc2U6IFByb21pc2UucmFjZShbcGlja2VkUHJvbWlzZSwgY2FuY2VsUHJvbWlzZV0pLAogICAgcmVzcG9uc2U6IHsKICAgICAgYWN0aW9uOiAnc3RhcnRpbmcnLAogICAgfQogIH07CgogIGNhbmNlbC5yZW1vdmUoKTsKCiAgLy8gRGlzYWJsZSB0aGUgaW5wdXQgZWxlbWVudCBzaW5jZSBmdXJ0aGVyIHBpY2tzIGFyZSBub3QgYWxsb3dlZC4KICBpbnB1dEVsZW1lbnQuZGlzYWJsZWQgPSB0cnVlOwoKICBpZiAoIWZpbGVzKSB7CiAgICByZXR1cm4gewogICAgICByZXNwb25zZTogewogICAgICAgIGFjdGlvbjogJ2NvbXBsZXRlJywKICAgICAgfQogICAgfTsKICB9CgogIGZvciAoY29uc3QgZmlsZSBvZiBmaWxlcykgewogICAgY29uc3QgbGkgPSBkb2N1bWVudC5jcmVhdGVFbGVtZW50KCdsaScpOwogICAgbGkuYXBwZW5kKHNwYW4oZmlsZS5uYW1lLCB7Zm9udFdlaWdodDogJ2JvbGQnfSkpOwogICAgbGkuYXBwZW5kKHNwYW4oCiAgICAgICAgYCgke2ZpbGUudHlwZSB8fCAnbi9hJ30pIC0gJHtmaWxlLnNpemV9IGJ5dGVzLCBgICsKICAgICAgICBgbGFzdCBtb2RpZmllZDogJHsKICAgICAgICAgICAgZmlsZS5sYXN0TW9kaWZpZWREYXRlID8gZmlsZS5sYXN0TW9kaWZpZWREYXRlLnRvTG9jYWxlRGF0ZVN0cmluZygpIDoKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgJ24vYSd9IC0gYCkpOwogICAgY29uc3QgcGVyY2VudCA9IHNwYW4oJzAlIGRvbmUnKTsKICAgIGxpLmFwcGVuZENoaWxkKHBlcmNlbnQpOwoKICAgIG91dHB1dEVsZW1lbnQuYXBwZW5kQ2hpbGQobGkpOwoKICAgIGNvbnN0IGZpbGVEYXRhUHJvbWlzZSA9IG5ldyBQcm9taXNlKChyZXNvbHZlKSA9PiB7CiAgICAgIGNvbnN0IHJlYWRlciA9IG5ldyBGaWxlUmVhZGVyKCk7CiAgICAgIHJlYWRlci5vbmxvYWQgPSAoZSkgPT4gewogICAgICAgIHJlc29sdmUoZS50YXJnZXQucmVzdWx0KTsKICAgICAgfTsKICAgICAgcmVhZGVyLnJlYWRBc0FycmF5QnVmZmVyKGZpbGUpOwogICAgfSk7CiAgICAvLyBXYWl0IGZvciB0aGUgZGF0YSB0byBiZSByZWFkeS4KICAgIGxldCBmaWxlRGF0YSA9IHlpZWxkIHsKICAgICAgcHJvbWlzZTogZmlsZURhdGFQcm9taXNlLAogICAgICByZXNwb25zZTogewogICAgICAgIGFjdGlvbjogJ2NvbnRpbnVlJywKICAgICAgfQogICAgfTsKCiAgICAvLyBVc2UgYSBjaHVua2VkIHNlbmRpbmcgdG8gYXZvaWQgbWVzc2FnZSBzaXplIGxpbWl0cy4gU2VlIGIvNjIxMTU2NjAuCiAgICBsZXQgcG9zaXRpb24gPSAwOwogICAgd2hpbGUgKHBvc2l0aW9uIDwgZmlsZURhdGEuYnl0ZUxlbmd0aCkgewogICAgICBjb25zdCBsZW5ndGggPSBNYXRoLm1pbihmaWxlRGF0YS5ieXRlTGVuZ3RoIC0gcG9zaXRpb24sIE1BWF9QQVlMT0FEX1NJWkUpOwogICAgICBjb25zdCBjaHVuayA9IG5ldyBVaW50OEFycmF5KGZpbGVEYXRhLCBwb3NpdGlvbiwgbGVuZ3RoKTsKICAgICAgcG9zaXRpb24gKz0gbGVuZ3RoOwoKICAgICAgY29uc3QgYmFzZTY0ID0gYnRvYShTdHJpbmcuZnJvbUNoYXJDb2RlLmFwcGx5KG51bGwsIGNodW5rKSk7CiAgICAgIHlpZWxkIHsKICAgICAgICByZXNwb25zZTogewogICAgICAgICAgYWN0aW9uOiAnYXBwZW5kJywKICAgICAgICAgIGZpbGU6IGZpbGUubmFtZSwKICAgICAgICAgIGRhdGE6IGJhc2U2NCwKICAgICAgICB9LAogICAgICB9OwogICAgICBwZXJjZW50LnRleHRDb250ZW50ID0KICAgICAgICAgIGAke01hdGgucm91bmQoKHBvc2l0aW9uIC8gZmlsZURhdGEuYnl0ZUxlbmd0aCkgKiAxMDApfSUgZG9uZWA7CiAgICB9CiAgfQoKICAvLyBBbGwgZG9uZS4KICB5aWVsZCB7CiAgICByZXNwb25zZTogewogICAgICBhY3Rpb246ICdjb21wbGV0ZScsCiAgICB9CiAgfTsKfQoKc2NvcGUuZ29vZ2xlID0gc2NvcGUuZ29vZ2xlIHx8IHt9OwpzY29wZS5nb29nbGUuY29sYWIgPSBzY29wZS5nb29nbGUuY29sYWIgfHwge307CnNjb3BlLmdvb2dsZS5jb2xhYi5fZmlsZXMgPSB7CiAgX3VwbG9hZEZpbGVzLAogIF91cGxvYWRGaWxlc0NvbnRpbnVlLAp9Owp9KShzZWxmKTsK", + "ok": true, + "headers": [ + [ + "content-type", + "application/javascript" + ] + ], + "status": 200, + "status_text": "OK" + } + }, + "base_uri": "https://localhost:8080/", + "height": 246 + 
}, + "outputId": "cb0ca2ad-73b9-44ce-d0bd-6139552232d8" + }, + "source": [ + "from google.colab import files\n", + "file_list = files.upload()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " Upload widget is only available when the cell has been executed in the\n", + " current browser session. Please rerun this cell to enable.\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "stream", + "text": [ + "Saving p238_001.wav to p238_001.wav\n", + "Saving p238_002.wav to p238_002.wav\n", + "Saving p238_003.wav to p238_003.wav\n", + "Saving p238_004.wav to p238_004.wav\n", + "Saving p238_005.wav to p238_005.wav\n", + "Saving p238_006.wav to p238_006.wav\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "WWOf6sgbBbGY", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# extract embedding from wav files\n", + "speaker_embeddings = []\n", + "for name in file_list.keys():\n", + " if '.wav' in name:\n", + " mel_spec = se_ap.melspectrogram(se_ap.load_wav(name, sr=se_ap.sample_rate)).T\n", + " mel_spec = torch.FloatTensor(mel_spec[None, :, :])\n", + " if USE_CUDA:\n", + " mel_spec = mel_spec.cuda()\n", + " embedd = se_model.compute_embedding(mel_spec).cpu().detach().numpy().reshape(-1)\n", + " speaker_embeddings.append(embedd)\n", + " else:\n", + " print(\" You need upload Wav files, others files is not supported !!\")\n", + "\n", + "# takes the average of the embedings samples of the announcers\n", + "speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab_type": "code", + "id": "xmItcGac5WiG", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 872 + }, + "outputId": "4940027a-a303-4782-ef47-0c91716665a1" + }, + "source": [ + "import IPython\n", + "from IPython.display import Audio\n", + "print(\"Synthesize sentence with New Speaker using files: \",file_list.keys(), \"(this speaker not seen in training (new speaker))\")\n", + "while True:\n", + " TEXT = input(\"Enter sentence: \")\n", + " print(\" > Text: {}\".format(TEXT))\n", + " wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding)\n", + " IPython.display.display(Audio(wav, rate=ap.sample_rate))\n", + " # save the results\n", + " file_name = TEXT.replace(\" \", \"_\")\n", + " file_name = file_name.translate(\n", + " str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n", + " out_path = os.path.join(OUT_PATH, file_name)\n", + " print(\" > Saving output to {}\".format(out_path))\n", + " ap.save_wav(wav, out_path)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Synthesize sentence with New Speaker using files: dict_keys(['p238_001.wav', 'p238_002.wav', 'p238_003.wav', 'p238_004.wav', 'p238_005.wav', 'p238_006.wav']) (this speaker not seen in training (new speaker))\n", + "Enter sentence: Test this demonstration\n", + " > Text: Test this demonstration\n", + " > Run-time: 1.1401546001434326\n", + " > Real-time factor: 0.8927653497084975\n", + " > Time per step: 4.0488326075402176e-05\n" + ], + "name": "stdout" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": { + "tags": [] + } + }, + 
{ + "output_type": "stream", + "text": [ + " > Saving output to tests-audios/Test_this_demonstration.wav\n", + "Enter sentence: Ask her to bring these things with her from the store.\n", + " > Text: Ask her to bring these things with her from the store.\n", + " > Run-time: 1.594754934310913\n", + " > Real-time factor: 0.6868010717444122\n", + " > Time per step: 3.1147492118179796e-05\n" + ], + "name": "stdout" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "stream", + "text": [ + " > Saving output to tests-audios/Ask_her_to_bring_these_things_with_her_from_the_store.wav\n" + ], + "name": "stdout" + }, + { + "output_type": "error", + "ename": "KeyboardInterrupt", + "evalue": "ignored", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py\u001b[0m in \u001b[0;36m_input_request\u001b[0;34m(self, prompt, ident, parent, password)\u001b[0m\n\u001b[1;32m 728\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 729\u001b[0;31m \u001b[0mident\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreply\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msession\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstdin_socket\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 730\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/jupyter_client/session.py\u001b[0m in \u001b[0;36mrecv\u001b[0;34m(self, socket, mode, content, copy)\u001b[0m\n\u001b[1;32m 802\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 803\u001b[0;31m \u001b[0mmsg_list\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msocket\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv_multipart\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 804\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mzmq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mZMQError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/zmq/sugar/socket.py\u001b[0m in \u001b[0;36mrecv_multipart\u001b[0;34m(self, flags, copy, track)\u001b[0m\n\u001b[1;32m 474\u001b[0m \"\"\"\n\u001b[0;32m--> 475\u001b[0;31m \u001b[0mparts\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mflags\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mtrack\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtrack\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 476\u001b[0m \u001b[0;31m# have first part already, only loop while more to receive\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32mzmq/backend/cython/socket.pyx\u001b[0m in \u001b[0;36mzmq.backend.cython.socket.Socket.recv\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32mzmq/backend/cython/socket.pyx\u001b[0m in \u001b[0;36mzmq.backend.cython.socket.Socket.recv\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32mzmq/backend/cython/socket.pyx\u001b[0m in \u001b[0;36mzmq.backend.cython.socket._recv_copy\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/zmq/backend/cython/checkrc.pxd\u001b[0m in \u001b[0;36mzmq.backend.cython.checkrc._check_rc\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: ", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Synthesize sentence with New Speaker using files: \"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mfile_list\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkeys\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"(this speaker not seen in training (new speaker))\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mTEXT\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Enter sentence: \"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\" > Text: {}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mTEXT\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mwav\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvocoder_model\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mTEXT\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mC\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mUSE_CUDA\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0map\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_griffin_lim\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mSPEAKER_FILEID\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspeaker_embedding\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mspeaker_embedding\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py\u001b[0m in \u001b[0;36mraw_input\u001b[0;34m(self, prompt)\u001b[0m\n\u001b[1;32m 702\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_parent_ident\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 703\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_parent_header\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 704\u001b[0;31m 
\u001b[0mpassword\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 705\u001b[0m )\n\u001b[1;32m 706\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py\u001b[0m in \u001b[0;36m_input_request\u001b[0;34m(self, prompt, ident, parent, password)\u001b[0m\n\u001b[1;32m 732\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyboardInterrupt\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 733\u001b[0m \u001b[0;31m# re-raise KeyboardInterrupt, to truncate traceback\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 734\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mKeyboardInterrupt\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 735\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 736\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ] + } + ] +} \ No newline at end of file From 66452918b784c71b5c3621c57b74b1276476d3a4 Mon Sep 17 00:00:00 2001 From: Edresson Date: Thu, 30 Jul 2020 22:33:47 -0300 Subject: [PATCH 37/56] clean Colab Notebook outputs --- ...illa_TTS_MultiSpeaker_jia_et_al_2018.ipynb | 1169 +---------------- 1 file changed, 24 insertions(+), 1145 deletions(-) diff --git a/notebooks/Demo_Mozilla_TTS_MultiSpeaker_jia_et_al_2018.ipynb b/notebooks/Demo_Mozilla_TTS_MultiSpeaker_jia_et_al_2018.ipynb index 3581b6ab..458422c0 100755 --- a/notebooks/Demo_Mozilla_TTS_MultiSpeaker_jia_et_al_2018.ipynb +++ b/notebooks/Demo_Mozilla_TTS_MultiSpeaker_jia_et_al_2018.ipynb @@ -33,43 +33,21 @@ "metadata": { "id": "yvb0pX3WY6MN", "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 139 - }, - "outputId": "5c5016ac-4f92-4b5b-bcf3-473363b2846d" + "colab": {} }, "source": [ "import os \n", "!git clone https://github.com/Edresson/TTS -b dev-gst-embeddings" ], "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Cloning into 'TTS'...\n", - "remote: Enumerating objects: 15, done.\u001b[K\n", - "remote: Counting objects: 6% (1/15)\u001b[K\rremote: Counting objects: 13% (2/15)\u001b[K\rremote: Counting objects: 20% (3/15)\u001b[K\rremote: Counting objects: 26% (4/15)\u001b[K\rremote: Counting objects: 33% (5/15)\u001b[K\rremote: Counting objects: 40% (6/15)\u001b[K\rremote: Counting objects: 46% (7/15)\u001b[K\rremote: Counting objects: 53% (8/15)\u001b[K\rremote: Counting objects: 60% (9/15)\u001b[K\rremote: Counting objects: 66% (10/15)\u001b[K\rremote: Counting objects: 73% (11/15)\u001b[K\rremote: Counting objects: 80% (12/15)\u001b[K\rremote: Counting objects: 86% (13/15)\u001b[K\rremote: Counting objects: 93% (14/15)\u001b[K\rremote: Counting objects: 100% (15/15)\u001b[K\rremote: Counting objects: 100% (15/15), done.\u001b[K\n", - "remote: Compressing objects: 100% (14/14), done.\u001b[K\n", - "remote: Total 10370 (delta 1), reused 3 (delta 1), pack-reused 10355\u001b[K\n", - "Receiving objects: 100% (10370/10370), 120.77 MiB | 33.68 MiB/s, done.\n", - "Resolving deltas: 100% (6183/6183), done.\n" - ], - "name": "stdout" - } - ] + "outputs": [] }, { "cell_type": "code", "metadata": { "id": "iB9nl2UEG3SY", "colab_type": "code", - "colab": { - "base_uri": 
"https://localhost:8080/", - "height": 1000 - }, - "outputId": "36bd08c3-6f29-4c03-9dc6-3cc3cc538e30" + "colab": {} }, "source": [ "!apt-get install espeak\n", @@ -79,639 +57,7 @@ "os.chdir('..')" ], "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Reading package lists... Done\n", - "Building dependency tree \n", - "Reading state information... Done\n", - "The following package was automatically installed and is no longer required:\n", - " libnvidia-common-440\n", - "Use 'apt autoremove' to remove it.\n", - "The following additional packages will be installed:\n", - " espeak-data libespeak1 libportaudio2 libsonic0\n", - "The following NEW packages will be installed:\n", - " espeak espeak-data libespeak1 libportaudio2 libsonic0\n", - "0 upgraded, 5 newly installed, 0 to remove and 35 not upgraded.\n", - "Need to get 1,219 kB of archives.\n", - "After this operation, 3,031 kB of additional disk space will be used.\n", - "Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libportaudio2 amd64 19.6.0-1 [64.6 kB]\n", - "Get:2 http://archive.ubuntu.com/ubuntu bionic/main amd64 libsonic0 amd64 0.2.0-6 [13.4 kB]\n", - "Get:3 http://archive.ubuntu.com/ubuntu bionic/universe amd64 espeak-data amd64 1.48.04+dfsg-5 [934 kB]\n", - "Get:4 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libespeak1 amd64 1.48.04+dfsg-5 [145 kB]\n", - "Get:5 http://archive.ubuntu.com/ubuntu bionic/universe amd64 espeak amd64 1.48.04+dfsg-5 [61.6 kB]\n", - "Fetched 1,219 kB in 1s (820 kB/s)\n", - "Selecting previously unselected package libportaudio2:amd64.\n", - "(Reading database ... 144465 files and directories currently installed.)\n", - "Preparing to unpack .../libportaudio2_19.6.0-1_amd64.deb ...\n", - "Unpacking libportaudio2:amd64 (19.6.0-1) ...\n", - "Selecting previously unselected package libsonic0:amd64.\n", - "Preparing to unpack .../libsonic0_0.2.0-6_amd64.deb ...\n", - "Unpacking libsonic0:amd64 (0.2.0-6) ...\n", - "Selecting previously unselected package espeak-data:amd64.\n", - "Preparing to unpack .../espeak-data_1.48.04+dfsg-5_amd64.deb ...\n", - "Unpacking espeak-data:amd64 (1.48.04+dfsg-5) ...\n", - "Selecting previously unselected package libespeak1:amd64.\n", - "Preparing to unpack .../libespeak1_1.48.04+dfsg-5_amd64.deb ...\n", - "Unpacking libespeak1:amd64 (1.48.04+dfsg-5) ...\n", - "Selecting previously unselected package espeak.\n", - "Preparing to unpack .../espeak_1.48.04+dfsg-5_amd64.deb ...\n", - "Unpacking espeak (1.48.04+dfsg-5) ...\n", - "Setting up libportaudio2:amd64 (19.6.0-1) ...\n", - "Setting up espeak-data:amd64 (1.48.04+dfsg-5) ...\n", - "Setting up libsonic0:amd64 (0.2.0-6) ...\n", - "Setting up libespeak1:amd64 (1.48.04+dfsg-5) ...\n", - "Setting up espeak (1.48.04+dfsg-5) ...\n", - "Processing triggers for man-db (2.8.3-2ubuntu0.1) ...\n", - "Processing triggers for libc-bin (2.27-3ubuntu1) ...\n", - "/sbin/ldconfig.real: /usr/local/lib/python3.6/dist-packages/ideep4py/lib/libmkldnn.so.0 is not a symbolic link\n", - "\n", - "Requirement already satisfied: torch>=1.5 in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 1)) (1.5.1+cu101)\n", - "Requirement already satisfied: tensorflow>=2.2 in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 2)) (2.2.0)\n", - "Requirement already satisfied: numpy>=1.16.0 in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 3)) (1.18.5)\n", - "Requirement already satisfied: scipy>=0.19.0 in /usr/local/lib/python3.6/dist-packages 
(from -r requirements.txt (line 4)) (1.4.1)\n", - "Requirement already satisfied: numba==0.48 in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 5)) (0.48.0)\n", - "Collecting librosa==0.7.2\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/77/b5/1817862d64a7c231afd15419d8418ae1f000742cac275e85c74b219cbccb/librosa-0.7.2.tar.gz (1.6MB)\n", - "\u001b[K |████████████████████████████████| 1.6MB 4.7MB/s \n", - "\u001b[?25hCollecting phonemizer>=2.2.0\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/e1/ec/eb95ec96dc5374ba339d47c5252d62e9a7b1fbd5c9e4eefebc0d7e25381d/phonemizer-2.2.1-py3-none-any.whl (49kB)\n", - "\u001b[K |████████████████████████████████| 51kB 9.0MB/s \n", - "\u001b[?25hCollecting unidecode==0.4.20\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/c3/6f/05f5deb753d0594583aa1cc0d2fe9d631d9a00e9b28d0da49f8d3763755b/Unidecode-0.04.20-py2.py3-none-any.whl (228kB)\n", - "\u001b[K |████████████████████████████████| 235kB 30.0MB/s \n", - "\u001b[?25hCollecting attrdict\n", - " Downloading https://files.pythonhosted.org/packages/ef/97/28fe7e68bc7adfce67d4339756e85e9fcf3c6fd7f0c0781695352b70472c/attrdict-2.0.1-py2.py3-none-any.whl\n", - "Collecting tensorboardX\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/af/0c/4f41bcd45db376e6fe5c619c01100e9b7531c55791b7244815bac6eac32c/tensorboardX-2.1-py2.py3-none-any.whl (308kB)\n", - "\u001b[K |████████████████████████████████| 317kB 16.7MB/s \n", - "\u001b[?25hRequirement already satisfied: matplotlib in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 11)) (3.2.2)\n", - "Requirement already satisfied: Pillow in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 12)) (7.0.0)\n", - "Requirement already satisfied: flask in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 13)) (1.1.2)\n", - "Requirement already satisfied: tqdm in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 14)) (4.41.1)\n", - "Requirement already satisfied: inflect in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 15)) (2.1.0)\n", - "Collecting bokeh==1.4.0\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/de/70/fdd4b186d8570a737372487cc5547aac885a1270626e3ebf03db1808e4ed/bokeh-1.4.0.tar.gz (32.4MB)\n", - "\u001b[K |████████████████████████████████| 32.4MB 100kB/s \n", - "\u001b[?25hCollecting pysbd\n", - " Downloading https://files.pythonhosted.org/packages/3b/49/4799b3cdf80aee5fa4562a3929eda738845900bbeef4ee60481196ad4d1a/pysbd-0.2.3-py3-none-any.whl\n", - "Collecting soundfile\n", - " Downloading https://files.pythonhosted.org/packages/eb/f2/3cbbbf3b96fb9fa91582c438b574cff3f45b29c772f94c400e2c99ef5db9/SoundFile-0.10.3.post1-py2.py3-none-any.whl\n", - "Collecting nose==1.3.7\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/15/d8/dd071918c040f50fa1cf80da16423af51ff8ce4a0f2399b7bf8de45ac3d9/nose-1.3.7-py3-none-any.whl (154kB)\n", - "\u001b[K |████████████████████████████████| 163kB 54.4MB/s \n", - "\u001b[?25hCollecting cardboardlint==1.3.0\n", - " Downloading https://files.pythonhosted.org/packages/a8/d4/02c9ad87226867995e8cc89791ba3a5a653e1d25c04263adabe87b7e1472/cardboardlint-1.3.0.tar.gz\n", - "Collecting pylint==2.5.3\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/e8/fb/734960c55474c8f74e6ad4c8588fc44073fb9d69e223269d26a3c2435d16/pylint-2.5.3-py3-none-any.whl (324kB)\n", - "\u001b[K 
|████████████████████████████████| 327kB 55.3MB/s \n", - "\u001b[?25hCollecting fuzzywuzzy\n", - " Downloading https://files.pythonhosted.org/packages/43/ff/74f23998ad2f93b945c0309f825be92e04e0348e062026998b5eefef4c33/fuzzywuzzy-0.18.0-py2.py3-none-any.whl\n", - "Requirement already satisfied: gdown in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 23)) (3.6.4)\n", - "Requirement already satisfied: future in /usr/local/lib/python3.6/dist-packages (from torch>=1.5->-r requirements.txt (line 1)) (0.16.0)\n", - "Requirement already satisfied: termcolor>=1.1.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow>=2.2->-r requirements.txt (line 2)) (1.1.0)\n", - "Requirement already satisfied: astunparse==1.6.3 in /usr/local/lib/python3.6/dist-packages (from tensorflow>=2.2->-r requirements.txt (line 2)) (1.6.3)\n", - "Requirement already satisfied: absl-py>=0.7.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow>=2.2->-r requirements.txt (line 2)) (0.9.0)\n", - "Requirement already satisfied: opt-einsum>=2.3.2 in /usr/local/lib/python3.6/dist-packages (from tensorflow>=2.2->-r requirements.txt (line 2)) (3.3.0)\n", - "Requirement already satisfied: six>=1.12.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow>=2.2->-r requirements.txt (line 2)) (1.15.0)\n", - "Requirement already satisfied: protobuf>=3.8.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow>=2.2->-r requirements.txt (line 2)) (3.12.2)\n", - "Requirement already satisfied: wrapt>=1.11.1 in /usr/local/lib/python3.6/dist-packages (from tensorflow>=2.2->-r requirements.txt (line 2)) (1.12.1)\n", - "Requirement already satisfied: gast==0.3.3 in /usr/local/lib/python3.6/dist-packages (from tensorflow>=2.2->-r requirements.txt (line 2)) (0.3.3)\n", - "Requirement already satisfied: keras-preprocessing>=1.1.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow>=2.2->-r requirements.txt (line 2)) (1.1.2)\n", - "Requirement already satisfied: h5py<2.11.0,>=2.10.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow>=2.2->-r requirements.txt (line 2)) (2.10.0)\n", - "Requirement already satisfied: google-pasta>=0.1.8 in /usr/local/lib/python3.6/dist-packages (from tensorflow>=2.2->-r requirements.txt (line 2)) (0.2.0)\n", - "Requirement already satisfied: tensorboard<2.3.0,>=2.2.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow>=2.2->-r requirements.txt (line 2)) (2.2.2)\n", - "Requirement already satisfied: wheel>=0.26; python_version >= \"3\" in /usr/local/lib/python3.6/dist-packages (from tensorflow>=2.2->-r requirements.txt (line 2)) (0.34.2)\n", - "Requirement already satisfied: tensorflow-estimator<2.3.0,>=2.2.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow>=2.2->-r requirements.txt (line 2)) (2.2.0)\n", - "Requirement already satisfied: grpcio>=1.8.6 in /usr/local/lib/python3.6/dist-packages (from tensorflow>=2.2->-r requirements.txt (line 2)) (1.30.0)\n", - "Requirement already satisfied: llvmlite<0.32.0,>=0.31.0dev0 in /usr/local/lib/python3.6/dist-packages (from numba==0.48->-r requirements.txt (line 5)) (0.31.0)\n", - "Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from numba==0.48->-r requirements.txt (line 5)) (49.1.0)\n", - "Requirement already satisfied: audioread>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from librosa==0.7.2->-r requirements.txt (line 6)) (2.1.8)\n", - "Requirement already satisfied: scikit-learn!=0.19.0,>=0.14.0 in /usr/local/lib/python3.6/dist-packages (from 
librosa==0.7.2->-r requirements.txt (line 6)) (0.22.2.post1)\n", - "Requirement already satisfied: joblib>=0.12 in /usr/local/lib/python3.6/dist-packages (from librosa==0.7.2->-r requirements.txt (line 6)) (0.16.0)\n", - "Requirement already satisfied: decorator>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from librosa==0.7.2->-r requirements.txt (line 6)) (4.4.2)\n", - "Requirement already satisfied: resampy>=0.2.2 in /usr/local/lib/python3.6/dist-packages (from librosa==0.7.2->-r requirements.txt (line 6)) (0.2.2)\n", - "Requirement already satisfied: attrs>=18.1 in /usr/local/lib/python3.6/dist-packages (from phonemizer>=2.2.0->-r requirements.txt (line 7)) (19.3.0)\n", - "Collecting segments\n", - " Downloading https://files.pythonhosted.org/packages/5b/a0/0c3fe64787745c39eb3f2f5f5f9ed8d008d9ef22e9d7f9f52f71ea4712f7/segments-2.1.3-py2.py3-none-any.whl\n", - "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib->-r requirements.txt (line 11)) (0.10.0)\n", - "Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->-r requirements.txt (line 11)) (2.8.1)\n", - "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->-r requirements.txt (line 11)) (2.4.7)\n", - "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->-r requirements.txt (line 11)) (1.2.0)\n", - "Requirement already satisfied: click>=5.1 in /usr/local/lib/python3.6/dist-packages (from flask->-r requirements.txt (line 13)) (7.1.2)\n", - "Requirement already satisfied: Jinja2>=2.10.1 in /usr/local/lib/python3.6/dist-packages (from flask->-r requirements.txt (line 13)) (2.11.2)\n", - "Requirement already satisfied: itsdangerous>=0.24 in /usr/local/lib/python3.6/dist-packages (from flask->-r requirements.txt (line 13)) (1.1.0)\n", - "Requirement already satisfied: Werkzeug>=0.15 in /usr/local/lib/python3.6/dist-packages (from flask->-r requirements.txt (line 13)) (1.0.1)\n", - "Requirement already satisfied: PyYAML>=3.10 in /usr/local/lib/python3.6/dist-packages (from bokeh==1.4.0->-r requirements.txt (line 16)) (3.13)\n", - "Requirement already satisfied: packaging>=16.8 in /usr/local/lib/python3.6/dist-packages (from bokeh==1.4.0->-r requirements.txt (line 16)) (20.4)\n", - "Requirement already satisfied: tornado>=4.3 in /usr/local/lib/python3.6/dist-packages (from bokeh==1.4.0->-r requirements.txt (line 16)) (5.1.1)\n", - "Requirement already satisfied: cffi>=1.0 in /usr/local/lib/python3.6/dist-packages (from soundfile->-r requirements.txt (line 18)) (1.14.0)\n", - "Collecting isort<5,>=4.2.5\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/e5/b0/c121fd1fa3419ea9bfd55c7f9c4fedfec5143208d8c7ad3ce3db6c623c21/isort-4.3.21-py2.py3-none-any.whl (42kB)\n", - "\u001b[K |████████████████████████████████| 51kB 7.2MB/s \n", - "\u001b[?25hCollecting astroid<=2.5,>=2.4.0\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/24/a8/5133f51967fb21e46ee50831c3f5dda49e976b7f915408d670b1603d41d6/astroid-2.4.2-py3-none-any.whl (213kB)\n", - "\u001b[K |████████████████████████████████| 215kB 55.8MB/s \n", - "\u001b[?25hCollecting toml>=0.7.1\n", - " Downloading https://files.pythonhosted.org/packages/9f/e1/1b40b80f2e1663a6b9f497123c11d7d988c0919abbf3c3f2688e448c5363/toml-0.10.1-py2.py3-none-any.whl\n", - "Collecting mccabe<0.7,>=0.6\n", - " Downloading 
https://files.pythonhosted.org/packages/87/89/479dc97e18549e21354893e4ee4ef36db1d237534982482c3681ee6e7b57/mccabe-0.6.1-py2.py3-none-any.whl\n", - "Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from gdown->-r requirements.txt (line 23)) (2.23.0)\n", - "Requirement already satisfied: google-auth<2,>=1.6.3 in /usr/local/lib/python3.6/dist-packages (from tensorboard<2.3.0,>=2.2.0->tensorflow>=2.2->-r requirements.txt (line 2)) (1.17.2)\n", - "Requirement already satisfied: google-auth-oauthlib<0.5,>=0.4.1 in /usr/local/lib/python3.6/dist-packages (from tensorboard<2.3.0,>=2.2.0->tensorflow>=2.2->-r requirements.txt (line 2)) (0.4.1)\n", - "Requirement already satisfied: tensorboard-plugin-wit>=1.6.0 in /usr/local/lib/python3.6/dist-packages (from tensorboard<2.3.0,>=2.2.0->tensorflow>=2.2->-r requirements.txt (line 2)) (1.7.0)\n", - "Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.6/dist-packages (from tensorboard<2.3.0,>=2.2.0->tensorflow>=2.2->-r requirements.txt (line 2)) (3.2.2)\n", - "Collecting csvw>=1.5.6\n", - " Downloading https://files.pythonhosted.org/packages/d1/b6/8fef6788b8f05b21424a17ae3881eff916d42e5c7e87f57a85d9d7abf0a1/csvw-1.7.0-py2.py3-none-any.whl\n", - "Requirement already satisfied: regex in /usr/local/lib/python3.6/dist-packages (from segments->phonemizer>=2.2.0->-r requirements.txt (line 7)) (2019.12.20)\n", - "Collecting clldutils>=1.7.3\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/7b/b3/05882a8d5c8a7f7c69a47500334ac99623928edca930278d6ab88ee6d99b/clldutils-3.5.2-py2.py3-none-any.whl (189kB)\n", - "\u001b[K |████████████████████████████████| 194kB 49.8MB/s \n", - "\u001b[?25hRequirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.6/dist-packages (from Jinja2>=2.10.1->flask->-r requirements.txt (line 13)) (1.1.1)\n", - "Requirement already satisfied: pycparser in /usr/local/lib/python3.6/dist-packages (from cffi>=1.0->soundfile->-r requirements.txt (line 18)) (2.20)\n", - "Collecting lazy-object-proxy==1.4.*\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/0b/dd/b1e3407e9e6913cf178e506cd0dee818e58694d9a5cd1984e3f6a8b9a10f/lazy_object_proxy-1.4.3-cp36-cp36m-manylinux1_x86_64.whl (55kB)\n", - "\u001b[K |████████████████████████████████| 61kB 8.6MB/s \n", - "\u001b[?25hCollecting typed-ast<1.5,>=1.4.0; implementation_name == \"cpython\" and python_version < \"3.8\"\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/90/ed/5459080d95eb87a02fe860d447197be63b6e2b5e9ff73c2b0a85622994f4/typed_ast-1.4.1-cp36-cp36m-manylinux1_x86_64.whl (737kB)\n", - "\u001b[K |████████████████████████████████| 747kB 54.2MB/s \n", - "\u001b[?25hRequirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->gdown->-r requirements.txt (line 23)) (2020.6.20)\n", - "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->gdown->-r requirements.txt (line 23)) (3.0.4)\n", - "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->gdown->-r requirements.txt (line 23)) (1.24.3)\n", - "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->gdown->-r requirements.txt (line 23)) (2.10)\n", - "Requirement already satisfied: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from 
google-auth<2,>=1.6.3->tensorboard<2.3.0,>=2.2.0->tensorflow>=2.2->-r requirements.txt (line 2)) (4.1.1)\n", - "Requirement already satisfied: rsa<5,>=3.1.4; python_version >= \"3\" in /usr/local/lib/python3.6/dist-packages (from google-auth<2,>=1.6.3->tensorboard<2.3.0,>=2.2.0->tensorflow>=2.2->-r requirements.txt (line 2)) (4.6)\n", - "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.6/dist-packages (from google-auth<2,>=1.6.3->tensorboard<2.3.0,>=2.2.0->tensorflow>=2.2->-r requirements.txt (line 2)) (0.2.8)\n", - "Requirement already satisfied: requests-oauthlib>=0.7.0 in /usr/local/lib/python3.6/dist-packages (from google-auth-oauthlib<0.5,>=0.4.1->tensorboard<2.3.0,>=2.2.0->tensorflow>=2.2->-r requirements.txt (line 2)) (1.3.0)\n", - "Requirement already satisfied: importlib-metadata; python_version < \"3.8\" in /usr/local/lib/python3.6/dist-packages (from markdown>=2.6.8->tensorboard<2.3.0,>=2.2.0->tensorflow>=2.2->-r requirements.txt (line 2)) (1.7.0)\n", - "Collecting isodate\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/9b/9f/b36f7774ff5ea8e428fdcfc4bb332c39ee5b9362ddd3d40d9516a55221b2/isodate-0.6.0-py2.py3-none-any.whl (45kB)\n", - "\u001b[K |████████████████████████████████| 51kB 9.4MB/s \n", - "\u001b[?25hCollecting rfc3986\n", - " Downloading https://files.pythonhosted.org/packages/78/be/7b8b99fd74ff5684225f50dd0e865393d2265656ef3b4ba9eaaaffe622b8/rfc3986-1.4.0-py2.py3-none-any.whl\n", - "Requirement already satisfied: uritemplate>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from csvw>=1.5.6->segments->phonemizer>=2.2.0->-r requirements.txt (line 7)) (3.0.1)\n", - "Collecting colorlog\n", - " Downloading https://files.pythonhosted.org/packages/2a/81/12d77537c82c5d46aa2721dfee25a0e873ef5920ebd0827152f411effb57/colorlog-4.2.1-py2.py3-none-any.whl\n", - "Requirement already satisfied: tabulate>=0.7.7 in /usr/local/lib/python3.6/dist-packages (from clldutils>=1.7.3->segments->phonemizer>=2.2.0->-r requirements.txt (line 7)) (0.8.7)\n", - "Requirement already satisfied: pyasn1>=0.1.3 in /usr/local/lib/python3.6/dist-packages (from rsa<5,>=3.1.4; python_version >= \"3\"->google-auth<2,>=1.6.3->tensorboard<2.3.0,>=2.2.0->tensorflow>=2.2->-r requirements.txt (line 2)) (0.4.8)\n", - "Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<0.5,>=0.4.1->tensorboard<2.3.0,>=2.2.0->tensorflow>=2.2->-r requirements.txt (line 2)) (3.1.0)\n", - "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata; python_version < \"3.8\"->markdown>=2.6.8->tensorboard<2.3.0,>=2.2.0->tensorflow>=2.2->-r requirements.txt (line 2)) (3.1.0)\n", - "Building wheels for collected packages: librosa, bokeh, cardboardlint\n", - " Building wheel for librosa (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for librosa: filename=librosa-0.7.2-cp36-none-any.whl size=1612885 sha256=772045cdc2b1e22f5b5658822ab21ff12ed23871b729a83f7ab71702bd988fa2\n", - " Stored in directory: /root/.cache/pip/wheels/4c/6e/d7/bb93911540d2d1e44d690a1561871e5b6af82b69e80938abef\n", - " Building wheel for bokeh (setup.py) ... 
done\n",
-        "  [... verbose pip install and setup.py develop log omitted ...]\n",
-        "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in 
/usr/local/lib/python3.6/dist-packages (from requests<3,>=2.21.0->tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (1.24.3)\n", - "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests<3,>=2.21.0->tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (3.0.4)\n", - "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.6/dist-packages (from google-auth<2,>=1.6.3->tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (0.2.8)\n", - "Requirement already satisfied: rsa<5,>=3.1.4; python_version >= \"3\" in /usr/local/lib/python3.6/dist-packages (from google-auth<2,>=1.6.3->tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (4.6)\n", - "Requirement already satisfied: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from google-auth<2,>=1.6.3->tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (4.1.1)\n", - "Requirement already satisfied: importlib-metadata; python_version < \"3.8\" in /usr/local/lib/python3.6/dist-packages (from markdown>=2.6.8->tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (1.7.0)\n", - "Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<0.5,>=0.4.1->tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (3.1.0)\n", - "Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /usr/local/lib/python3.6/dist-packages (from pyasn1-modules>=0.2.1->google-auth<2,>=1.6.3->tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (0.4.8)\n", - "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata; python_version < \"3.8\"->markdown>=2.6.8->tensorboard<2.3.0,>=2.2.0->tensorflow==2.3.0rc0) (3.1.0)\n", - "Installing collected packages: tf-estimator-nightly, tensorflow\n", - " Found existing installation: tensorflow 2.2.0\n", - " Uninstalling tensorflow-2.2.0:\n", - " Successfully uninstalled tensorflow-2.2.0\n", - "Successfully installed tensorflow-2.3.0rc0 tf-estimator-nightly-2.3.0.dev2020062301\n" - ], - "name": "stdout" - } - ] + "outputs": [] }, { "cell_type": "markdown", @@ -732,30 +78,14 @@ "metadata": { "id": "PiYHf3lKhi9z", "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 104 - }, - "outputId": "75f7ecd0-3ee2-4ed2-8150-49f71a5ef4d5" + "colab": {} }, "source": [ "!wget -c -q --show-progress -O ./TTS-checkpoint.zip https://github.com/Edresson/TTS/releases/download/v1.0.0/Checkpoints-TTS-MultiSpeaker-Jia-et-al-2018.zip\n", "!unzip ./TTS-checkpoint.zip\n" ], "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "text": [ - "./TTS-checkpoint.zi 100%[===================>] 367.68M 31.9MB/s in 12s \n", - "Archive: ./TTS-checkpoint.zip\n", - " inflating: best_model.pth.tar \n", - " inflating: speakers.json \n", - " inflating: config.json \n" - ], - "name": "stdout" - } - ] + "outputs": [] }, { "cell_type": "markdown", @@ -772,11 +102,7 @@ "metadata": { "id": "4KZA4b_CbMqx", "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 52 - }, - "outputId": "c5d33fac-4682-4e72-8fc2-c1aa83408ef4" + "colab": {} }, "source": [ "%load_ext autoreload\n", @@ -825,16 +151,7 @@ "\n" ], "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. 
To reload it, use:\n", - " %reload_ext autoreload\n" - ], - "name": "stdout" - } - ] + "outputs": [] }, { "cell_type": "markdown", @@ -890,11 +207,7 @@ "metadata": { "id": "x1WgLFauWUPe", "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 486 - }, - "outputId": "7cc2d0bd-f3c8-41dd-9f17-438f77c1f2d2" + "colab": {} }, "source": [ "# load the config\n", @@ -961,41 +274,7 @@ " SPEAKER_FILEID = None\n" ], "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "text": [ - " > Setting up Audio Processor...\n", - " | > sample_rate:22050\n", - " | > num_mels:80\n", - " | > min_level_db:-100\n", - " | > frame_shift_ms:None\n", - " | > frame_length_ms:None\n", - " | > ref_level_db:20\n", - " | > fft_size:1024\n", - " | > power:1.5\n", - " | > preemphasis:0.98\n", - " | > griffin_lim_iters:60\n", - " | > signal_norm:True\n", - " | > symmetric_norm:True\n", - " | > mel_fmin:0\n", - " | > mel_fmax:8000.0\n", - " | > spec_gain:20.0\n", - " | > stft_pad_mode:constant\n", - " | > max_norm:4.0\n", - " | > clip_norm:True\n", - " | > do_trim_silence:False\n", - " | > trim_db:60\n", - " | > do_sound_norm:False\n", - " | > stats_path:None\n", - " | > hop_length:256\n", - " | > win_length:1024\n", - " Speaker: p262 was chosen automatically (this speaker seen in training)\n", - " > Using model: Tacotron2\n" - ], - "name": "stdout" - } - ] + "outputs": [] }, { "cell_type": "markdown", @@ -1015,11 +294,7 @@ "metadata": { "id": "2o8fXkVSyXOa", "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 704 - }, - "outputId": "b093ab6c-57a9-4595-8ff6-edee226cb0fd" + "colab": {} }, "source": [ "import IPython\n", @@ -1039,69 +314,7 @@ " ap.save_wav(wav, out_path)" ], "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Synthesize sentence with Speaker: p262 (this speaker seen in training)\n", - "Enter sentence: teste this demonstration :)\n", - " > Text: teste this demonstration :)\n", - " > Run-time: 2.143589496612549\n", - " > Real-time factor: 1.6784820231524382\n", - " > Time per step: 7.61217437684536e-05\n" - ], - "name": "stdout" - }, - { - "output_type": "display_data", - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": { - "tags": [] - } - }, - { - "output_type": "stream", - "text": [ - " > Saving output to tests-audios/teste_this_demonstration_.wav\n" - ], - "name": "stdout" - }, - { - "output_type": "error", - "ename": "KeyboardInterrupt", - "evalue": "ignored", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py\u001b[0m in \u001b[0;36m_input_request\u001b[0;34m(self, prompt, ident, parent, password)\u001b[0m\n\u001b[1;32m 728\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 729\u001b[0;31m \u001b[0mident\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreply\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msession\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstdin_socket\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 730\u001b[0m 
\u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/jupyter_client/session.py\u001b[0m in \u001b[0;36mrecv\u001b[0;34m(self, socket, mode, content, copy)\u001b[0m\n\u001b[1;32m 802\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 803\u001b[0;31m \u001b[0mmsg_list\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msocket\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv_multipart\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 804\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mzmq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mZMQError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/zmq/sugar/socket.py\u001b[0m in \u001b[0;36mrecv_multipart\u001b[0;34m(self, flags, copy, track)\u001b[0m\n\u001b[1;32m 474\u001b[0m \"\"\"\n\u001b[0;32m--> 475\u001b[0;31m \u001b[0mparts\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mflags\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrack\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtrack\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 476\u001b[0m \u001b[0;31m# have first part already, only loop while more to receive\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32mzmq/backend/cython/socket.pyx\u001b[0m in \u001b[0;36mzmq.backend.cython.socket.Socket.recv\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32mzmq/backend/cython/socket.pyx\u001b[0m in \u001b[0;36mzmq.backend.cython.socket.Socket.recv\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32mzmq/backend/cython/socket.pyx\u001b[0m in \u001b[0;36mzmq.backend.cython.socket._recv_copy\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/zmq/backend/cython/checkrc.pxd\u001b[0m in \u001b[0;36mzmq.backend.cython.checkrc._check_rc\u001b[0;34m()\u001b[0m\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m: ", - "\nDuring handling of the above exception, another exception occurred:\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Synthesize sentence with Speaker: \"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mchoise_speaker\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'_'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"(this speaker seen in training)\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mTEXT\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0minput\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Enter sentence: \"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\" > Text: {}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mTEXT\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mwav\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvocoder_model\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mTEXT\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mC\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mUSE_CUDA\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0map\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_griffin_lim\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mSPEAKER_FILEID\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspeaker_embedding\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mspeaker_embedding\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py\u001b[0m in \u001b[0;36mraw_input\u001b[0;34m(self, prompt)\u001b[0m\n\u001b[1;32m 702\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_parent_ident\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 703\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_parent_header\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 704\u001b[0;31m \u001b[0mpassword\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 705\u001b[0m )\n\u001b[1;32m 706\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py\u001b[0m in \u001b[0;36m_input_request\u001b[0;34m(self, prompt, ident, parent, password)\u001b[0m\n\u001b[1;32m 732\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyboardInterrupt\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 733\u001b[0m \u001b[0;31m# re-raise KeyboardInterrupt, to truncate traceback\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 734\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mKeyboardInterrupt\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 735\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 736\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m: " - ] - } - ] + "outputs": [] }, { "cell_type": "markdown", @@ -1176,11 +389,7 @@ "metadata": { "id": "5e5_XnLsx3jg", "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 704 - }, - "outputId": "3c82a3cd-f4b9-4493-aa4c-275c34d3294d" + "colab": {} }, "source": [ "import IPython\n", @@ -1200,69 +409,7 @@ " ap.save_wav(wav, out_path)" ], "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Synthesize sentence with Speaker: p244 (this speaker seen in training)\n", - "Enter sentence: Test this demonstration \n", - " > Text: Test this demonstration \n", - " > Run-time: 1.06947922706604\n", - " > Real-time factor: 
0.8609054588373298\n", - " > Time per step: 3.904344461788641e-05\n" - ], - "name": "stdout" - }, - { - "output_type": "display_data", - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": { - "tags": [] - } - }, - { - "output_type": "stream", - "text": [ - " > Saving output to tests-audios/Test_this_demonstration_.wav\n" - ], - "name": "stdout" - }, - { - "output_type": "error", - "ename": "KeyboardInterrupt", - "evalue": "ignored", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py\u001b[0m in \u001b[0;36m_input_request\u001b[0;34m(self, prompt, ident, parent, password)\u001b[0m\n\u001b[1;32m 728\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 729\u001b[0;31m \u001b[0mident\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreply\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msession\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstdin_socket\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 730\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/jupyter_client/session.py\u001b[0m in \u001b[0;36mrecv\u001b[0;34m(self, socket, mode, content, copy)\u001b[0m\n\u001b[1;32m 802\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 803\u001b[0;31m \u001b[0mmsg_list\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msocket\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv_multipart\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 804\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mzmq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mZMQError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/zmq/sugar/socket.py\u001b[0m in \u001b[0;36mrecv_multipart\u001b[0;34m(self, flags, copy, track)\u001b[0m\n\u001b[1;32m 474\u001b[0m \"\"\"\n\u001b[0;32m--> 475\u001b[0;31m \u001b[0mparts\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mflags\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrack\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtrack\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 476\u001b[0m \u001b[0;31m# have first part already, only loop while more to receive\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32mzmq/backend/cython/socket.pyx\u001b[0m in \u001b[0;36mzmq.backend.cython.socket.Socket.recv\u001b[0;34m()\u001b[0m\n", - 
"\u001b[0;32mzmq/backend/cython/socket.pyx\u001b[0m in \u001b[0;36mzmq.backend.cython.socket.Socket.recv\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32mzmq/backend/cython/socket.pyx\u001b[0m in \u001b[0;36mzmq.backend.cython.socket._recv_copy\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/zmq/backend/cython/checkrc.pxd\u001b[0m in \u001b[0;36mzmq.backend.cython.checkrc._check_rc\u001b[0;34m()\u001b[0m\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m: ", - "\nDuring handling of the above exception, another exception occurred:\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Synthesize sentence with Speaker: \"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mSpeaker_choise\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'_'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"(this speaker seen in training)\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mTEXT\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Enter sentence: \"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\" > Text: {}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mTEXT\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mwav\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvocoder_model\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mTEXT\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mC\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mUSE_CUDA\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0map\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_griffin_lim\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mSPEAKER_FILEID\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspeaker_embedding\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mspeaker_embedding\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py\u001b[0m in \u001b[0;36mraw_input\u001b[0;34m(self, prompt)\u001b[0m\n\u001b[1;32m 702\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_parent_ident\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 703\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_parent_header\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 704\u001b[0;31m \u001b[0mpassword\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 705\u001b[0m )\n\u001b[1;32m 706\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py\u001b[0m in \u001b[0;36m_input_request\u001b[0;34m(self, prompt, ident, parent, password)\u001b[0m\n\u001b[1;32m 
732\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyboardInterrupt\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 733\u001b[0m \u001b[0;31m# re-raise KeyboardInterrupt, to truncate traceback\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 734\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mKeyboardInterrupt\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 735\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 736\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m: " - ] - } - ] + "outputs": [] }, { "cell_type": "markdown", @@ -1309,11 +456,7 @@ "metadata": { "colab_type": "code", "id": "bbs85vzz4vHo", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 704 - }, - "outputId": "3c82a3cd-f4b9-4493-aa4c-275c34d3294d" + "colab": {} }, "source": [ "import IPython\n", @@ -1333,69 +476,7 @@ " ap.save_wav(wav, out_path)" ], "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Synthesize sentence with Speaker: p244 (this speaker seen in training)\n", - "Enter sentence: Test this demonstration \n", - " > Text: Test this demonstration \n", - " > Run-time: 1.06947922706604\n", - " > Real-time factor: 0.8609054588373298\n", - " > Time per step: 3.904344461788641e-05\n" - ], - "name": "stdout" - }, - { - "output_type": "display_data", - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": { - "tags": [] - } - }, - { - "output_type": "stream", - "text": [ - " > Saving output to tests-audios/Test_this_demonstration_.wav\n" - ], - "name": "stdout" - }, - { - "output_type": "error", - "ename": "KeyboardInterrupt", - "evalue": "ignored", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py\u001b[0m in \u001b[0;36m_input_request\u001b[0;34m(self, prompt, ident, parent, password)\u001b[0m\n\u001b[1;32m 728\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 729\u001b[0;31m \u001b[0mident\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreply\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msession\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstdin_socket\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 730\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/jupyter_client/session.py\u001b[0m in \u001b[0;36mrecv\u001b[0;34m(self, socket, mode, content, copy)\u001b[0m\n\u001b[1;32m 802\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 803\u001b[0;31m \u001b[0mmsg_list\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0msocket\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv_multipart\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 804\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mzmq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mZMQError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/zmq/sugar/socket.py\u001b[0m in \u001b[0;36mrecv_multipart\u001b[0;34m(self, flags, copy, track)\u001b[0m\n\u001b[1;32m 474\u001b[0m \"\"\"\n\u001b[0;32m--> 475\u001b[0;31m \u001b[0mparts\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mflags\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrack\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtrack\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 476\u001b[0m \u001b[0;31m# have first part already, only loop while more to receive\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32mzmq/backend/cython/socket.pyx\u001b[0m in \u001b[0;36mzmq.backend.cython.socket.Socket.recv\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32mzmq/backend/cython/socket.pyx\u001b[0m in \u001b[0;36mzmq.backend.cython.socket.Socket.recv\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32mzmq/backend/cython/socket.pyx\u001b[0m in \u001b[0;36mzmq.backend.cython.socket._recv_copy\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/zmq/backend/cython/checkrc.pxd\u001b[0m in \u001b[0;36mzmq.backend.cython.checkrc._check_rc\u001b[0;34m()\u001b[0m\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m: ", - "\nDuring handling of the above exception, another exception occurred:\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Synthesize sentence with Speaker: \"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mSpeaker_choise\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'_'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"(this speaker seen in training)\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mTEXT\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Enter sentence: \"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\" > Text: {}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mTEXT\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mwav\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mtts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvocoder_model\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mTEXT\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mC\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mUSE_CUDA\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0map\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_griffin_lim\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mSPEAKER_FILEID\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspeaker_embedding\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mspeaker_embedding\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py\u001b[0m in \u001b[0;36mraw_input\u001b[0;34m(self, prompt)\u001b[0m\n\u001b[1;32m 702\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_parent_ident\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 703\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_parent_header\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 704\u001b[0;31m \u001b[0mpassword\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 705\u001b[0m )\n\u001b[1;32m 706\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py\u001b[0m in \u001b[0;36m_input_request\u001b[0;34m(self, prompt, ident, parent, password)\u001b[0m\n\u001b[1;32m 732\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyboardInterrupt\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 733\u001b[0m \u001b[0;31m# re-raise KeyboardInterrupt, to truncate traceback\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 734\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mKeyboardInterrupt\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 735\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 736\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m: " - ] - } - ] + "outputs": [] }, { "cell_type": "markdown", @@ -1423,31 +504,14 @@ "metadata": { "id": "r0IEFZ0B5vQg", "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 121 - }, - "outputId": "93ee404d-3fdc-48ac-f867-9ef7f7ecaa0c" + "colab": {} }, "source": [ "!wget -c -q --show-progress -O ./SpeakerEncoder-checkpoint.zip https://github.com/Edresson/TTS/releases/download/v1.0.0/GE2E-SpeakerEncoder-iter25k.zip\n", "!unzip ./SpeakerEncoder-checkpoint.zip" ], "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "text": [ - "./SpeakerEncoder-ch 100%[===================>] 25.35M 25.1MB/s in 1.0s \n", - "Archive: ./SpeakerEncoder-checkpoint.zip\n", - " creating: GE2E-SpeakerEncoder/\n", - " inflating: GE2E-SpeakerEncoder/best_model.pth.tar \n", - " inflating: GE2E-SpeakerEncoder/config.json \n", - " inflating: GE2E-SpeakerEncoder/README.MD \n" - ], - "name": "stdout" - } - ] + "outputs": [] }, { "cell_type": "code", @@ -1470,11 +534,7 @@ "metadata": { "id": "tOwkfQqT6-Qo", "colab_type": "code", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 451 - }, - "outputId": "091c9020-302a-429e-bc7e-7fd2bb6f68f9" + "colab": {} }, "source": [ 
"from TTS.utils.audio import AudioProcessor\n", @@ -1489,39 +549,7 @@ " se_model.cuda()" ], "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "text": [ - " > Setting up Audio Processor...\n", - " | > sample_rate:16000\n", - " | > num_mels:40\n", - " | > min_level_db:-100\n", - " | > frame_shift_ms:12.5\n", - " | > frame_length_ms:50\n", - " | > ref_level_db:20\n", - " | > fft_size:1024\n", - " | > power:None\n", - " | > preemphasis:0.98\n", - " | > griffin_lim_iters:None\n", - " | > signal_norm:True\n", - " | > symmetric_norm:True\n", - " | > mel_fmin:0\n", - " | > mel_fmax:8000.0\n", - " | > spec_gain:20.0\n", - " | > stft_pad_mode:reflect\n", - " | > max_norm:4.0\n", - " | > clip_norm:True\n", - " | > do_trim_silence:False\n", - " | > trim_db:60\n", - " | > do_sound_norm:False\n", - " | > stats_path:None\n", - " | > hop_length:200\n", - " | > win_length:800\n" - ], - "name": "stdout" - } - ] + "outputs": [] }, { "cell_type": "markdown", @@ -1542,66 +570,14 @@ "metadata": { "id": "_FWwHPjJ8NXl", "colab_type": "code", - "colab": { - "resources": { - "http://localhost:8080/nbextensions/google.colab/files.js": { - "data": "Ly8gQ29weXJpZ2h0IDIwMTcgR29vZ2xlIExMQwovLwovLyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKLy8geW91IG1heSBub3QgdXNlIHRoaXMgZmlsZSBleGNlcHQgaW4gY29tcGxpYW5jZSB3aXRoIHRoZSBMaWNlbnNlLgovLyBZb3UgbWF5IG9idGFpbiBhIGNvcHkgb2YgdGhlIExpY2Vuc2UgYXQKLy8KLy8gICAgICBodHRwOi8vd3d3LmFwYWNoZS5vcmcvbGljZW5zZXMvTElDRU5TRS0yLjAKLy8KLy8gVW5sZXNzIHJlcXVpcmVkIGJ5IGFwcGxpY2FibGUgbGF3IG9yIGFncmVlZCB0byBpbiB3cml0aW5nLCBzb2Z0d2FyZQovLyBkaXN0cmlidXRlZCB1bmRlciB0aGUgTGljZW5zZSBpcyBkaXN0cmlidXRlZCBvbiBhbiAiQVMgSVMiIEJBU0lTLAovLyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KLy8gU2VlIHRoZSBMaWNlbnNlIGZvciB0aGUgc3BlY2lmaWMgbGFuZ3VhZ2UgZ292ZXJuaW5nIHBlcm1pc3Npb25zIGFuZAovLyBsaW1pdGF0aW9ucyB1bmRlciB0aGUgTGljZW5zZS4KCi8qKgogKiBAZmlsZW92ZXJ2aWV3IEhlbHBlcnMgZm9yIGdvb2dsZS5jb2xhYiBQeXRob24gbW9kdWxlLgogKi8KKGZ1bmN0aW9uKHNjb3BlKSB7CmZ1bmN0aW9uIHNwYW4odGV4dCwgc3R5bGVBdHRyaWJ1dGVzID0ge30pIHsKICBjb25zdCBlbGVtZW50ID0gZG9jdW1lbnQuY3JlYXRlRWxlbWVudCgnc3BhbicpOwogIGVsZW1lbnQudGV4dENvbnRlbnQgPSB0ZXh0OwogIGZvciAoY29uc3Qga2V5IG9mIE9iamVjdC5rZXlzKHN0eWxlQXR0cmlidXRlcykpIHsKICAgIGVsZW1lbnQuc3R5bGVba2V5XSA9IHN0eWxlQXR0cmlidXRlc1trZXldOwogIH0KICByZXR1cm4gZWxlbWVudDsKfQoKLy8gTWF4IG51bWJlciBvZiBieXRlcyB3aGljaCB3aWxsIGJlIHVwbG9hZGVkIGF0IGEgdGltZS4KY29uc3QgTUFYX1BBWUxPQURfU0laRSA9IDEwMCAqIDEwMjQ7CgpmdW5jdGlvbiBfdXBsb2FkRmlsZXMoaW5wdXRJZCwgb3V0cHV0SWQpIHsKICBjb25zdCBzdGVwcyA9IHVwbG9hZEZpbGVzU3RlcChpbnB1dElkLCBvdXRwdXRJZCk7CiAgY29uc3Qgb3V0cHV0RWxlbWVudCA9IGRvY3VtZW50LmdldEVsZW1lbnRCeUlkKG91dHB1dElkKTsKICAvLyBDYWNoZSBzdGVwcyBvbiB0aGUgb3V0cHV0RWxlbWVudCB0byBtYWtlIGl0IGF2YWlsYWJsZSBmb3IgdGhlIG5leHQgY2FsbAogIC8vIHRvIHVwbG9hZEZpbGVzQ29udGludWUgZnJvbSBQeXRob24uCiAgb3V0cHV0RWxlbWVudC5zdGVwcyA9IHN0ZXBzOwoKICByZXR1cm4gX3VwbG9hZEZpbGVzQ29udGludWUob3V0cHV0SWQpOwp9CgovLyBUaGlzIGlzIHJvdWdobHkgYW4gYXN5bmMgZ2VuZXJhdG9yIChub3Qgc3VwcG9ydGVkIGluIHRoZSBicm93c2VyIHlldCksCi8vIHdoZXJlIHRoZXJlIGFyZSBtdWx0aXBsZSBhc3luY2hyb25vdXMgc3RlcHMgYW5kIHRoZSBQeXRob24gc2lkZSBpcyBnb2luZwovLyB0byBwb2xsIGZvciBjb21wbGV0aW9uIG9mIGVhY2ggc3RlcC4KLy8gVGhpcyB1c2VzIGEgUHJvbWlzZSB0byBibG9jayB0aGUgcHl0aG9uIHNpZGUgb24gY29tcGxldGlvbiBvZiBlYWNoIHN0ZXAsCi8vIHRoZW4gcGFzc2VzIHRoZSByZXN1bHQgb2YgdGhlIHByZXZpb3VzIHN0ZXAgYXMgdGhlIGlucHV0IHRvIHRoZSBuZXh0IHN0ZXAuCmZ1bmN0aW9uIF91cGxvYWRGaWxlc0NvbnRpbnVlKG91dHB1dElkKSB7CiAgY29uc3Qgb3V0cHV0RWxlbWVudCA9IGRvY3VtZW50LmdldEVsZW1lbnRCe
[... base64-encoded google.colab files.js payload truncated ...]
b25zdCBsZW5ndGggPSBNYXRoLm1pbihmaWxlRGF0YS5ieXRlTGVuZ3RoIC0gcG9zaXRpb24sIE1BWF9QQVlMT0FEX1NJWkUpOwogICAgICBjb25zdCBjaHVuayA9IG5ldyBVaW50OEFycmF5KGZpbGVEYXRhLCBwb3NpdGlvbiwgbGVuZ3RoKTsKICAgICAgcG9zaXRpb24gKz0gbGVuZ3RoOwoKICAgICAgY29uc3QgYmFzZTY0ID0gYnRvYShTdHJpbmcuZnJvbUNoYXJDb2RlLmFwcGx5KG51bGwsIGNodW5rKSk7CiAgICAgIHlpZWxkIHsKICAgICAgICByZXNwb25zZTogewogICAgICAgICAgYWN0aW9uOiAnYXBwZW5kJywKICAgICAgICAgIGZpbGU6IGZpbGUubmFtZSwKICAgICAgICAgIGRhdGE6IGJhc2U2NCwKICAgICAgICB9LAogICAgICB9OwogICAgICBwZXJjZW50LnRleHRDb250ZW50ID0KICAgICAgICAgIGAke01hdGgucm91bmQoKHBvc2l0aW9uIC8gZmlsZURhdGEuYnl0ZUxlbmd0aCkgKiAxMDApfSUgZG9uZWA7CiAgICB9CiAgfQoKICAvLyBBbGwgZG9uZS4KICB5aWVsZCB7CiAgICByZXNwb25zZTogewogICAgICBhY3Rpb246ICdjb21wbGV0ZScsCiAgICB9CiAgfTsKfQoKc2NvcGUuZ29vZ2xlID0gc2NvcGUuZ29vZ2xlIHx8IHt9OwpzY29wZS5nb29nbGUuY29sYWIgPSBzY29wZS5nb29nbGUuY29sYWIgfHwge307CnNjb3BlLmdvb2dsZS5jb2xhYi5fZmlsZXMgPSB7CiAgX3VwbG9hZEZpbGVzLAogIF91cGxvYWRGaWxlc0NvbnRpbnVlLAp9Owp9KShzZWxmKTsK", - "ok": true, - "headers": [ - [ - "content-type", - "application/javascript" - ] - ], - "status": 200, - "status_text": "OK" - } - }, - "base_uri": "https://localhost:8080/", - "height": 246 - }, - "outputId": "cb0ca2ad-73b9-44ce-d0bd-6139552232d8" + "colab": {} }, "source": [ "from google.colab import files\n", "file_list = files.upload()" ], "execution_count": null, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " Upload widget is only available when the cell has been executed in the\n", - " current browser session. Please rerun this cell to enable.\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": { - "tags": [] - } - }, - { - "output_type": "stream", - "text": [ - "Saving p238_001.wav to p238_001.wav\n", - "Saving p238_002.wav to p238_002.wav\n", - "Saving p238_003.wav to p238_003.wav\n", - "Saving p238_004.wav to p238_004.wav\n", - "Saving p238_005.wav to p238_005.wav\n", - "Saving p238_006.wav to p238_006.wav\n" - ], - "name": "stdout" - } - ] + "outputs": [] }, { "cell_type": "code", @@ -1635,11 +611,7 @@ "metadata": { "colab_type": "code", "id": "xmItcGac5WiG", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 872 - }, - "outputId": "4940027a-a303-4782-ef47-0c91716665a1" + "colab": {} }, "source": [ "import IPython\n", @@ -1659,100 +631,7 @@ " ap.save_wav(wav, out_path)" ], "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Synthesize sentence with New Speaker using files: dict_keys(['p238_001.wav', 'p238_002.wav', 'p238_003.wav', 'p238_004.wav', 'p238_005.wav', 'p238_006.wav']) (this speaker not seen in training (new speaker))\n", - "Enter sentence: Test this demonstration\n", - " > Text: Test this demonstration\n", - " > Run-time: 1.1401546001434326\n", - " > Real-time factor: 0.8927653497084975\n", - " > Time per step: 4.0488326075402176e-05\n" - ], - "name": "stdout" - }, - { - "output_type": "display_data", - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": { - "tags": [] - } - }, - { - "output_type": "stream", - "text": [ - " > Saving output to tests-audios/Test_this_demonstration.wav\n", - "Enter sentence: Ask her to bring these things with her from the store.\n", - " > Text: Ask her to bring these things with her from the store.\n", - " > Run-time: 1.594754934310913\n", - " > Real-time factor: 0.6868010717444122\n", - " > Time per step: 3.1147492118179796e-05\n" - ], - "name": "stdout" - }, - { - "output_type": "display_data", - "data": { - 
"text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": { - "tags": [] - } - }, - { - "output_type": "stream", - "text": [ - " > Saving output to tests-audios/Ask_her_to_bring_these_things_with_her_from_the_store.wav\n" - ], - "name": "stdout" - }, - { - "output_type": "error", - "ename": "KeyboardInterrupt", - "evalue": "ignored", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py\u001b[0m in \u001b[0;36m_input_request\u001b[0;34m(self, prompt, ident, parent, password)\u001b[0m\n\u001b[1;32m 728\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 729\u001b[0;31m \u001b[0mident\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreply\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msession\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstdin_socket\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 730\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/jupyter_client/session.py\u001b[0m in \u001b[0;36mrecv\u001b[0;34m(self, socket, mode, content, copy)\u001b[0m\n\u001b[1;32m 802\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 803\u001b[0;31m \u001b[0mmsg_list\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msocket\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv_multipart\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 804\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mzmq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mZMQError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/zmq/sugar/socket.py\u001b[0m in \u001b[0;36mrecv_multipart\u001b[0;34m(self, flags, copy, track)\u001b[0m\n\u001b[1;32m 474\u001b[0m \"\"\"\n\u001b[0;32m--> 475\u001b[0;31m \u001b[0mparts\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mflags\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrack\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtrack\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 476\u001b[0m \u001b[0;31m# have first part already, only loop while more to receive\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32mzmq/backend/cython/socket.pyx\u001b[0m in \u001b[0;36mzmq.backend.cython.socket.Socket.recv\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32mzmq/backend/cython/socket.pyx\u001b[0m in \u001b[0;36mzmq.backend.cython.socket.Socket.recv\u001b[0;34m()\u001b[0m\n", - 
"\u001b[0;32mzmq/backend/cython/socket.pyx\u001b[0m in \u001b[0;36mzmq.backend.cython.socket._recv_copy\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/zmq/backend/cython/checkrc.pxd\u001b[0m in \u001b[0;36mzmq.backend.cython.checkrc._check_rc\u001b[0;34m()\u001b[0m\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m: ", - "\nDuring handling of the above exception, another exception occurred:\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Synthesize sentence with New Speaker using files: \"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mfile_list\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkeys\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"(this speaker not seen in training (new speaker))\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mTEXT\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Enter sentence: \"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\" > Text: {}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mTEXT\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mwav\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvocoder_model\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mTEXT\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mC\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mUSE_CUDA\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0map\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_griffin_lim\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mSPEAKER_FILEID\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspeaker_embedding\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mspeaker_embedding\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py\u001b[0m in \u001b[0;36mraw_input\u001b[0;34m(self, prompt)\u001b[0m\n\u001b[1;32m 702\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_parent_ident\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 703\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_parent_header\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 704\u001b[0;31m \u001b[0mpassword\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 705\u001b[0m )\n\u001b[1;32m 706\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py\u001b[0m in \u001b[0;36m_input_request\u001b[0;34m(self, prompt, ident, parent, password)\u001b[0m\n\u001b[1;32m 732\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyboardInterrupt\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 733\u001b[0m \u001b[0;31m# 
re-raise KeyboardInterrupt, to truncate traceback\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 734\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mKeyboardInterrupt\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 735\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 736\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m: " - ] - } - ] + "outputs": [] } ] } \ No newline at end of file From 9d28f311689018e81998355f029f520890a19a6f Mon Sep 17 00:00:00 2001 From: Edresson Date: Thu, 30 Jul 2020 23:24:37 -0300 Subject: [PATCH 38/56] add Colab Notebook from TTS Multi-Speaker with GST --- ...MultiSpeaker_jia_et_al_2018_With_GST.ipynb | 834 ++++++++++++++++++ 1 file changed, 834 insertions(+) create mode 100755 notebooks/Demo_Mozilla_TTS_MultiSpeaker_jia_et_al_2018_With_GST.ipynb diff --git a/notebooks/Demo_Mozilla_TTS_MultiSpeaker_jia_et_al_2018_With_GST.ipynb b/notebooks/Demo_Mozilla_TTS_MultiSpeaker_jia_et_al_2018_With_GST.ipynb new file mode 100755 index 00000000..e059461e --- /dev/null +++ b/notebooks/Demo_Mozilla_TTS_MultiSpeaker_jia_et_al_2018_With_GST.ipynb @@ -0,0 +1,834 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "Demo-Mozilla-TTS-MultiSpeaker-jia-et-al-2018-With-GST.ipynb", + "provenance": [], + "collapsed_sections": [ + "yZK6UdwSFnOO", + "ENA2OumIVeMA", + "dV6cXXlfi72r", + "vnV-FigfvsS2", + "g_G_HweN04W-", + "LEE6mQLh5Who" + ], + "toc_visible": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "yZK6UdwSFnOO", + "colab_type": "text" + }, + "source": [ + "# **Download and install Mozilla TTS**" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "yvb0pX3WY6MN", + "colab_type": "code", + "colab": {} + }, + "source": [ + "import os \n", + "!git clone https://github.com/Edresson/TTS -b dev-gst-embeddings" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "iB9nl2UEG3SY", + "colab_type": "code", + "colab": {} + }, + "source": [ + "!apt-get install espeak\n", + "os.chdir('TTS')\n", + "!pip install -r requirements.txt\n", + "!python setup.py develop\n", + "os.chdir('..')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "w6Krn8k1inC_", + "colab_type": "text" + }, + "source": [ + "\n", + "\n", + "**Download Checkpoint**\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "PiYHf3lKhi9z", + "colab_type": "code", + "colab": {} + }, + "source": [ + "!wget -c -q --show-progress -O ./TTS-checkpoint.zip https://github.com/Edresson/TTS/releases/download/v1.0.0/Checkpoints-TTS-MultiSpeaker-Jia-et-al-2018-with-GST.zip\n", + "!unzip ./TTS-checkpoint.zip\n", + "\n", + "# Download gst style example\n", + "!wget https://github.com/Edresson/TTS/releases/download/v1.0.0/gst-style-example.wav" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MpYNgqrZcJKn", + "colab_type": "text" + }, + "source": [ + "**Utils Functions**" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "4KZA4b_CbMqx", + "colab_type": "code", + "colab": {} + }, + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + 
"import argparse\n", + "import json\n", + "# pylint: disable=redefined-outer-name, unused-argument\n", + "import os\n", + "import string\n", + "import time\n", + "import sys\n", + "import numpy as np\n", + "\n", + "TTS_PATH = \"../content/TTS\"\n", + "# add libraries into environment\n", + "sys.path.append(TTS_PATH) # set this if TTS is not installed globally\n", + "\n", + "import torch\n", + "\n", + "from TTS.tts.utils.generic_utils import setup_model\n", + "from TTS.tts.utils.synthesis import synthesis\n", + "from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols\n", + "from TTS.utils.audio import AudioProcessor\n", + "from TTS.utils.io import load_config\n", + "from TTS.vocoder.utils.generic_utils import setup_generator\n", + "\n", + "\n", + "def tts(model, vocoder_model, text, CONFIG, use_cuda, ap, use_gl, speaker_fileid, speaker_embedding=None, gst_style=None):\n", + " t_1 = time.time()\n", + " waveform, _, _, mel_postnet_spec, _, _ = synthesis(model, text, CONFIG, use_cuda, ap, speaker_fileid, gst_style, False, CONFIG.enable_eos_bos_chars, use_gl, speaker_embedding=speaker_embedding)\n", + " if CONFIG.model == \"Tacotron\" and not use_gl:\n", + " mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n", + " if not use_gl:\n", + " waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n", + " if use_cuda and not use_gl:\n", + " waveform = waveform.cpu()\n", + " if not use_gl:\n", + " waveform = waveform.numpy()\n", + " waveform = waveform.squeeze()\n", + " rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n", + " tps = (time.time() - t_1) / len(waveform)\n", + " print(\" > Run-time: {}\".format(time.time() - t_1))\n", + " print(\" > Real-time factor: {}\".format(rtf))\n", + " print(\" > Time per step: {}\".format(tps))\n", + " return waveform\n", + "\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ENA2OumIVeMA", + "colab_type": "text" + }, + "source": [ + "# **Vars definitions**\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "jPD0d_XpVXmY", + "colab_type": "code", + "colab": {} + }, + "source": [ + "TEXT = ''\n", + "OUT_PATH = 'tests-audios/'\n", + "# create output path\n", + "os.makedirs(OUT_PATH, exist_ok=True)\n", + "\n", + "SPEAKER_FILEID = None # if None use the first embedding from speakers.json\n", + "\n", + "# model vars \n", + "MODEL_PATH = 'best_model.pth.tar'\n", + "CONFIG_PATH = 'config.json'\n", + "SPEAKER_JSON = 'speakers.json'\n", + "\n", + "# vocoder vars\n", + "VOCODER_PATH = ''\n", + "VOCODER_CONFIG_PATH = ''\n", + "\n", + "USE_CUDA = True" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dV6cXXlfi72r", + "colab_type": "text" + }, + "source": [ + "# **Restore TTS Model**" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "x1WgLFauWUPe", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# load the config\n", + "C = load_config(CONFIG_PATH)\n", + "C.forward_attn_mask = True\n", + "\n", + "# load the audio processor\n", + "ap = AudioProcessor(**C.audio)\n", + "\n", + "# if the vocabulary was passed, replace the default\n", + "if 'characters' in C.keys():\n", + " symbols, phonemes = make_symbols(**C.characters)\n", + "\n", + "speaker_embedding = None\n", + "speaker_embedding_dim = None\n", + "num_speakers = 0\n", + "# load speakers\n", + "if SPEAKER_JSON != '':\n", + " speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n", + " num_speakers = 
len(speaker_mapping)\n", + " if C.use_external_speaker_embedding_file:\n", + " if SPEAKER_FILEID is not None:\n", + " speaker_embedding = speaker_mapping[SPEAKER_FILEID]['embedding']\n", + " else: # if speaker_fileid is not specificated use the first sample in speakers.json\n", + " choise_speaker = list(speaker_mapping.keys())[0]\n", + " print(\" Speaker: \",choise_speaker.split('_')[0],'was chosen automatically', \"(this speaker seen in training)\")\n", + " speaker_embedding = speaker_mapping[choise_speaker]['embedding']\n", + " speaker_embedding_dim = len(speaker_embedding)\n", + "\n", + "# load the model\n", + "num_chars = len(phonemes) if C.use_phonemes else len(symbols)\n", + "model = setup_model(num_chars, num_speakers, C, speaker_embedding_dim)\n", + "cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))\n", + "model.load_state_dict(cp['model'])\n", + "model.eval()\n", + "\n", + "if USE_CUDA:\n", + " model.cuda()\n", + "\n", + "model.decoder.set_r(cp['r'])\n", + "\n", + "# load vocoder model\n", + "if VOCODER_PATH!= \"\":\n", + " VC = load_config(VOCODER_CONFIG_PATH)\n", + " vocoder_model = setup_generator(VC)\n", + " vocoder_model.load_state_dict(torch.load(VOCODER_PATH, map_location=\"cpu\")[\"model\"])\n", + " vocoder_model.remove_weight_norm()\n", + " if USE_CUDA:\n", + " vocoder_model.cuda()\n", + " vocoder_model.eval()\n", + "else:\n", + " vocoder_model = None\n", + " VC = None\n", + "\n", + "# synthesize voice\n", + "use_griffin_lim = VOCODER_PATH== \"\"\n", + "\n", + "if not C.use_external_speaker_embedding_file:\n", + " if SPEAKER_FILEID.isdigit():\n", + " SPEAKER_FILEID = int(SPEAKER_FILEID)\n", + " else:\n", + " SPEAKER_FILEID = None\n", + "else:\n", + " SPEAKER_FILEID = None\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tNvVEoE30qY6", + "colab_type": "text" + }, + "source": [ + "Synthesize sentence with Speaker\n", + "\n", + "> Stop running the cell to leave!\n", + "\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "2o8fXkVSyXOa", + "colab_type": "code", + "colab": {} + }, + "source": [ + "import IPython\n", + "from IPython.display import Audio\n", + "print(\"Synthesize sentence with Speaker: \",choise_speaker.split('_')[0], \"(this speaker seen in training)\")\n", + "gst_style = 'gst-style-example.wav'\n", + "while True:\n", + " TEXT = input(\"Enter sentence: \")\n", + " print(\" > Text: {}\".format(TEXT))\n", + " wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n", + " IPython.display.display(Audio(wav, rate=ap.sample_rate))\n", + " # save the results\n", + " file_name = TEXT.replace(\" \", \"_\")\n", + " file_name = file_name.translate(\n", + " str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n", + " out_path = os.path.join(OUT_PATH, file_name)\n", + " print(\" > Saving output to {}\".format(out_path))\n", + " ap.save_wav(wav, out_path)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vnV-FigfvsS2", + "colab_type": "text" + }, + "source": [ + "# **Select Speaker**\n", + "\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "RuCGOnJ_fgDV", + "colab_type": "code", + "colab": {} + }, + "source": [ + "\n", + "# VCTK speakers not seen in training (new speakers)\n", + "VCTK_test_Speakers = [\"p225\", \"p234\", \"p238\", \"p245\", \"p248\", \"p261\", \"p294\", \"p302\", \"p326\", \"p335\", \"p347\"]\n", 
+ "\n", + "# VCTK speakers seen in training\n", + "VCTK_train_Speakers = ['p244', 'p300', 'p303', 'p273', 'p292', 'p252', 'p254', 'p269', 'p345', 'p274', 'p363', 'p285', 'p351', 'p361', 'p295', 'p266', 'p307', 'p230', 'p339', 'p253', 'p310', 'p241', 'p256', 'p323', 'p237', 'p229', 'p298', 'p336', 'p276', 'p305', 'p255', 'p278', 'p299', 'p265', 'p267', 'p280', 'p260', 'p272', 'p262', 'p334', 'p283', 'p247', 'p246', 'p374', 'p297', 'p249', 'p250', 'p304', 'p240', 'p236', 'p312', 'p286', 'p263', 'p258', 'p313', 'p376', 'p279', 'p340', 'p362', 'p284', 'p231', 'p308', 'p277', 'p275', 'p333', 'p314', 'p330', 'p264', 'p226', 'p288', 'p343', 'p239', 'p232', 'p268', 'p270', 'p329', 'p227', 'p271', 'p228', 'p311', 'p301', 'p293', 'p364', 'p251', 'p317', 'p360', 'p281', 'p243', 'p287', 'p233', 'p259', 'p316', 'p257', 'p282', 'p306', 'p341', 'p318']\n", + "\n", + "\n", + "num_samples_speaker = 2 # In theory the more samples of the speaker the more similar to the real voice it will be!\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hkvv7gRcx4WV", + "colab_type": "text" + }, + "source": [ + "## **Example select a VCTK seen speaker in training**" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "BviNMI9UyCYz", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# get embedding\n", + "Speaker_choise = VCTK_train_Speakers[0] # choise one of training speakers\n", + "# load speakers\n", + "if SPEAKER_JSON != '':\n", + " speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n", + " if C.use_external_speaker_embedding_file:\n", + " speaker_embeddings = []\n", + " for key in list(speaker_mapping.keys()):\n", + " if Speaker_choise in key:\n", + " if len(speaker_embeddings) < num_samples_speaker:\n", + " speaker_embeddings.append(speaker_mapping[key]['embedding'])\n", + " # takes the average of the embedings samples of the announcers\n", + " speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()\n", + " " + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "5e5_XnLsx3jg", + "colab_type": "code", + "colab": {} + }, + "source": [ + "import IPython\n", + "from IPython.display import Audio\n", + "print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker seen in training)\")\n", + "gst_style = 'gst-style-example.wav'\n", + "while True:\n", + " TEXT = input(\"Enter sentence: \")\n", + " print(\" > Text: {}\".format(TEXT))\n", + " wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n", + " IPython.display.display(Audio(wav, rate=ap.sample_rate))\n", + " # save the results\n", + " file_name = TEXT.replace(\" \", \"_\")\n", + " file_name = file_name.translate(\n", + " str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n", + " out_path = os.path.join(OUT_PATH, file_name)\n", + " print(\" > Saving output to {}\".format(out_path))\n", + " ap.save_wav(wav, out_path)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "QJ6VgT2a4vHW" + }, + "source": [ + "## **Example select a VCTK not seen speaker in training (new Speakers)**\n", + "\n", + "\n", + "> Fitting new Speakers :)\n", + "\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab_type": "code", + "id": "SZS57ZK-4vHa", + "colab": {} + }, + "source": [ + "# get embedding\n", + 
"Speaker_choise = VCTK_test_Speakers[0] # choise one of training speakers\n", + "# load speakers\n", + "if SPEAKER_JSON != '':\n", + " speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n", + " if C.use_external_speaker_embedding_file:\n", + " speaker_embeddings = []\n", + " for key in list(speaker_mapping.keys()):\n", + " if Speaker_choise in key:\n", + " if len(speaker_embeddings) < num_samples_speaker:\n", + " speaker_embeddings.append(speaker_mapping[key]['embedding'])\n", + " # takes the average of the embedings samples of the announcers\n", + " speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()\n", + " " + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab_type": "code", + "id": "bbs85vzz4vHo", + "colab": {} + }, + "source": [ + "import IPython\n", + "from IPython.display import Audio\n", + "print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker not seen in training (new speaker))\")\n", + "gst_style = 'gst-style-example.wav'\n", + "while True:\n", + " TEXT = input(\"Enter sentence: \")\n", + " print(\" > Text: {}\".format(TEXT))\n", + " wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n", + " IPython.display.display(Audio(wav, rate=ap.sample_rate))\n", + " # save the results\n", + " file_name = TEXT.replace(\" \", \"_\")\n", + " file_name = file_name.translate(\n", + " str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n", + " out_path = os.path.join(OUT_PATH, file_name)\n", + " print(\" > Saving output to {}\".format(out_path))\n", + " ap.save_wav(wav, out_path)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "g_G_HweN04W-", + "colab_type": "text" + }, + "source": [ + "# **Changing GST tokens manually (without wav reference)**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jyFP5syW2bjt", + "colab_type": "text" + }, + "source": [ + "You can define tokens manually, this way you can increase/decrease the function of a given GST token. For example a token is responsible for the length of the speaker's pauses, if you increase the value of that token you will have longer pauses and if you decrease it you will have shorter pauses." 
+ ] + }, + { + "cell_type": "code", + "metadata": { + "id": "SpwjDjCM2a3Y", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# set gst tokens, in this model we have 5 tokens\n", + "gst_style = {\"0\": 0, \"1\": 0, \"3\": 0, \"4\": 0}" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "qWChMbI_0z5X", + "colab_type": "code", + "colab": {} + }, + "source": [ + "import IPython\n", + "from IPython.display import Audio\n", + "print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker not seen in training (new speaker))\")\n", + "TEXT = input(\"Enter sentence: \")\n", + "print(\" > Text: {}\".format(TEXT))\n", + "wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n", + "IPython.display.display(Audio(wav, rate=ap.sample_rate))\n", + "# save the results\n", + "file_name = TEXT.replace(\" \", \"_\")\n", + "file_name = file_name.translate(\n", + " str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n", + "out_path = os.path.join(OUT_PATH, file_name)\n", + "print(\" > Saving output to {}\".format(out_path))\n", + "ap.save_wav(wav, out_path)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "uFjUi9xQ3mG3", + "colab_type": "code", + "colab": {} + }, + "source": [ + "gst_style = {\"0\": 0.9, \"1\": 0, \"3\": 0, \"4\": 0}\n", + "print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker not seen in training (new speaker))\")\n", + "TEXT = input(\"Enter sentence: \")\n", + "print(\" > Text: {}\".format(TEXT))\n", + "wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n", + "IPython.display.display(Audio(wav, rate=ap.sample_rate))\n", + "# save the results\n", + "file_name = TEXT.replace(\" \", \"_\")\n", + "file_name = file_name.translate(\n", + " str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n", + "out_path = os.path.join(OUT_PATH, file_name)\n", + "print(\" > Saving output to {}\".format(out_path))\n", + "ap.save_wav(wav, out_path)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "Uw0d6gWg4L27", + "colab_type": "code", + "colab": {} + }, + "source": [ + "gst_style = {\"0\": -0.9, \"1\": 0, \"3\": 0, \"4\": 0}\n", + "print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker not seen in training (new speaker))\")\n", + "TEXT = input(\"Enter sentence: \")\n", + "print(\" > Text: {}\".format(TEXT))\n", + "wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n", + "IPython.display.display(Audio(wav, rate=ap.sample_rate))\n", + "# save the results\n", + "file_name = TEXT.replace(\" \", \"_\")\n", + "file_name = file_name.translate(\n", + " str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n", + "out_path = os.path.join(OUT_PATH, file_name)\n", + "print(\" > Saving output to {}\".format(out_path))\n", + "ap.save_wav(wav, out_path)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "V9izw4-54-Tl", + "colab_type": "code", + "colab": {} + }, + "source": [ + "gst_style = {\"0\": 0, \"1\": 0.9, \"3\": 0, \"4\": 0}\n", + "print(\"Synthesize sentence with 
Speaker: \",Speaker_choise.split('_')[0], \"(this speaker not seen in training (new speaker))\")\n", + "TEXT = input(\"Enter sentence: \")\n", + "print(\" > Text: {}\".format(TEXT))\n", + "wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n", + "IPython.display.display(Audio(wav, rate=ap.sample_rate))\n", + "# save the results\n", + "file_name = TEXT.replace(\" \", \"_\")\n", + "file_name = file_name.translate(\n", + " str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n", + "out_path = os.path.join(OUT_PATH, file_name)\n", + "print(\" > Saving output to {}\".format(out_path))\n", + "ap.save_wav(wav, out_path)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "LEE6mQLh5Who" + }, + "source": [ + "# **Example Synthesizing with your own voice :)**\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "La70gSB65nrs", + "colab_type": "text" + }, + "source": [ + " Download and load GE2E Speaker Encoder " + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "r0IEFZ0B5vQg", + "colab_type": "code", + "colab": {} + }, + "source": [ + "!wget -c -q --show-progress -O ./SpeakerEncoder-checkpoint.zip https://github.com/Edresson/TTS/releases/download/v1.0.0/GE2E-SpeakerEncoder-iter25k.zip\n", + "!unzip ./SpeakerEncoder-checkpoint.zip" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "jEH8HCTh5mF6", + "colab_type": "code", + "colab": {} + }, + "source": [ + "SE_MODEL_RUN_PATH = \"GE2E-SpeakerEncoder/\"\n", + "SE_MODEL_PATH = os.path.join(SE_MODEL_RUN_PATH, \"best_model.pth.tar\")\n", + "SE_CONFIG_PATH =os.path.join(SE_MODEL_RUN_PATH, \"config.json\")\n", + "USE_CUDA = True" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "tOwkfQqT6-Qo", + "colab_type": "code", + "colab": {} + }, + "source": [ + "from TTS.utils.audio import AudioProcessor\n", + "from TTS.speaker_encoder.model import SpeakerEncoder\n", + "se_config = load_config(SE_CONFIG_PATH)\n", + "se_ap = AudioProcessor(**se_config['audio'])\n", + "\n", + "se_model = SpeakerEncoder(**se_config.model)\n", + "se_model.load_state_dict(torch.load(SE_MODEL_PATH)['model'])\n", + "se_model.eval()\n", + "if USE_CUDA:\n", + " se_model.cuda()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0TLlbUFG8O36", + "colab_type": "text" + }, + "source": [ + "Upload one or more wav audio files in your voice.\n", + "\n", + "\n", + "> We recommend files longer than 3 seconds, the bigger the file the closer to your voice :)\n", + "\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "_FWwHPjJ8NXl", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# select one or more wav files\n", + "from google.colab import files\n", + "file_list = files.upload()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "WWOf6sgbBbGY", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# extract embedding from wav files\n", + "speaker_embeddings = []\n", + "for name in file_list.keys():\n", + " if '.wav' in name:\n", + " mel_spec = se_ap.melspectrogram(se_ap.load_wav(name, sr=se_ap.sample_rate)).T\n", + " mel_spec = torch.FloatTensor(mel_spec[None, :, :])\n", + " if USE_CUDA:\n", + " mel_spec = mel_spec.cuda()\n", + " embedd 
= se_model.compute_embedding(mel_spec).cpu().detach().numpy().reshape(-1)\n",
+    "        speaker_embeddings.append(embedd)\n",
+    "    else:\n",
+    "        print(\"You need to upload WAV files; other file types are not supported!\")\n",
+    "\n",
+    "# take the average of the embeddings of the uploaded samples\n",
+    "speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()"
+   ],
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "id": "AQ7eP31d9yzq",
+    "colab_type": "code",
+    "colab": {}
+   },
+   "source": [
+    "import IPython\n",
+    "from IPython.display import Audio\n",
+    "print(\"Synthesize sentence with New Speaker using files: \",file_list.keys(), \"(this speaker not seen in training (new speaker))\")\n",
+    "gst_style = {\"0\": 0, \"1\": 0.0, \"3\": 0, \"4\": 0}\n",
+    "gst_style = 'gst-style-example.wav'\n",
+    "TEXT = input(\"Enter sentence: \")\n",
+    "print(\" > Text: {}\".format(TEXT))\n",
+    "wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
+    "IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
+    "# save the results\n",
+    "file_name = TEXT.replace(\" \", \"_\")\n",
+    "file_name = file_name.translate(\n",
+    "    str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
+    "out_path = os.path.join(OUT_PATH, file_name)\n",
+    "print(\" > Saving output to {}\".format(out_path))\n",
+    "ap.save_wav(wav, out_path)"
+   ],
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "11i10yE1-LMJ",
+    "colab_type": "text"
+   },
+   "source": [
+    "Uploading your own GST reference wav file"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "id": "eKohSQG1-KkT",
+    "colab_type": "code",
+    "colab": {}
+   },
+   "source": [
+    "# select one wav file for GST reference\n",
+    "from google.colab import files\n",
+    "file_list = files.upload()\n"
+   ],
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "colab_type": "code",
+    "id": "xmItcGac5WiG",
+    "colab": {}
+   },
+   "source": [
+    "print(\"Synthesize sentence with New Speaker using files: \",file_list.keys(), \"(this speaker not seen in training (new speaker))\")\n",
+    "gst_style = list(file_list.keys())[0]\n",
+    "TEXT = input(\"Enter sentence: \")\n",
+    "print(\" > Text: {}\".format(TEXT))\n",
+    "wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
+    "IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
+    "# save the results\n",
+    "file_name = TEXT.replace(\" \", \"_\")\n",
+    "file_name = file_name.translate(\n",
+    "    str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
+    "out_path = os.path.join(OUT_PATH, file_name)\n",
+    "print(\" > Saving output to {}\".format(out_path))\n",
+    "ap.save_wav(wav, out_path)"
+   ],
+   "execution_count": null,
+   "outputs": []
+  }
+ ]
+}
\ No newline at end of file
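The notebook added above drives everything through interactive input() prompts. As a compact reference, the same flow can also be scripted. The sketch below is illustrative only (it is not part of any patch in this series); it assumes `model`, `vocoder_model`, `C`, `ap`, `USE_CUDA`, `use_griffin_lim`, `SPEAKER_FILEID` and `speakers.json` have been prepared exactly as in the notebook cells, and uses 'p238' merely as an example VCTK speaker id.

    import json
    import numpy as np

    # average a few of the speaker's embeddings from speakers.json, as the notebook does
    speaker_mapping = json.load(open('speakers.json'))
    keys = [k for k in speaker_mapping if 'p238' in k][:2]
    speaker_embedding = np.mean(
        np.array([speaker_mapping[k]['embedding'] for k in keys]), axis=0).tolist()

    text = "Ask her to bring these things with her from the store."

    # GST style given as manually chosen token weights ...
    wav = tts(model, vocoder_model, text, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID,
              speaker_embedding=speaker_embedding, gst_style={"0": 0.15, "1": 0, "3": 0, "4": 0})

    # ... or as a reference wav file
    wav = tts(model, vocoder_model, text, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID,
              speaker_embedding=speaker_embedding, gst_style='gst-style-example.wav')

    ap.save_wav(wav, 'example.wav')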
From ad570f3cab816bd5245a15788b890a72dd11b8a5 Mon Sep 17 00:00:00 2001
From: Edresson
Date: Thu, 30 Jul 2020 23:24:37 -0300
Subject: [PATCH 39/56] add Jupyter Notebook for Extract Speaker Embedding per sample using GE2E
---
 ...- ExtractSpeakerEmbeddings-by-sample.ipynb | 163 ++++++++++++++++++
 1 file changed, 163 insertions(+)
 create mode 100644 notebooks/GE2E-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb

diff --git a/notebooks/GE2E-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb b/notebooks/GE2E-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb
new file mode 100644
index 00000000..612ec146
--- /dev/null
+++ b/notebooks/GE2E-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb
@@ -0,0 +1,163 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This is a notebook used to generate the speaker embeddings with the GE2E speaker encoder model for multi-speaker training.\n",
+    "\n",
+    "Before running this script please DON'T FORGET: \n",
+    "- to set file paths.\n",
+    "- to download related model files from TTS.\n",
+    "- download or clone related repos, linked below.\n",
+    "- setup the repositories. ```python setup.py install```\n",
+    "- to checkout right commit versions (given next to the model) of TTS.\n",
+    "- to set the right paths in the cell below.\n",
+    "\n",
+    "Repository:\n",
+    "- TTS: https://github.com/mozilla/TTS"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%load_ext autoreload\n",
+    "%autoreload 2\n",
+    "import os\n",
+    "import importlib\n",
+    "import random\n",
+    "import librosa\n",
+    "import torch\n",
+    "\n",
+    "import numpy as np\n",
+    "from tqdm import tqdm\n",
+    "from TTS.utils.speakers import save_speaker_mapping, load_speaker_mapping\n",
+    "\n",
+    "# you may need to change this depending on your system\n",
+    "os.environ['CUDA_VISIBLE_DEVICES']='0'\n",
+    "\n",
+    "\n",
+    "from TTS.speaker_encoder.model import SpeakerEncoder\n",
+    "from TTS.utils.audio import AudioProcessor\n",
+    "from TTS.utils.generic_utils import load_config"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You should also adjust all the path constants to point at the relevant locations for you locally"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "MODEL_RUN_PATH = \"../../Mozilla-TTS/checkpoints/libritts_360-half-September-28-2019_10+46AM-8565c50-20200323T115637Z-001/\"\n",
+    "MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth.tar\"\n",
+    "CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n",
+    "\n",
+    "\n",
+    "DATASETS_NAME = ['brspeech'] # list the datasets\n",
+    "DATASETS_PATH = ['../../../datasets/BRSpeech-2.0-beta8']\n",
+    "DATASETS_METAFILE = ['TTS_metadata_brspeech2+cv_all_valited_lines.csv']\n",
+    "\n",
+    "USE_CUDA = True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Preprocess dataset\n",
+    "meta_data = []\n",
+    "for i in range(len(DATASETS_NAME)):\n",
+    "    preprocessor = importlib.import_module('TTS.datasets.preprocess')\n",
+    "    preprocessor = getattr(preprocessor, DATASETS_NAME[i].lower())\n",
+    "    meta_data += preprocessor(DATASETS_PATH[i],DATASETS_METAFILE[i])\n",
+    "   \n",
+    "meta_data= list(meta_data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "c = load_config(CONFIG_PATH)\n",
+    "ap = AudioProcessor(**c['audio'])\n",
+    "\n",
+    "model = SpeakerEncoder(**c.model)\n",
+    "model.load_state_dict(torch.load(MODEL_PATH)['model'])\n",
+    "model.eval()\n",
+    "if USE_CUDA:\n",
+    "    model.cuda()\n",
+    "\n",
+    "embeddings_dict = {}\n",
+    "len_meta_data= len(meta_data)\n",
+    "\n",
+    "for i in tqdm(range(len_meta_data)):\n",
+    "    _, wav_file, speaker_id = meta_data[i]\n",
+    "    wav_file_name = os.path.basename(wav_file)\n",
+    "    mel_spec = ap.melspectrogram(ap.load_wav(wav_file)).T\n",
+    "    mel_spec = torch.FloatTensor(mel_spec[None, :, :])\n",
+    "    if 
USE_CUDA:\n", + " mel_spec = mel_spec.cuda()\n", + " embedd = model.compute_embedding(mel_spec).cpu().detach().numpy().reshape(-1)\n", + " embeddings_dict[wav_file_name] = [embedd,speaker_id]\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# create and export speakers.json\n", + "speaker_mapping = {sample: {'name': embeddings_dict[sample][1], 'embedding':embeddings_dict[sample][0].reshape(-1).tolist()} for i, sample in enumerate(embeddings_dict.keys())}\n", + "save_speaker_mapping(MODEL_RUN_PATH, speaker_mapping)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#test load integrity\n", + "speaker_mapping_load = load_speaker_mapping(MODEL_RUN_PATH)\n", + "assert speaker_mapping == speaker_mapping_load\n", + "print(\"The file speakers.json has been exported to \",MODEL_RUN_PATH, ' with ', len(embeddings_dict.keys()), ' speakers')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 8b9c951da799fe202105221f70b2c2d66697fa58 Mon Sep 17 00:00:00 2001 From: Edresson Date: Thu, 30 Jul 2020 23:33:57 -0300 Subject: [PATCH 40/56] Ops! Map Notebooks imports for TTS current version --- ...peaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/notebooks/GE2E-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb b/notebooks/GE2E-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb index 612ec146..324de2d0 100644 --- a/notebooks/GE2E-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb +++ b/notebooks/GE2E-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb @@ -34,15 +34,15 @@ "\n", "import numpy as np\n", "from tqdm import tqdm\n", - "from TTS.utils.speakers import save_speaker_mapping, load_speaker_mapping\n", + "from TTS.tts.utils.speakers import save_speaker_mapping, load_speaker_mapping\n", "\n", "# you may need to change this depending on your system\n", "os.environ['CUDA_VISIBLE_DEVICES']='0'\n", "\n", "\n", - "from TTS.speaker_encoder.model import SpeakerEncoder\n", + "from TTS.tts.utils.speakers import save_speaker_mapping, load_speaker_mapping\n", "from TTS.utils.audio import AudioProcessor\n", - "from TTS.utils.generic_utils import load_config" + "from TTS.utils.io import load_config" ] }, { From f0bcc390d24730e31656ded6f6df88ae765bc8ac Mon Sep 17 00:00:00 2001 From: Edresson Date: Fri, 31 Jul 2020 00:05:08 -0300 Subject: [PATCH 41/56] Implement Angular Prototypical loss --- mozilla_voice_tts/speaker_encoder/loss.py | 42 +++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/mozilla_voice_tts/speaker_encoder/loss.py b/mozilla_voice_tts/speaker_encoder/loss.py index ab290547..6f83be63 100644 --- a/mozilla_voice_tts/speaker_encoder/loss.py +++ b/mozilla_voice_tts/speaker_encoder/loss.py @@ -23,6 +23,8 @@ class GE2ELoss(nn.Module): self.b = nn.Parameter(torch.tensor(init_b)) self.loss_method = loss_method + print('Initialised Generalized End-to-End loss') + assert self.loss_method in ["softmax", "contrast"] if self.loss_method == "softmax": @@ -119,3 +121,43 @@ class 
GE2ELoss(nn.Module):
         cos_sim_matrix = self.w * cos_sim_matrix + self.b
         L = self.embed_loss(dvecs, cos_sim_matrix)
         return L.mean()
+
+# adapted from https://github.com/clovaai/voxceleb_trainer/blob/master/loss/angleproto.py
+class AngleProtoLoss(nn.Module):
+    """
+    Implementation of the Angular Prototypical loss defined in https://arxiv.org/abs/2003.11982
+        Accepts an input of size (N, M, D)
+            where N is the number of speakers in the batch,
+            M is the number of utterances per speaker,
+            and D is the dimensionality of the embedding vector
+        Args:
+            - init_w (float): defines the initial value of w
+            - init_b (float): defines the initial value of b
+    """
+    def __init__(self, init_w=10.0, init_b=-5.0):
+        super(AngleProtoLoss, self).__init__()
+        # pylint: disable=E1102
+        self.w = nn.Parameter(torch.tensor(init_w))
+        # pylint: disable=E1102
+        self.b = nn.Parameter(torch.tensor(init_b))
+        self.criterion = torch.nn.CrossEntropyLoss()
+        self.use_cuda = torch.cuda.is_available()
+
+        print('Initialised Angular Prototypical loss')
+
+    def forward(self, x):
+        """
+        Calculates the AngleProto loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
+        """
+        out_anchor = torch.mean(x[:,1:,:],1)
+        out_positive = x[:,0,:]
+        num_speakers = out_anchor.size()[0]
+
+        cos_sim_matrix = F.cosine_similarity(out_positive.unsqueeze(-1).expand(-1,-1,num_speakers),out_anchor.unsqueeze(-1).expand(-1,-1,num_speakers).transpose(0,2))
+        torch.clamp(self.w, 1e-6)
+        cos_sim_matrix = cos_sim_matrix * self.w + self.b
+        label = torch.from_numpy(np.asarray(range(0,num_speakers)))
+        if self.use_cuda:
+            label = label.cuda()
+        L = self.criterion(cos_sim_matrix, label)
+        return L
\ No newline at end of file
From bc09ca81116e25aa1d7250c6a8418b6f10a59ae8 Mon Sep 17 00:00:00 2001
From: Edresson
Date: Fri, 31 Jul 2020 00:24:42 -0300
Subject: [PATCH 42/56] add support for AngleProto loss
---
 mozilla_voice_tts/bin/train_encoder.py        |  11 +-
 mozilla_voice_tts/speaker_encoder/config.json |   1 +
 .../speaker_encoder/generic_utils.py          |   4 +-
 mozilla_voice_tts/speaker_encoder/losses.py   | 160 ++++++++++++++++++
 4 files changed, 171 insertions(+), 5 deletions(-)
 create mode 100644 mozilla_voice_tts/speaker_encoder/losses.py

diff --git a/mozilla_voice_tts/bin/train_encoder.py b/mozilla_voice_tts/bin/train_encoder.py
index d612ac6e..c7c2e647 100644
--- a/mozilla_voice_tts/bin/train_encoder.py
+++ b/mozilla_voice_tts/bin/train_encoder.py
@@ -100,7 +100,7 @@ def train(model, criterion, optimizer, scheduler, ap, global_step):
         if global_step % c.steps_plot_stats == 0:
             # Plot Training Epoch Stats
             train_stats = {
-                "GE2Eloss": avg_loss,
+                "loss": avg_loss,
                 "lr": current_lr,
                 "grad_norm": grad_norm,
                 "step_time": step_time
@@ -140,7 +140,13 @@ def main(args):  # pylint: disable=redefined-outer-name
                            lstm_dim=384,
                            num_lstm_layers=3)
     optimizer = RAdam(model.parameters(), lr=c.lr)
-    criterion = GE2ELoss(loss_method='softmax')
+
+    if c.loss == "ge2e":
+        criterion = GE2ELoss(loss_method='softmax')
+    elif c.loss == "angleproto":
+        criterion = AngleProtoLoss()
+    else:
+        raise Exception("The %s loss is not supported" %c.loss)

     if args.restore_path:
         checkpoint = torch.load(args.restore_path)
@@ -186,7 +192,6 @@ def main(args):  # pylint: disable=redefined-outer-name

     _, global_step = train(model, criterion, optimizer, scheduler, ap,
                            global_step)
-
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument(
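Both criteria selected above operate on a batch shaped (N, M, D) — N speakers per batch, M utterances per speaker, D embedding dimensions — as their docstrings describe. The following sketch is illustrative only (it is not part of this patch); the batch shape is an assumption chosen for the example, loosely mirroring the config values (32 speakers per batch, 128-dim projections):

    import torch

    from mozilla_voice_tts.speaker_encoder.losses import GE2ELoss, AngleProtoLoss

    # toy batch: 32 speakers, 10 utterances each, 128-dim d-vectors
    dvecs = torch.rand(32, 10, 128)

    ge2e = GE2ELoss(loss_method='softmax')
    angleproto = AngleProtoLoss()

    print(ge2e(dvecs))        # scalar GE2E loss
    print(angleproto(dvecs))  # scalar Angular Prototypical loss
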
diff --git a/mozilla_voice_tts/speaker_encoder/config.json b/mozilla_voice_tts/speaker_encoder/config.json
index 0d0f8f68..5f72135f 100644
--- a/mozilla_voice_tts/speaker_encoder/config.json
+++ b/mozilla_voice_tts/speaker_encoder/config.json
@@ -21,6 +21,7 @@
         "do_trim_silence": false // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
     },
     "reinit_layers": [],
+    "loss": "angleproto", // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA)
     "grad_clip": 3.0, // upper limit for gradients for clipping.
     "epochs": 1000, // total number of epochs to train.
     "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
diff --git a/mozilla_voice_tts/speaker_encoder/generic_utils.py b/mozilla_voice_tts/speaker_encoder/generic_utils.py
index f649ceb9..bc72c91c 100644
--- a/mozilla_voice_tts/speaker_encoder/generic_utils.py
+++ b/mozilla_voice_tts/speaker_encoder/generic_utils.py
@@ -15,7 +15,7 @@ def save_checkpoint(model, optimizer, model_loss, out_path,
         'optimizer': optimizer.state_dict() if optimizer is not None else None,
         'step': current_step,
         'epoch': epoch,
-        'GE2Eloss': model_loss,
+        'loss': model_loss,
         'date': datetime.date.today().strftime("%B %d, %Y"),
     }
     torch.save(state, checkpoint_path)
@@ -29,7 +29,7 @@ def save_best_model(model, optimizer, model_loss, best_loss, out_path,
         'model': new_state_dict,
         'optimizer': optimizer.state_dict(),
         'step': current_step,
-        'GE2Eloss': model_loss,
+        'loss': model_loss,
         'date': datetime.date.today().strftime("%B %d, %Y"),
     }
     best_loss = model_loss
diff --git a/mozilla_voice_tts/speaker_encoder/losses.py b/mozilla_voice_tts/speaker_encoder/losses.py
new file mode 100644
index 00000000..7feced64
--- /dev/null
+++ b/mozilla_voice_tts/speaker_encoder/losses.py
@@ -0,0 +1,160 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+
+# adapted from https://github.com/cvqluu/GE2E-Loss
+class GE2ELoss(nn.Module):
+    def __init__(self, init_w=10.0, init_b=-5.0, loss_method="softmax"):
+        """
+        Implementation of the Generalized End-to-End loss defined in https://arxiv.org/abs/1710.10467 [1]
+        Accepts an input of size (N, M, D)
+            where N is the number of speakers in the batch,
+            M is the number of utterances per speaker,
+            and D is the dimensionality of the embedding vector (e.g. 
d-vector) + Args: + - init_w (float): defines the initial value of w in Equation (5) of [1] + - init_b (float): definies the initial value of b in Equation (5) of [1] + """ + super(GE2ELoss, self).__init__() + # pylint: disable=E1102 + self.w = nn.Parameter(torch.tensor(init_w)) + # pylint: disable=E1102 + self.b = nn.Parameter(torch.tensor(init_b)) + self.loss_method = loss_method + + print('Initialised Generalized End-to-End loss') + + assert self.loss_method in ["softmax", "contrast"] + + if self.loss_method == "softmax": + self.embed_loss = self.embed_loss_softmax + if self.loss_method == "contrast": + self.embed_loss = self.embed_loss_contrast + + # pylint: disable=R0201 + def calc_new_centroids(self, dvecs, centroids, spkr, utt): + """ + Calculates the new centroids excluding the reference utterance + """ + excl = torch.cat((dvecs[spkr, :utt], dvecs[spkr, utt + 1 :])) + excl = torch.mean(excl, 0) + new_centroids = [] + for i, centroid in enumerate(centroids): + if i == spkr: + new_centroids.append(excl) + else: + new_centroids.append(centroid) + return torch.stack(new_centroids) + + def calc_cosine_sim(self, dvecs, centroids): + """ + Make the cosine similarity matrix with dims (N,M,N) + """ + cos_sim_matrix = [] + for spkr_idx, speaker in enumerate(dvecs): + cs_row = [] + for utt_idx, utterance in enumerate(speaker): + new_centroids = self.calc_new_centroids( + dvecs, centroids, spkr_idx, utt_idx + ) + # vector based cosine similarity for speed + cs_row.append( + torch.clamp( + torch.mm( + utterance.unsqueeze(1).transpose(0, 1), + new_centroids.transpose(0, 1), + ) + / (torch.norm(utterance) * torch.norm(new_centroids, dim=1)), + 1e-6, + ) + ) + cs_row = torch.cat(cs_row, dim=0) + cos_sim_matrix.append(cs_row) + return torch.stack(cos_sim_matrix) + + # pylint: disable=R0201 + def embed_loss_softmax(self, dvecs, cos_sim_matrix): + """ + Calculates the loss on each embedding $L(e_{ji})$ by taking softmax + """ + N, M, _ = dvecs.shape + L = [] + for j in range(N): + L_row = [] + for i in range(M): + L_row.append(-F.log_softmax(cos_sim_matrix[j, i], 0)[j]) + L_row = torch.stack(L_row) + L.append(L_row) + return torch.stack(L) + + # pylint: disable=R0201 + def embed_loss_contrast(self, dvecs, cos_sim_matrix): + """ + Calculates the loss on each embedding $L(e_{ji})$ by contrast loss with closest centroid + """ + N, M, _ = dvecs.shape + L = [] + for j in range(N): + L_row = [] + for i in range(M): + centroids_sigmoids = torch.sigmoid(cos_sim_matrix[j, i]) + excl_centroids_sigmoids = torch.cat( + (centroids_sigmoids[:j], centroids_sigmoids[j + 1 :]) + ) + L_row.append( + 1.0 + - torch.sigmoid(cos_sim_matrix[j, i, j]) + + torch.max(excl_centroids_sigmoids) + ) + L_row = torch.stack(L_row) + L.append(L_row) + return torch.stack(L) + + def forward(self, dvecs): + """ + Calculates the GE2E loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats) + """ + centroids = torch.mean(dvecs, 1) + cos_sim_matrix = self.calc_cosine_sim(dvecs, centroids) + torch.clamp(self.w, 1e-6) + cos_sim_matrix = self.w * cos_sim_matrix + self.b + L = self.embed_loss(dvecs, cos_sim_matrix) + return L.mean() + +# adapted from https://github.com/clovaai/voxceleb_trainer/blob/master/loss/angleproto.py +class AngleProtoLoss(nn.Module): + """ + Implementation of the Angular Prototypical loss defined in https://arxiv.org/abs/2003.11982 + Accepts an input of size (N, M, D) + where N is the number of speakers in the batch, + M is the number of utterances per speaker, + and D is the dimensionality of 
the embedding vector
+        Args:
+            - init_w (float): defines the initial value of w
+            - init_b (float): defines the initial value of b
+    """
+    def __init__(self, init_w=10.0, init_b=-5.0):
+        super(AngleProtoLoss, self).__init__()
+        # pylint: disable=E1102
+        self.w = nn.Parameter(torch.tensor(init_w))
+        # pylint: disable=E1102
+        self.b = nn.Parameter(torch.tensor(init_b))
+        self.criterion = torch.nn.CrossEntropyLoss()
+
+        print('Initialised Angular Prototypical loss')
+
+    def forward(self, x):
+        """
+        Calculates the AngleProto loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
+        """
+        out_anchor = torch.mean(x[:,1:,:],1)
+        out_positive = x[:,0,:]
+        num_speakers = out_anchor.size()[0]
+
+        cos_sim_matrix = F.cosine_similarity(out_positive.unsqueeze(-1).expand(-1,-1,num_speakers),out_anchor.unsqueeze(-1).expand(-1,-1,num_speakers).transpose(0,2))
+        torch.clamp(self.w, 1e-6)
+        cos_sim_matrix = cos_sim_matrix * self.w + self.b
+        label = torch.from_numpy(np.asarray(range(0,num_speakers))).to(cos_sim_matrix.device)
+        L = self.criterion(cos_sim_matrix, label)
+        return L
\ No newline at end of file
From ac85ccae994b6291cf003cb87d3e5e365ece8aeb Mon Sep 17 00:00:00 2001
From: Edresson
Date: Fri, 31 Jul 2020 00:27:51 -0300
Subject: [PATCH 43/56] update speaker encoder config.json, for compatibility with the TTS model
---
 mozilla_voice_tts/speaker_encoder/config.json | 49 ++++++++++---------
 1 file changed, 27 insertions(+), 22 deletions(-)

diff --git a/mozilla_voice_tts/speaker_encoder/config.json b/mozilla_voice_tts/speaker_encoder/config.json
index f9166066..281578da 100644
--- a/mozilla_voice_tts/speaker_encoder/config.json
+++ b/mozilla_voice_tts/speaker_encoder/config.json
@@ -1,24 +1,30 @@
+
 {
-    "run_name": "libritts_360-half",
-    "run_description": "train speaker encoder for libritts 360",
-    "audio": {
+    "run_name": "libritts_100+360-angleproto",
+    "run_description": "train speaker encoder for libritts 100 and 360",
+    "audio":{
         // Audio processing parameters
-        "num_mels": 40, // size of the mel spec frame.
-        "num_freq": 1025, // number of stft frequency levels. Size of the linear spectogram frame.
-        "sample_rate": 16000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
-        "frame_length_ms": 50, // stft window length in ms.
-        "frame_shift_ms": 12.5, // stft window hop-lengh in ms.
-        "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
-        "min_level_db": -100, // normalization range
-        "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
+        "num_mels": 80, // size of the mel spec frame.
+        "num_freq": 1024, // number of stft frequency levels. Size of the linear spectogram frame.
+        "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
+        "win_length": 1024, // stft window length in ms.
+        "hop_length": 256, // stft window hop-length in ms.
+        "frame_length_ms": null, // stft window length in ms. If null, 'win_length' is used.
+        "frame_shift_ms": null, // stft window hop-length in ms. If null, 'hop_length' is used.
+        "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
+        "min_level_db": -100, // normalization range
+        "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
+        "power": 1.5, // value to sharpen wav signals after GL algorithm.
+        "griffin_lim_iters": 60,// #griffin-lim iterations. 
30-60 is a good range. Larger the value, slower the generation. // Normalization parameters - "signal_norm": true, // normalize the spec values in range [0, 1] + "signal_norm": true, // normalize the spec values in range [0, 1] "symmetric_norm": true, // move normalization to range [-1, 1] - "max_norm": 4, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] - "clip_norm": true, // clip normalized values into the range. - "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! - "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! - "do_trim_silence": false // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! + "do_trim_silence": false, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "trim_db": 60 // threshold for timming silence. Set this according to your dataset. }, "reinit_layers": [], "loss": "angleproto", // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA) @@ -34,10 +40,9 @@ "checkpoint": true, // If true, it saves checkpoints per "save_step" "save_step": 1000, // Number of training steps expected to save traning stats and checkpoints. "print_step": 1, // Number of steps to log traning on console. - "output_path": "/media/erogol/data_ssd/Models/libri_tts/speaker_encoder/", // DATASET-RELATED: output path for all training outputs. - "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "output_path": "../../checkpoints/libri_tts/speaker_encoder/", // DATASET-RELATED: output path for all training outputs. 
"model": { - "input_dim": 40, + "input_dim": 80, // input_dim == num_mels "proj_dim": 128, "lstm_dim": 384, "num_lstm_layers": 3 @@ -46,13 +51,13 @@ [ { "name": "libri_tts", - "path": "/home/erogol/Data/Libri-TTS/train-clean-360/", + "path": "../../datasets/LibriTTS/train-clean-360/", "meta_file_train": null, "meta_file_val": null }, { "name": "libri_tts", - "path": "/home/erogol/Data/Libri-TTS/train-clean-100/", + "path": "../../datasets/LibriTTS/train-clean-100/", "meta_file_train": null, "meta_file_val": null } From 5e11d81e1221afaa435e3130a0e4d1084ea828ec Mon Sep 17 00:00:00 2001 From: Edresson Date: Fri, 31 Jul 2020 00:39:08 -0300 Subject: [PATCH 44/56] fix broken imports in speaker encoder --- .../config.json | 60 +++++++++++++++++++ .../config.json | 60 +++++++++++++++++++ mozilla_voice_tts/bin/train_encoder.py | 2 +- mozilla_voice_tts/speaker_encoder/config.json | 10 +--- 4 files changed, 123 insertions(+), 9 deletions(-) create mode 100644 checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+37AM-debug/config.json create mode 100644 checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+38AM-debug/config.json diff --git a/checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+37AM-debug/config.json b/checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+37AM-debug/config.json new file mode 100644 index 00000000..dbea7ef2 --- /dev/null +++ b/checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+37AM-debug/config.json @@ -0,0 +1,60 @@ + +"github_branch":"* dev-gst-embeddings", +{ + "run_name": "libritts_100+360-angleproto", + "run_description": "train speaker encoder for libritts 100 and 360", + "audio":{ + // Audio processing parameters + "num_mels": 80, // size of the mel spec frame. + "num_freq": 1024, // number of stft frequency levels. Size of the linear spectogram frame. + "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. + "win_length": 1024, // stft window length in ms. + "hop_length": 256, // stft window hop-lengh in ms. + "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. + "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. + "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. + "min_level_db": -100, // normalization range + "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. + "power": 1.5, // value to sharpen wav signals after GL algorithm. + "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. + // Normalization parameters + "signal_norm": true, // normalize the spec values in range [0, 1] + "symmetric_norm": true, // move normalization to range [-1, 1] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! + "do_trim_silence": false, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "trim_db": 60 // threshold for timming silence. Set this according to your dataset. 
+ }, + "reinit_layers": [], + "loss": "angleproto", // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA) + "grad_clip": 3.0, // upper limit for gradients for clipping. + "epochs": 1000, // total number of epochs to train. + "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. + "lr_decay": false, // if true, Noam learning rate decaying is applied through training. + "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + "steps_plot_stats": 10, // number of steps to plot embeddings. + "num_speakers_in_batch": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + "wd": 0.000001, // Weight decay weight. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "save_step": 1000, // Number of training steps expected to save traning stats and checkpoints. + "print_step": 1, // Number of steps to log traning on console. + "output_path": "../../checkpoints/libri_tts/speaker_encoder/", // DATASET-RELATED: output path for all training outputs. + "model": { + "input_dim": 80, // input_dim == num_mels + "proj_dim": 128, + "lstm_dim": 384, + "num_lstm_layers": 3 + }, + "datasets": + [ + { + "name": "vctk", + "path": "../../../datasets/VCTK-Corpus-removed-silence/", + "meta_file_train": null, + "meta_file_val": null + } + ] +} \ No newline at end of file diff --git a/checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+38AM-debug/config.json b/checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+38AM-debug/config.json new file mode 100644 index 00000000..dbea7ef2 --- /dev/null +++ b/checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+38AM-debug/config.json @@ -0,0 +1,60 @@ + +"github_branch":"* dev-gst-embeddings", +{ + "run_name": "libritts_100+360-angleproto", + "run_description": "train speaker encoder for libritts 100 and 360", + "audio":{ + // Audio processing parameters + "num_mels": 80, // size of the mel spec frame. + "num_freq": 1024, // number of stft frequency levels. Size of the linear spectogram frame. + "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. + "win_length": 1024, // stft window length in ms. + "hop_length": 256, // stft window hop-lengh in ms. + "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. + "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. + "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. + "min_level_db": -100, // normalization range + "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. + "power": 1.5, // value to sharpen wav signals after GL algorithm. + "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. + // Normalization parameters + "signal_norm": true, // normalize the spec values in range [0, 1] + "symmetric_norm": true, // move normalization to range [-1, 1] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. 
+ "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! + "do_trim_silence": false, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "trim_db": 60 // threshold for timming silence. Set this according to your dataset. + }, + "reinit_layers": [], + "loss": "angleproto", // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA) + "grad_clip": 3.0, // upper limit for gradients for clipping. + "epochs": 1000, // total number of epochs to train. + "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. + "lr_decay": false, // if true, Noam learning rate decaying is applied through training. + "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + "steps_plot_stats": 10, // number of steps to plot embeddings. + "num_speakers_in_batch": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + "wd": 0.000001, // Weight decay weight. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "save_step": 1000, // Number of training steps expected to save traning stats and checkpoints. + "print_step": 1, // Number of steps to log traning on console. + "output_path": "../../checkpoints/libri_tts/speaker_encoder/", // DATASET-RELATED: output path for all training outputs. + "model": { + "input_dim": 80, // input_dim == num_mels + "proj_dim": 128, + "lstm_dim": 384, + "num_lstm_layers": 3 + }, + "datasets": + [ + { + "name": "vctk", + "path": "../../../datasets/VCTK-Corpus-removed-silence/", + "meta_file_train": null, + "meta_file_val": null + } + ] +} \ No newline at end of file diff --git a/mozilla_voice_tts/bin/train_encoder.py b/mozilla_voice_tts/bin/train_encoder.py index c7c2e647..1352a02e 100644 --- a/mozilla_voice_tts/bin/train_encoder.py +++ b/mozilla_voice_tts/bin/train_encoder.py @@ -13,7 +13,7 @@ from torch.utils.data import DataLoader from mozilla_voice_tts.generic_utils import count_parameters from mozilla_voice_tts.speaker_encoder.dataset import MyDataset from mozilla_voice_tts.speaker_encoder.generic_utils import save_best_model -from mozilla_voice_tts.speaker_encoder.loss import GE2ELoss +from mozilla_voice_tts.speaker_encoder.losses import GE2ELoss from mozilla_voice_tts.speaker_encoder.model import SpeakerEncoder from mozilla_voice_tts.speaker_encoder.visual import plot_embeddings from mozilla_voice_tts.tts.datasets.preprocess import load_meta_data diff --git a/mozilla_voice_tts/speaker_encoder/config.json b/mozilla_voice_tts/speaker_encoder/config.json index f9166066..281578da 100644 --- a/mozilla_voice_tts/speaker_encoder/config.json +++ b/mozilla_voice_tts/speaker_encoder/config.json @@ -50,14 +50,8 @@ "datasets": [ { - "name": "libri_tts", - "path": "../../datasets/LibriTTS/train-clean-360/", - "meta_file_train": null, - "meta_file_val": null - }, - { - "name": "libri_tts", - "path": "../../datasets/LibriTTS/train-clean-100/", + "name": "vctk", + "path": "../../../datasets/VCTK-Corpus-removed-silence/", "meta_file_train": null, "meta_file_val": null } From cb212b804950ca3112bba8d236b0b48b861ec4ab Mon Sep 17 00:00:00 2001 From: Edresson Date: Fri, 31 Jul 2020 
00:54:45 -0300 Subject: [PATCH 45/56] update deprecated functions call from speaker encoder --- .../config.json | 60 +++++++++++++++++++ mozilla_voice_tts/bin/train_encoder.py | 2 +- mozilla_voice_tts/speaker_encoder/config.json | 1 + mozilla_voice_tts/speaker_encoder/dataset.py | 2 +- 4 files changed, 63 insertions(+), 2 deletions(-) create mode 100644 checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+40AM-debug/config.json diff --git a/checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+40AM-debug/config.json b/checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+40AM-debug/config.json new file mode 100644 index 00000000..dbea7ef2 --- /dev/null +++ b/checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+40AM-debug/config.json @@ -0,0 +1,60 @@ + +"github_branch":"* dev-gst-embeddings", +{ + "run_name": "libritts_100+360-angleproto", + "run_description": "train speaker encoder for libritts 100 and 360", + "audio":{ + // Audio processing parameters + "num_mels": 80, // size of the mel spec frame. + "num_freq": 1024, // number of stft frequency levels. Size of the linear spectogram frame. + "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. + "win_length": 1024, // stft window length in ms. + "hop_length": 256, // stft window hop-lengh in ms. + "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. + "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. + "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. + "min_level_db": -100, // normalization range + "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. + "power": 1.5, // value to sharpen wav signals after GL algorithm. + "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. + // Normalization parameters + "signal_norm": true, // normalize the spec values in range [0, 1] + "symmetric_norm": true, // move normalization to range [-1, 1] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! + "do_trim_silence": false, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "trim_db": 60 // threshold for timming silence. Set this according to your dataset. + }, + "reinit_layers": [], + "loss": "angleproto", // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA) + "grad_clip": 3.0, // upper limit for gradients for clipping. + "epochs": 1000, // total number of epochs to train. + "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. + "lr_decay": false, // if true, Noam learning rate decaying is applied through training. + "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + "steps_plot_stats": 10, // number of steps to plot embeddings. 
+ "num_speakers_in_batch": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + "wd": 0.000001, // Weight decay weight. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "save_step": 1000, // Number of training steps expected to save traning stats and checkpoints. + "print_step": 1, // Number of steps to log traning on console. + "output_path": "../../checkpoints/libri_tts/speaker_encoder/", // DATASET-RELATED: output path for all training outputs. + "model": { + "input_dim": 80, // input_dim == num_mels + "proj_dim": 128, + "lstm_dim": 384, + "num_lstm_layers": 3 + }, + "datasets": + [ + { + "name": "vctk", + "path": "../../../datasets/VCTK-Corpus-removed-silence/", + "meta_file_train": null, + "meta_file_val": null + } + ] +} \ No newline at end of file diff --git a/mozilla_voice_tts/bin/train_encoder.py b/mozilla_voice_tts/bin/train_encoder.py index 1352a02e..46b7f46f 100644 --- a/mozilla_voice_tts/bin/train_encoder.py +++ b/mozilla_voice_tts/bin/train_encoder.py @@ -247,7 +247,7 @@ if __name__ == '__main__': new_fields) LOG_DIR = OUT_PATH - tb_logger = TensorboardLogger(LOG_DIR) + tb_logger = TensorboardLogger(LOG_DIR, model_name='Speaker_Encoder') try: main(args) diff --git a/mozilla_voice_tts/speaker_encoder/config.json b/mozilla_voice_tts/speaker_encoder/config.json index 281578da..37d976ce 100644 --- a/mozilla_voice_tts/speaker_encoder/config.json +++ b/mozilla_voice_tts/speaker_encoder/config.json @@ -36,6 +36,7 @@ "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. "steps_plot_stats": 10, // number of steps to plot embeddings. "num_speakers_in_batch": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. "wd": 0.000001, // Weight decay weight. "checkpoint": true, // If true, it saves checkpoints per "save_step" "save_step": 1000, // Number of training steps expected to save traning stats and checkpoints. 
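Note on the speaker-encoder `model` block: PATCH 46 below switches `train_encoder.py` from hard-coded layer sizes to the values in this config, so `input_dim` must match `audio.num_mels`. A minimal sketch of how those settings are consumed, assuming only the imports already used by the training script and tests in this series; the config path, the dummy batch, and the printed shape are illustrative, not part of the patches:

```python
import torch

from mozilla_voice_tts.speaker_encoder.model import SpeakerEncoder
from mozilla_voice_tts.utils.io import load_config

# Load the speaker-encoder config edited above (path relative to the repo root).
c = load_config("mozilla_voice_tts/speaker_encoder/config.json")

model = SpeakerEncoder(input_dim=c.model["input_dim"],        # == audio.num_mels
                       proj_dim=c.model["proj_dim"],
                       lstm_dim=c.model["lstm_dim"],
                       num_lstm_layers=c.model["num_lstm_layers"])

# Dummy batch: 4 utterances of 250 mel frames each, just to sanity-check shapes.
mel_frames = torch.rand(4, 250, c.model["input_dim"])
with torch.no_grad():
    embeddings = model(mel_frames)
print(embeddings.shape)  # expected: (4, proj_dim), one L2-normalized embedding per utterance
```

The `num_loader_workers` key added in the hunk above is the usual PyTorch `DataLoader(num_workers=...)` setting; how the dataset and loader are actually assembled is left to `train_encoder.py`.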
diff --git a/mozilla_voice_tts/speaker_encoder/dataset.py b/mozilla_voice_tts/speaker_encoder/dataset.py index 42c75dd9..d3243c13 100644 --- a/mozilla_voice_tts/speaker_encoder/dataset.py +++ b/mozilla_voice_tts/speaker_encoder/dataset.py @@ -31,7 +31,7 @@ class MyDataset(Dataset): print(f" | > Num speakers: {len(self.speakers)}") def load_wav(self, filename): - audio = self.ap.load_wav(filename) + audio = self.ap.load_wav(filename, sr=self.ap.sample_rate) return audio def load_data(self, idx): From bd4c6ee42ae984f16b97ecd9d931e7688a8b39a9 Mon Sep 17 00:00:00 2001 From: Edresson Date: Fri, 31 Jul 2020 00:55:42 -0300 Subject: [PATCH 46/56] add speaker encoder parameters in config.json, for more easy changes --- mozilla_voice_tts/bin/train_encoder.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mozilla_voice_tts/bin/train_encoder.py b/mozilla_voice_tts/bin/train_encoder.py index 46b7f46f..c89469b9 100644 --- a/mozilla_voice_tts/bin/train_encoder.py +++ b/mozilla_voice_tts/bin/train_encoder.py @@ -135,10 +135,10 @@ def main(args): # pylint: disable=redefined-outer-name global meta_data_eval ap = AudioProcessor(**c.audio) - model = SpeakerEncoder(input_dim=40, - proj_dim=128, - lstm_dim=384, - num_lstm_layers=3) + model = SpeakerEncoder(input_dim=c.model['input_dim'], + proj_dim=c.model['proj_dim'], + lstm_dim=c.model['lstm_dim'], + num_lstm_layers=c.model['num_lstm_layers']) optimizer = RAdam(model.parameters(), lr=c.lr) if c.loss == "ge2e": From f37159c13567223392dcbe0ed07fe6e98cca62fa Mon Sep 17 00:00:00 2001 From: Edresson Date: Fri, 31 Jul 2020 01:03:03 -0300 Subject: [PATCH 47/56] add Jupyter Notebook for Extract Speaker Embedding per sample using AngleProto --- .../config.json | 60 ------- .../config.json | 60 ------- .../config.json | 60 ------- mozilla_voice_tts/speaker_encoder/losses.py | 4 +- ...- ExtractSpeakerEmbeddings-by-sample.ipynb | 163 ++++++++++++++++++ ...- ExtractSpeakerEmbeddings-by-sample.ipynb | 6 +- 6 files changed, 168 insertions(+), 185 deletions(-) delete mode 100644 checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+37AM-debug/config.json delete mode 100644 checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+38AM-debug/config.json delete mode 100644 checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+40AM-debug/config.json create mode 100644 notebooks/AngleProto-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb diff --git a/checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+37AM-debug/config.json b/checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+37AM-debug/config.json deleted file mode 100644 index dbea7ef2..00000000 --- a/checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+37AM-debug/config.json +++ /dev/null @@ -1,60 +0,0 @@ - -"github_branch":"* dev-gst-embeddings", -{ - "run_name": "libritts_100+360-angleproto", - "run_description": "train speaker encoder for libritts 100 and 360", - "audio":{ - // Audio processing parameters - "num_mels": 80, // size of the mel spec frame. - "num_freq": 1024, // number of stft frequency levels. Size of the linear spectogram frame. - "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. - "win_length": 1024, // stft window length in ms. - "hop_length": 256, // stft window hop-lengh in ms. 
- "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. - "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. - "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. - "min_level_db": -100, // normalization range - "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. - "power": 1.5, // value to sharpen wav signals after GL algorithm. - "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. - // Normalization parameters - "signal_norm": true, // normalize the spec values in range [0, 1] - "symmetric_norm": true, // move normalization to range [-1, 1] - "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] - "clip_norm": true, // clip normalized values into the range. - "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! - "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! - "do_trim_silence": false, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) - "trim_db": 60 // threshold for timming silence. Set this according to your dataset. - }, - "reinit_layers": [], - "loss": "angleproto", // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA) - "grad_clip": 3.0, // upper limit for gradients for clipping. - "epochs": 1000, // total number of epochs to train. - "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. - "lr_decay": false, // if true, Noam learning rate decaying is applied through training. - "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" - "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. - "steps_plot_stats": 10, // number of steps to plot embeddings. - "num_speakers_in_batch": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. - "wd": 0.000001, // Weight decay weight. - "checkpoint": true, // If true, it saves checkpoints per "save_step" - "save_step": 1000, // Number of training steps expected to save traning stats and checkpoints. - "print_step": 1, // Number of steps to log traning on console. - "output_path": "../../checkpoints/libri_tts/speaker_encoder/", // DATASET-RELATED: output path for all training outputs. 
- "model": { - "input_dim": 80, // input_dim == num_mels - "proj_dim": 128, - "lstm_dim": 384, - "num_lstm_layers": 3 - }, - "datasets": - [ - { - "name": "vctk", - "path": "../../../datasets/VCTK-Corpus-removed-silence/", - "meta_file_train": null, - "meta_file_val": null - } - ] -} \ No newline at end of file diff --git a/checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+38AM-debug/config.json b/checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+38AM-debug/config.json deleted file mode 100644 index dbea7ef2..00000000 --- a/checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+38AM-debug/config.json +++ /dev/null @@ -1,60 +0,0 @@ - -"github_branch":"* dev-gst-embeddings", -{ - "run_name": "libritts_100+360-angleproto", - "run_description": "train speaker encoder for libritts 100 and 360", - "audio":{ - // Audio processing parameters - "num_mels": 80, // size of the mel spec frame. - "num_freq": 1024, // number of stft frequency levels. Size of the linear spectogram frame. - "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. - "win_length": 1024, // stft window length in ms. - "hop_length": 256, // stft window hop-lengh in ms. - "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. - "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. - "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. - "min_level_db": -100, // normalization range - "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. - "power": 1.5, // value to sharpen wav signals after GL algorithm. - "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. - // Normalization parameters - "signal_norm": true, // normalize the spec values in range [0, 1] - "symmetric_norm": true, // move normalization to range [-1, 1] - "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] - "clip_norm": true, // clip normalized values into the range. - "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! - "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! - "do_trim_silence": false, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) - "trim_db": 60 // threshold for timming silence. Set this according to your dataset. - }, - "reinit_layers": [], - "loss": "angleproto", // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA) - "grad_clip": 3.0, // upper limit for gradients for clipping. - "epochs": 1000, // total number of epochs to train. - "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. - "lr_decay": false, // if true, Noam learning rate decaying is applied through training. - "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" - "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. - "steps_plot_stats": 10, // number of steps to plot embeddings. - "num_speakers_in_batch": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. 
- "wd": 0.000001, // Weight decay weight. - "checkpoint": true, // If true, it saves checkpoints per "save_step" - "save_step": 1000, // Number of training steps expected to save traning stats and checkpoints. - "print_step": 1, // Number of steps to log traning on console. - "output_path": "../../checkpoints/libri_tts/speaker_encoder/", // DATASET-RELATED: output path for all training outputs. - "model": { - "input_dim": 80, // input_dim == num_mels - "proj_dim": 128, - "lstm_dim": 384, - "num_lstm_layers": 3 - }, - "datasets": - [ - { - "name": "vctk", - "path": "../../../datasets/VCTK-Corpus-removed-silence/", - "meta_file_train": null, - "meta_file_val": null - } - ] -} \ No newline at end of file diff --git a/checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+40AM-debug/config.json b/checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+40AM-debug/config.json deleted file mode 100644 index dbea7ef2..00000000 --- a/checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+40AM-debug/config.json +++ /dev/null @@ -1,60 +0,0 @@ - -"github_branch":"* dev-gst-embeddings", -{ - "run_name": "libritts_100+360-angleproto", - "run_description": "train speaker encoder for libritts 100 and 360", - "audio":{ - // Audio processing parameters - "num_mels": 80, // size of the mel spec frame. - "num_freq": 1024, // number of stft frequency levels. Size of the linear spectogram frame. - "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. - "win_length": 1024, // stft window length in ms. - "hop_length": 256, // stft window hop-lengh in ms. - "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. - "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. - "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. - "min_level_db": -100, // normalization range - "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. - "power": 1.5, // value to sharpen wav signals after GL algorithm. - "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. - // Normalization parameters - "signal_norm": true, // normalize the spec values in range [0, 1] - "symmetric_norm": true, // move normalization to range [-1, 1] - "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] - "clip_norm": true, // clip normalized values into the range. - "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! - "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! - "do_trim_silence": false, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) - "trim_db": 60 // threshold for timming silence. Set this according to your dataset. - }, - "reinit_layers": [], - "loss": "angleproto", // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA) - "grad_clip": 3.0, // upper limit for gradients for clipping. - "epochs": 1000, // total number of epochs to train. - "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. - "lr_decay": false, // if true, Noam learning rate decaying is applied through training. 
- "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" - "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. - "steps_plot_stats": 10, // number of steps to plot embeddings. - "num_speakers_in_batch": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. - "wd": 0.000001, // Weight decay weight. - "checkpoint": true, // If true, it saves checkpoints per "save_step" - "save_step": 1000, // Number of training steps expected to save traning stats and checkpoints. - "print_step": 1, // Number of steps to log traning on console. - "output_path": "../../checkpoints/libri_tts/speaker_encoder/", // DATASET-RELATED: output path for all training outputs. - "model": { - "input_dim": 80, // input_dim == num_mels - "proj_dim": 128, - "lstm_dim": 384, - "num_lstm_layers": 3 - }, - "datasets": - [ - { - "name": "vctk", - "path": "../../../datasets/VCTK-Corpus-removed-silence/", - "meta_file_train": null, - "meta_file_val": null - } - ] -} \ No newline at end of file diff --git a/mozilla_voice_tts/speaker_encoder/losses.py b/mozilla_voice_tts/speaker_encoder/losses.py index 7feced64..9065ccfd 100644 --- a/mozilla_voice_tts/speaker_encoder/losses.py +++ b/mozilla_voice_tts/speaker_encoder/losses.py @@ -23,7 +23,7 @@ class GE2ELoss(nn.Module): self.b = nn.Parameter(torch.tensor(init_b)) self.loss_method = loss_method - print('Initialised Generalized End-to-End loss') + print(' > Initialised Generalized End-to-End loss') assert self.loss_method in ["softmax", "contrast"] @@ -142,7 +142,7 @@ class AngleProtoLoss(nn.Module): self.b = nn.Parameter(torch.tensor(init_b)) self.criterion = torch.nn.CrossEntropyLoss() - print('Initialised Angular Prototypical loss') + print(' > Initialised Angular Prototypical loss') def forward(self, x): """ diff --git a/notebooks/AngleProto-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb b/notebooks/AngleProto-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb new file mode 100644 index 00000000..d660a7f5 --- /dev/null +++ b/notebooks/AngleProto-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb @@ -0,0 +1,163 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is a noteboook used to generate the speaker embeddings with the AngleProto speaker encoder model for multi-speaker training.\n", + "\n", + "Before running this script please DON'T FORGET: \n", + "- to set file paths.\n", + "- to download related model files from TTS.\n", + "- download or clone related repos, linked below.\n", + "- setup the repositories. 
```python setup.py install```\n", + "- to checkout right commit versions (given next to the model) of TTS.\n", + "- to set the right paths in the cell below.\n", + "\n", + "Repository:\n", + "- TTS: https://github.com/mozilla/TTS" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "import os\n", + "import importlib\n", + "import random\n", + "import librosa\n", + "import torch\n", + "\n", + "import numpy as np\n", + "from tqdm import tqdm\n", + "from TTS.tts.utils.speakers import save_speaker_mapping, load_speaker_mapping\n", + "\n", + "# you may need to change this depending on your system\n", + "os.environ['CUDA_VISIBLE_DEVICES']='0'\n", + "\n", + "\n", + "from TTS.tts.utils.speakers import save_speaker_mapping, load_speaker_mapping\n", + "from TTS.utils.audio import AudioProcessor\n", + "from TTS.utils.io import load_config" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You should also adjust all the path constants to point at the relevant locations for you locally" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "MODEL_RUN_PATH = \"../../Mozilla-TTS/checkpoints/libritts_100+360-angleproto-June-06-2020_04+12PM-9c04d1f/\"\n", + "MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth.tar\"\n", + "CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n", + "\n", + "\n", + "DATASETS_NAME = ['vctk'] # list the datasets\n", + "DATASETS_PATH = ['../../../datasets/VCTK/']\n", + "DATASETS_METAFILE = ['']\n", + "\n", + "USE_CUDA = True" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Preprocess dataset\n", + "meta_data = []\n", + "for i in range(len(DATASETS_NAME)):\n", + " preprocessor = importlib.import_module('TTS.datasets.preprocess')\n", + " preprocessor = getattr(preprocessor, DATASETS_NAME[i].lower())\n", + " meta_data += preprocessor(DATASETS_PATH[i],DATASETS_METAFILE[i])\n", + " \n", + "meta_data= list(meta_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "c = load_config(CONFIG_PATH)\n", + "ap = AudioProcessor(**c['audio'])\n", + "\n", + "model = SpeakerEncoder(**c.model)\n", + "model.load_state_dict(torch.load(MODEL_PATH)['model'])\n", + "model.eval()\n", + "if USE_CUDA:\n", + " model.cuda()\n", + "\n", + "embeddings_dict = {}\n", + "len_meta_data= len(meta_data)\n", + "\n", + "for i in tqdm(range(len_meta_data)):\n", + " _, wav_file, speaker_id = meta_data[i]\n", + " wav_file_name = os.path.basename(wav_file)\n", + " mel_spec = ap.melspectrogram(ap.load_wav(wav_file)).T\n", + " mel_spec = torch.FloatTensor(mel_spec[None, :, :])\n", + " if USE_CUDA:\n", + " mel_spec = mel_spec.cuda()\n", + " embedd = model.compute_embedding(mel_spec).cpu().detach().numpy().reshape(-1)\n", + " embeddings_dict[wav_file_name] = [embedd,speaker_id]\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# create and export speakers.json\n", + "speaker_mapping = {sample: {'name': embeddings_dict[sample][1], 'embedding':embeddings_dict[sample][0].reshape(-1).tolist()} for i, sample in enumerate(embeddings_dict.keys())}\n", + "save_speaker_mapping(MODEL_RUN_PATH, speaker_mapping)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#test load integrity\n", + 
"speaker_mapping_load = load_speaker_mapping(MODEL_RUN_PATH)\n", + "assert speaker_mapping == speaker_mapping_load\n", + "print(\"The file speakers.json has been exported to \",MODEL_RUN_PATH, ' with ', len(embeddings_dict.keys()), ' speakers')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/GE2E-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb b/notebooks/GE2E-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb index 324de2d0..2fba4d49 100644 --- a/notebooks/GE2E-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb +++ b/notebooks/GE2E-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb @@ -63,9 +63,9 @@ "CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n", "\n", "\n", - "DATASETS_NAME = ['brspeech'] # list the datasets\n", - "DATASETS_PATH = ['../../../datasets/BRSpeech-2.0-beta8']\n", - "DATASETS_METAFILE = ['TTS_metadata_brspeech2+cv_all_valited_lines.csv']\n", + "DATASETS_NAME = ['vctk'] # list the datasets\n", + "DATASETS_PATH = ['../../../datasets/VCTK/']\n", + "DATASETS_METAFILE = ['']\n", "\n", "USE_CUDA = True" ] From f3b8ef4272b5da8270477173355f794583f002a4 Mon Sep 17 00:00:00 2001 From: Edresson Date: Fri, 31 Jul 2020 01:12:17 -0300 Subject: [PATCH 48/56] fix Lint check errors --- mozilla_voice_tts/speaker_encoder/losses.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mozilla_voice_tts/speaker_encoder/losses.py b/mozilla_voice_tts/speaker_encoder/losses.py index 9065ccfd..f4687949 100644 --- a/mozilla_voice_tts/speaker_encoder/losses.py +++ b/mozilla_voice_tts/speaker_encoder/losses.py @@ -124,14 +124,14 @@ class GE2ELoss(nn.Module): # adapted from https://github.com/clovaai/voxceleb_trainer/blob/master/loss/angleproto.py class AngleProtoLoss(nn.Module): - """ + """ Implementation of the Angular Prototypical loss defined in https://arxiv.org/abs/2003.11982 Accepts an input of size (N, M, D) where N is the number of speakers in the batch, M is the number of utterances per speaker, and D is the dimensionality of the embedding vector Args: - - init_w (float): defines the initial value of w + - init_w (float): defines the initial value of w - init_b (float): definies the initial value of b """ def __init__(self, init_w=10.0, init_b=-5.0): @@ -148,13 +148,13 @@ class AngleProtoLoss(nn.Module): """ Calculates the AngleProto loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats) """ - out_anchor = torch.mean(x[:,1:,:],1) - out_positive = x[:,0,:] + out_anchor = torch.mean(x[:, 1:, :], 1) + out_positive = x[:, 0, :] num_speakers = out_anchor.size()[0] - cos_sim_matrix = F.cosine_similarity(out_positive.unsqueeze(-1).expand(-1,-1,num_speakers),out_anchor.unsqueeze(-1).expand(-1,-1,num_speakers).transpose(0,2)) + cos_sim_matrix = F.cosine_similarity(out_positive.unsqueeze(-1).expand(-1, -1, num_speakers),out_anchor.unsqueeze(-1).expand(-1, -1,num_speakers).transpose(0, 2)) torch.clamp(self.w, 1e-6) cos_sim_matrix = cos_sim_matrix * self.w + self.b - label = torch.from_numpy(np.asarray(range(0,num_speakers))).to(cos_sim_matrix.device) + label = torch.from_numpy(np.asarray(range(0, 
num_speakers))).to(cos_sim_matrix.device) L = self.criterion(cos_sim_matrix, label) return L \ No newline at end of file From 3c6c749de2a0fa7c64ffa4d49872bdfa5a5f0ff4 Mon Sep 17 00:00:00 2001 From: Edresson Date: Fri, 31 Jul 2020 01:21:31 -0300 Subject: [PATCH 49/56] add test for AngleProtoLoss --- tests/test_encoder.py | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/tests/test_encoder.py b/tests/test_encoder.py index 711ad195..46266f29 100644 --- a/tests/test_encoder.py +++ b/tests/test_encoder.py @@ -4,7 +4,7 @@ import unittest import torch as T from tests import get_tests_input_path -from mozilla_voice_tts.speaker_encoder.loss import GE2ELoss +from mozilla_voice_tts.speaker_encoder.losses import GE2ELoss, AngleProtoLoss from mozilla_voice_tts.speaker_encoder.model import SpeakerEncoder from mozilla_voice_tts.utils.io import load_config @@ -59,6 +59,7 @@ class GE2ELossTests(unittest.TestCase): dummy_input = T.ones(4, 5, 64) # num_speaker x num_utterance x dim loss = GE2ELoss(loss_method="softmax") output = loss.forward(dummy_input) + assert output.item() >= 0.0 # check speaker loss with orthogonal d-vectors dummy_input = T.empty(3, 64) dummy_input = T.nn.init.orthogonal(dummy_input) @@ -73,6 +74,34 @@ class GE2ELossTests(unittest.TestCase): output = loss.forward(dummy_input) assert output.item() < 0.005 +class AngleProtoLossTests(unittest.TestCase): + # pylint: disable=R0201 + def test_in_out(self): + # check random input + dummy_input = T.rand(4, 5, 64) # num_speaker x num_utterance x dim + loss = AngleProtoLoss() + output = loss.forward(dummy_input) + assert output.item() >= 0.0 + + # check all zeros + dummy_input = T.ones(4, 5, 64) # num_speaker x num_utterance x dim + loss = AngleProtoLoss() + output = loss.forward(dummy_input) + assert output.item() >= 0.0 + + # check speaker loss with orthogonal d-vectors + dummy_input = T.empty(3, 64) + dummy_input = T.nn.init.orthogonal(dummy_input) + dummy_input = T.cat( + [ + dummy_input[0].repeat(5, 1, 1).transpose(0, 1), + dummy_input[1].repeat(5, 1, 1).transpose(0, 1), + dummy_input[2].repeat(5, 1, 1).transpose(0, 1), + ] + ) # num_speaker x num_utterance x dim + loss = AngleProtoLoss() + output = loss.forward(dummy_input) + assert output.item() < 0.005 # class LoaderTest(unittest.TestCase): # def test_output(self): From df02e876aeaafd15d479d02ade6bac0bb8aa7a5f Mon Sep 17 00:00:00 2001 From: Edresson Date: Fri, 31 Jul 2020 01:58:22 -0300 Subject: [PATCH 50/56] fix Lint check errors --- mozilla_voice_tts/speaker_encoder/losses.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mozilla_voice_tts/speaker_encoder/losses.py b/mozilla_voice_tts/speaker_encoder/losses.py index f4687949..750648e5 100644 --- a/mozilla_voice_tts/speaker_encoder/losses.py +++ b/mozilla_voice_tts/speaker_encoder/losses.py @@ -152,7 +152,7 @@ class AngleProtoLoss(nn.Module): out_positive = x[:, 0, :] num_speakers = out_anchor.size()[0] - cos_sim_matrix = F.cosine_similarity(out_positive.unsqueeze(-1).expand(-1, -1, num_speakers),out_anchor.unsqueeze(-1).expand(-1, -1,num_speakers).transpose(0, 2)) + cos_sim_matrix = F.cosine_similarity(out_positive.unsqueeze(-1).expand(-1, -1, num_speakers), out_anchor.unsqueeze(-1).expand(-1, -1, num_speakers).transpose(0, 2)) torch.clamp(self.w, 1e-6) cos_sim_matrix = cos_sim_matrix * self.w + self.b label = torch.from_numpy(np.asarray(range(0, num_speakers))).to(cos_sim_matrix.device) From 26beea0e1bf6d444514d6e99c02f30fbda9240b9 Mon Sep 17 00:00:00 2001 From: 
Edresson Date: Tue, 4 Aug 2020 07:58:12 -0300 Subject: [PATCH 51/56] do not resample audio in Dataloader --- mozilla_voice_tts/tts/datasets/TTSDataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mozilla_voice_tts/tts/datasets/TTSDataset.py b/mozilla_voice_tts/tts/datasets/TTSDataset.py index 2ef78e11..1ecca75f 100644 --- a/mozilla_voice_tts/tts/datasets/TTSDataset.py +++ b/mozilla_voice_tts/tts/datasets/TTSDataset.py @@ -72,7 +72,7 @@ class MyDataset(Dataset): self.sort_items() def load_wav(self, filename): - audio = self.ap.load_wav(filename, sr=self.sample_rate) + audio = self.ap.load_wav(filename) return audio @staticmethod From 1d782487f5e6870170bea77758a7dd16d174379b Mon Sep 17 00:00:00 2001 From: Edresson Date: Tue, 4 Aug 2020 14:43:31 -0300 Subject: [PATCH 52/56] use tacotron abstract for multispeaker common definitions --- mozilla_voice_tts/bin/train_tts.py | 1 + mozilla_voice_tts/tts/models/tacotron.py | 43 +++++++--------- mozilla_voice_tts/tts/models/tacotron2.py | 49 ++++++++----------- .../tts/models/tacotron_abstract.py | 15 ++++++ 4 files changed, 53 insertions(+), 55 deletions(-) diff --git a/mozilla_voice_tts/bin/train_tts.py b/mozilla_voice_tts/bin/train_tts.py index 1b9bc032..2b6cbfd0 100644 --- a/mozilla_voice_tts/bin/train_tts.py +++ b/mozilla_voice_tts/bin/train_tts.py @@ -536,6 +536,7 @@ def main(args): # pylint: disable=redefined-outer-name else: num_speakers = 0 speaker_embedding_dim = None + speaker_mapping = None model = setup_model(num_chars, num_speakers, c, speaker_embedding_dim) diff --git a/mozilla_voice_tts/tts/models/tacotron.py b/mozilla_voice_tts/tts/models/tacotron.py index 3837e63c..ac88133b 100644 --- a/mozilla_voice_tts/tts/models/tacotron.py +++ b/mozilla_voice_tts/tts/models/tacotron.py @@ -27,6 +27,8 @@ class Tacotron(TacotronAbstract): bidirectional_decoder=False, double_decoder_consistency=False, ddc_r=None, + encoder_in_features=256, + decoder_in_features=256, speaker_embedding_dim=None, gst=False, gst_embedding_dim=256, @@ -40,39 +42,28 @@ class Tacotron(TacotronAbstract): forward_attn, trans_agent, forward_attn_mask, location_attn, attn_K, separate_stopnet, bidirectional_decoder, double_decoder_consistency, - ddc_r, gst, gst_embedding_dim, gst_num_heads, gst_style_tokens) + ddc_r, encoder_in_features, decoder_in_features, + speaker_embedding_dim, gst, gst_embedding_dim, + gst_num_heads, gst_style_tokens) - # init layer dims - decoder_in_features = 256 - encoder_in_features = 256 - - if speaker_embedding_dim is None: - # if speaker_embedding_dim is None we need use the nn.Embedding, with default speaker_embedding_dim - self.embeddings_per_sample = False - speaker_embedding_dim = 256 - else: - # if speaker_embedding_dim is not None we need use speaker embedding per sample - self.embeddings_per_sample = True + # speaker embedding layers + if self.num_speakers > 1: + if not self.embeddings_per_sample: + speaker_embedding_dim = 256 + self.speaker_embedding = nn.Embedding(self.num_speakers, speaker_embedding_dim) + self.speaker_embedding.weight.data.normal_(0, 0.3) # speaker and gst embeddings is concat in decoder input - if num_speakers > 1: - decoder_in_features = decoder_in_features + speaker_embedding_dim # add speaker embedding dim - if self.gst: - decoder_in_features = decoder_in_features + gst_embedding_dim # add gst embedding dim + if self.num_speakers > 1: + self.decoder_in_features += speaker_embedding_dim # add speaker embedding dim # embedding layer self.embedding = nn.Embedding(num_chars, 256, padding_idx=0) - 
- # speaker embedding layers - if num_speakers > 1: - if not self.embeddings_per_sample: - self.speaker_embedding = nn.Embedding(num_speakers, speaker_embedding_dim) - self.speaker_embedding.weight.data.normal_(0, 0.3) + self.embedding.weight.data.normal_(0, 0.3) # base model layers - self.embedding.weight.data.normal_(0, 0.3) - self.encoder = Encoder(encoder_in_features) - self.decoder = Decoder(decoder_in_features, decoder_output_dim, r, + self.encoder = Encoder(self.encoder_in_features) + self.decoder = Decoder(self.decoder_in_features, decoder_output_dim, r, memory_size, attn_type, attn_win, attn_norm, prenet_type, prenet_dropout, forward_attn, trans_agent, forward_attn_mask, location_attn, @@ -93,7 +84,7 @@ class Tacotron(TacotronAbstract): # setup DDC if self.double_decoder_consistency: self.coarse_decoder = Decoder( - decoder_in_features, decoder_output_dim, ddc_r, memory_size, + self.decoder_in_features, decoder_output_dim, ddc_r, memory_size, attn_type, attn_win, attn_norm, prenet_type, prenet_dropout, forward_attn, trans_agent, forward_attn_mask, location_attn, attn_K, separate_stopnet) diff --git a/mozilla_voice_tts/tts/models/tacotron2.py b/mozilla_voice_tts/tts/models/tacotron2.py index 9aeeb3d2..9fa640b0 100644 --- a/mozilla_voice_tts/tts/models/tacotron2.py +++ b/mozilla_voice_tts/tts/models/tacotron2.py @@ -33,6 +33,8 @@ class Tacotron2(TacotronAbstract): bidirectional_decoder=False, double_decoder_consistency=False, ddc_r=None, + encoder_in_features=512, + decoder_in_features=512, speaker_embedding_dim=None, gst=False, gst_embedding_dim=512, @@ -45,38 +47,27 @@ class Tacotron2(TacotronAbstract): forward_attn, trans_agent, forward_attn_mask, location_attn, attn_K, separate_stopnet, bidirectional_decoder, double_decoder_consistency, - ddc_r, gst, gst_embedding_dim, gst_num_heads, gst_style_tokens) + ddc_r, encoder_in_features, decoder_in_features, + speaker_embedding_dim, gst, gst_embedding_dim, + gst_num_heads, gst_style_tokens) - # init layer dims - decoder_in_features = 512 - encoder_in_features = 512 - - if speaker_embedding_dim is None: - # if speaker_embedding_dim is None we need use the nn.Embedding, with default speaker_embedding_dim - self.embeddings_per_sample = False - speaker_embedding_dim = 512 - else: - # if speaker_embedding_dim is not None we need use speaker embedding per sample - self.embeddings_per_sample = True + # speaker embedding layer + if self.num_speakers > 1: + if not self.embeddings_per_sample: + speaker_embedding_dim = 512 + self.speaker_embedding = nn.Embedding(self.num_speakers, speaker_embedding_dim) + self.speaker_embedding.weight.data.normal_(0, 0.3) # speaker and gst embeddings is concat in decoder input - if num_speakers > 1: - decoder_in_features = decoder_in_features + speaker_embedding_dim # add speaker embedding dim - if self.gst: - decoder_in_features = decoder_in_features + gst_embedding_dim # add gst embedding dim - + if self.num_speakers > 1: + self.decoder_in_features += speaker_embedding_dim # add speaker embedding dim + # embedding layer self.embedding = nn.Embedding(num_chars, 512, padding_idx=0) - # speaker embedding layer - if num_speakers > 1: - if not self.embeddings_per_sample: - self.speaker_embedding = nn.Embedding(num_speakers, speaker_embedding_dim) - self.speaker_embedding.weight.data.normal_(0, 0.3) - # base model layers - self.encoder = Encoder(encoder_in_features) - self.decoder = Decoder(decoder_in_features, self.decoder_output_dim, r, attn_type, attn_win, + self.encoder = Encoder(self.encoder_in_features) + 
self.decoder = Decoder(self.decoder_in_features, self.decoder_output_dim, r, attn_type, attn_win, attn_norm, prenet_type, prenet_dropout, forward_attn, trans_agent, forward_attn_mask, location_attn, attn_K, separate_stopnet) @@ -85,16 +76,16 @@ class Tacotron2(TacotronAbstract): # global style token layers if self.gst: self.gst_layer = GST(num_mel=80, - num_heads=gst_num_heads, - num_style_tokens=gst_style_tokens, - embedding_dim=gst_embedding_dim) + num_heads=self.gst_num_heads, + num_style_tokens=self.gst_style_tokens, + embedding_dim=self.gst_embedding_dim) # backward pass decoder if self.bidirectional_decoder: self._init_backward_decoder() # setup DDC if self.double_decoder_consistency: self.coarse_decoder = Decoder( - decoder_in_features, self.decoder_output_dim, ddc_r, attn_type, + self.decoder_in_features, self.decoder_output_dim, ddc_r, attn_type, attn_win, attn_norm, prenet_type, prenet_dropout, forward_attn, trans_agent, forward_attn_mask, location_attn, attn_K, separate_stopnet) diff --git a/mozilla_voice_tts/tts/models/tacotron_abstract.py b/mozilla_voice_tts/tts/models/tacotron_abstract.py index 6f3d32ad..0077f3e4 100644 --- a/mozilla_voice_tts/tts/models/tacotron_abstract.py +++ b/mozilla_voice_tts/tts/models/tacotron_abstract.py @@ -28,6 +28,9 @@ class TacotronAbstract(ABC, nn.Module): bidirectional_decoder=False, double_decoder_consistency=False, ddc_r=None, + encoder_in_features=512, + decoder_in_features=512, + speaker_embedding_dim=None, gst=False, gst_embedding_dim=512, gst_num_heads=4, @@ -57,6 +60,9 @@ class TacotronAbstract(ABC, nn.Module): self.location_attn = location_attn self.attn_K = attn_K self.separate_stopnet = separate_stopnet + self.encoder_in_features = encoder_in_features + self.decoder_in_features = decoder_in_features + self.speaker_embedding_dim = speaker_embedding_dim # layers self.embedding = None @@ -64,8 +70,17 @@ class TacotronAbstract(ABC, nn.Module): self.decoder = None self.postnet = None + # multispeaker + if self.speaker_embedding_dim is None: + # if speaker_embedding_dim is None we need use the nn.Embedding, with default speaker_embedding_dim + self.embeddings_per_sample = False + else: + # if speaker_embedding_dim is not None we need use speaker embedding per sample + self.embeddings_per_sample = True + # global style token if self.gst: + self.decoder_in_features += gst_embedding_dim # add gst embedding dim self.gst_layer = None # model states From ac032f00f37704f772f49987e23c335b58073024 Mon Sep 17 00:00:00 2001 From: Edresson Date: Tue, 4 Aug 2020 15:05:04 -0300 Subject: [PATCH 53/56] add tests for tacotron and tacotron2 Multi-speaker --- tests/test_tacotron2_model.py | 53 +++++++++++++++++++++++++++++ tests/test_tacotron_model.py | 63 +++++++++++++++++++++++++++++++++++ 2 files changed, 116 insertions(+) diff --git a/tests/test_tacotron2_model.py b/tests/test_tacotron2_model.py index 92ffb9aa..0ff79f6e 100644 --- a/tests/test_tacotron2_model.py +++ b/tests/test_tacotron2_model.py @@ -131,6 +131,59 @@ class TacotronGSTTrainTest(unittest.TestCase): count, param.shape, param, param_ref) count += 1 +class MultiSpeakeTacotronTrainTest(unittest.TestCase): + @staticmethod + def test_train_step(): + input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) + input_lengths = torch.randint(100, 128, (8, )).long().to(device) + input_lengths = torch.sort(input_lengths, descending=True)[0] + mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device) + mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device) + mel_lengths = 
torch.randint(20, 30, (8, )).long().to(device) + mel_lengths[0] = 30 + stop_targets = torch.zeros(8, 30, 1).float().to(device) + speaker_embeddings = torch.rand(8, 55).to(device) + + for idx in mel_lengths: + stop_targets[:, int(idx.item()):, 0] = 1.0 + + stop_targets = stop_targets.view(input_dummy.shape[0], + stop_targets.size(1) // c.r, -1) + stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() + + criterion = MSELossMasked(seq_len_norm=False).to(device) + criterion_st = nn.BCEWithLogitsLoss().to(device) + model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, speaker_embedding_dim=55).to(device) + model.train() + model_ref = copy.deepcopy(model) + count = 0 + for param, param_ref in zip(model.parameters(), + model_ref.parameters()): + assert (param - param_ref).sum() == 0, param + count += 1 + optimizer = optim.Adam(model.parameters(), lr=c.lr) + for i in range(5): + mel_out, mel_postnet_out, align, stop_tokens = model.forward( + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_embeddings=speaker_embeddings) + assert torch.sigmoid(stop_tokens).data.max() <= 1.0 + assert torch.sigmoid(stop_tokens).data.min() >= 0.0 + optimizer.zero_grad() + loss = criterion(mel_out, mel_spec, mel_lengths) + stop_loss = criterion_st(stop_tokens, stop_targets) + loss = loss + criterion(mel_postnet_out, mel_postnet_spec, mel_lengths) + stop_loss + loss.backward() + optimizer.step() + # check parameter changes + count = 0 + for param, param_ref in zip(model.parameters(), + model_ref.parameters()): + # ignore pre-higway layer since it works conditional + # if count not in [145, 59]: + assert (param != param_ref).any( + ), "param {} with shape {} not updated!! \n{}\n{}".format( + count, param.shape, param, param_ref) + count += 1 + class TacotronGSTTrainTest(unittest.TestCase): @staticmethod def test_train_step(): diff --git a/tests/test_tacotron_model.py b/tests/test_tacotron_model.py index 2b55cbac..0b80243f 100644 --- a/tests/test_tacotron_model.py +++ b/tests/test_tacotron_model.py @@ -89,6 +89,69 @@ class TacotronTrainTest(unittest.TestCase): count, param.shape, param, param_ref) count += 1 +class MultiSpeakeTacotronTrainTest(unittest.TestCase): + @staticmethod + def test_train_step(): + input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) + input_lengths = torch.randint(100, 129, (8, )).long().to(device) + input_lengths[-1] = 128 + mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device) + linear_spec = torch.rand(8, 30, c.audio['fft_size']).to(device) + mel_lengths = torch.randint(20, 30, (8, )).long().to(device) + stop_targets = torch.zeros(8, 30, 1).float().to(device) + speaker_embeddings = torch.rand(8, 55).to(device) + + for idx in mel_lengths: + stop_targets[:, int(idx.item()):, 0] = 1.0 + + stop_targets = stop_targets.view(input_dummy.shape[0], + stop_targets.size(1) // c.r, -1) + stop_targets = (stop_targets.sum(2) > + 0.0).unsqueeze(2).float().squeeze() + + criterion = L1LossMasked(seq_len_norm=False).to(device) + criterion_st = nn.BCEWithLogitsLoss().to(device) + model = Tacotron( + num_chars=32, + num_speakers=5, + postnet_output_dim=c.audio['fft_size'], + decoder_output_dim=c.audio['num_mels'], + r=c.r, + memory_size=c.memory_size, + speaker_embedding_dim=55, + ).to(device) #FIXME: missing num_speakers parameter to Tacotron ctor + model.train() + print(" > Num parameters for Tacotron model:%s" % + (count_parameters(model))) + model_ref = copy.deepcopy(model) + count = 0 + for param, param_ref in zip(model.parameters(), + model_ref.parameters()): + 
assert (param - param_ref).sum() == 0, param + count += 1 + optimizer = optim.Adam(model.parameters(), lr=c.lr) + for _ in range(5): + mel_out, linear_out, align, stop_tokens = model.forward( + input_dummy, input_lengths, mel_spec, mel_lengths, + speaker_embeddings=speaker_embeddings) + optimizer.zero_grad() + loss = criterion(mel_out, mel_spec, mel_lengths) + stop_loss = criterion_st(stop_tokens, stop_targets) + loss = loss + criterion(linear_out, linear_spec, + mel_lengths) + stop_loss + loss.backward() + optimizer.step() + # check parameter changes + count = 0 + for param, param_ref in zip(model.parameters(), + model_ref.parameters()): + # ignore pre-higway layer since it works conditional + # if count not in [145, 59]: + assert (param != param_ref).any( + ), "param {} with shape {} not updated!! \n{}\n{}".format( + count, param.shape, param, param_ref) + count += 1 + class TacotronGSTTrainTest(unittest.TestCase): @staticmethod def test_train_step(): From 07c961382fcc1882a80e4ef65e3c9a357928d49d Mon Sep 17 00:00:00 2001 From: Edresson Date: Wed, 5 Aug 2020 08:43:27 -0300 Subject: [PATCH 54/56] add support for CorentinJ Speaker encoder and add notebook for extract embeddings --- mozilla_voice_tts/speaker_encoder/config.json | 27 +- mozilla_voice_tts/speaker_encoder/model.py | 38 +- ...- ExtractSpeakerEmbeddings-by-sample.ipynb | 2 +- ...J-ExtractSpeakerEmbeddings-by-sample.ipynb | 25495 ++++++++++++++++ 4 files changed, 25541 insertions(+), 21 deletions(-) create mode 100644 notebooks/GE2E-CorentinJ-ExtractSpeakerEmbeddings-by-sample.ipynb diff --git a/mozilla_voice_tts/speaker_encoder/config.json b/mozilla_voice_tts/speaker_encoder/config.json index 37d976ce..11da0cf6 100644 --- a/mozilla_voice_tts/speaker_encoder/config.json +++ b/mozilla_voice_tts/speaker_encoder/config.json @@ -1,14 +1,14 @@ { - "run_name": "libritts_100+360-angleproto", - "run_description": "train speaker encoder for libritts 100 and 360", + "run_name": "Model compatible to CorentinJ/Real-Time-Voice-Cloning", + "run_description": "train speaker encoder with voxceleb1, voxceleb2 and libriSpeech ", "audio":{ // Audio processing parameters - "num_mels": 80, // size of the mel spec frame. - "num_freq": 1024, // number of stft frequency levels. Size of the linear spectogram frame. - "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. - "win_length": 1024, // stft window length in ms. - "hop_length": 256, // stft window hop-lengh in ms. + "num_mels": 40, // size of the mel spec frame. + "fft_size": 400, // number of stft frequency levels. Size of the linear spectogram frame. + "sample_rate": 16000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. + "win_length": 400, // stft window length in ms. + "hop_length": 160, // stft window hop-lengh in ms. "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. @@ -27,7 +27,7 @@ "trim_db": 60 // threshold for timming silence. Set this according to your dataset. 
}, "reinit_layers": [], - "loss": "angleproto", // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA) + "loss": "ge2e", // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA) "grad_clip": 3.0, // upper limit for gradients for clipping. "epochs": 1000, // total number of epochs to train. "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. @@ -41,12 +41,13 @@ "checkpoint": true, // If true, it saves checkpoints per "save_step" "save_step": 1000, // Number of training steps expected to save traning stats and checkpoints. "print_step": 1, // Number of steps to log traning on console. - "output_path": "../../checkpoints/libri_tts/speaker_encoder/", // DATASET-RELATED: output path for all training outputs. + "output_path": "../../checkpoints/voxceleb_librispeech/speaker_encoder/", // DATASET-RELATED: output path for all training outputs. "model": { - "input_dim": 80, // input_dim == num_mels - "proj_dim": 128, - "lstm_dim": 384, - "num_lstm_layers": 3 + "input_dim": 40, + "proj_dim": 256, + "lstm_dim": 256, + "num_lstm_layers": 3, + "use_lstm_with_projection": false }, "datasets": [ diff --git a/mozilla_voice_tts/speaker_encoder/model.py b/mozilla_voice_tts/speaker_encoder/model.py index ca2abe31..df0527bc 100644 --- a/mozilla_voice_tts/speaker_encoder/model.py +++ b/mozilla_voice_tts/speaker_encoder/model.py @@ -16,15 +16,33 @@ class LSTMWithProjection(nn.Module): o, (_, _) = self.lstm(x) return self.linear(o) +class LSTMWithoutProjection(nn.Module): + def __init__(self, input_dim, lstm_dim, proj_dim, num_lstm_layers): + super().__init__() + self.lstm = nn.LSTM(input_size=input_dim, + hidden_size=lstm_dim, + num_layers=num_lstm_layers, + batch_first=True) + self.linear = nn.Linear(lstm_dim, proj_dim, bias=True) + self.relu = nn.ReLU() + def forward(self, x): + _, (hidden, _) = self.lstm(x) + return self.relu(self.linear(hidden[-1])) class SpeakerEncoder(nn.Module): - def __init__(self, input_dim, proj_dim=256, lstm_dim=768, num_lstm_layers=3): + def __init__(self, input_dim, proj_dim=256, lstm_dim=768, num_lstm_layers=3, use_lstm_with_projection=True): super().__init__() + self.use_lstm_with_projection = use_lstm_with_projection layers = [] - layers.append(LSTMWithProjection(input_dim, lstm_dim, proj_dim)) - for _ in range(num_lstm_layers - 1): - layers.append(LSTMWithProjection(proj_dim, lstm_dim, proj_dim)) - self.layers = nn.Sequential(*layers) + # choise LSTM layer + if use_lstm_with_projection: + layers.append(LSTMWithProjection(input_dim, lstm_dim, proj_dim)) + for _ in range(num_lstm_layers - 1): + layers.append(LSTMWithProjection(proj_dim, lstm_dim, proj_dim)) + self.layers = nn.Sequential(*layers) + else: + self.layers = LSTMWithoutProjection(input_dim, lstm_dim, proj_dim, num_lstm_layers) + self._init_layers() def _init_layers(self): @@ -37,12 +55,18 @@ class SpeakerEncoder(nn.Module): def forward(self, x): # TODO: implement state passing for lstms d = self.layers(x) - d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1) + if self.use_lstm_with_projection: + d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1) + else: + d = torch.nn.functional.normalize(d, p=2, dim=1) return d def inference(self, x): d = self.layers.forward(x) - d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1) + if self.use_lstm_with_projection: + d = torch.nn.functional.normalize(d[:, -1], p=2, dim=1) + else: + d = torch.nn.functional.normalize(d, p=2, dim=1) return d 
def compute_embedding(self, x, num_frames=160, overlap=0.5): diff --git a/notebooks/AngleProto-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb b/notebooks/AngleProto-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb index d660a7f5..15206130 100644 --- a/notebooks/AngleProto-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb +++ b/notebooks/AngleProto-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb @@ -79,7 +79,7 @@ "#Preprocess dataset\n", "meta_data = []\n", "for i in range(len(DATASETS_NAME)):\n", - " preprocessor = importlib.import_module('TTS.datasets.preprocess')\n", + " preprocessor = importlib.import_module('TTS.tts.datasets.preprocess')\n", " preprocessor = getattr(preprocessor, DATASETS_NAME[i].lower())\n", " meta_data += preprocessor(DATASETS_PATH[i],DATASETS_METAFILE[i])\n", " \n", diff --git a/notebooks/GE2E-CorentinJ-ExtractSpeakerEmbeddings-by-sample.ipynb b/notebooks/GE2E-CorentinJ-ExtractSpeakerEmbeddings-by-sample.ipynb new file mode 100644 index 00000000..576a95fe --- /dev/null +++ b/notebooks/GE2E-CorentinJ-ExtractSpeakerEmbeddings-by-sample.ipynb @@ -0,0 +1,25495 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is a noteboook used to generate the speaker embeddings with the CorentinJ GE2E model trained with Angular Prototypical loss for multi-speaker training.\n", + "\n", + "Before running this script please DON'T FORGET:\n", + "- to set the right paths in the cell below.\n", + "\n", + "Repositories:\n", + "- TTS: https://github.com/mozilla/TTS\n", + "- CorentinJ GE2E: https://github.com/Edresson/GE2E-Speaker-Encoder" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import os\n", + "import importlib\n", + "import random\n", + "import librosa\n", + "import torch\n", + "\n", + "import numpy as np\n", + "from TTS.utils.io import load_config\n", + "from tqdm import tqdm\n", + "from TTS.tts.utils.speakers import save_speaker_mapping, load_speaker_mapping\n", + "\n", + "# you may need to change this depending on your system\n", + "os.environ['CUDA_VISIBLE_DEVICES']='0'" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Cloning into 'Real-Time-Voice-Cloning'...\n", + "remote: Enumerating objects: 5, done.\u001b[K\n", + "remote: Counting objects: 100% (5/5), done.\u001b[K\n", + "remote: Compressing objects: 100% (5/5), done.\u001b[K\n", + "remote: Total 2508 (delta 0), reused 3 (delta 0), pack-reused 2503\u001b[K\n", + "Receiving objects: 100% (2508/2508), 360.78 MiB | 17.84 MiB/s, done.\n", + "Resolving deltas: 100% (1387/1387), done.\n", + "Checking connectivity... 
done.\n" + ] + } + ], + "source": [ + "# Clone encoder \n", + "!git clone https://github.com/CorentinJ/Real-Time-Voice-Cloning.git\n", + "os.chdir('Real-Time-Voice-Cloning/')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "#Install voxceleb_trainer Requeriments\n", + "!python -m pip install umap-learn visdom webrtcvad librosa>=0.5.1 matplotlib>=2.0.2 numpy>=1.14.0 scipy>=1.0.0 tqdm sounddevice Unidecode inflect multiprocess numba" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2020-08-05 06:51:05-- https://github.com/Edresson/Real-Time-Voice-Cloning/releases/download/checkpoints/pretrained.zip\n", + "Resolving github.com (github.com)... 18.231.5.6\n", + "Connecting to github.com (github.com)|18.231.5.6|:443... connected.\n", + "HTTP request sent, awaiting response... 301 Moved Permanently\n", + "Location: https://github.com/Edresson/GE2E-Speaker-Encoder/releases/download/checkpoints/pretrained.zip [following]\n", + "--2020-08-05 06:51:05-- https://github.com/Edresson/GE2E-Speaker-Encoder/releases/download/checkpoints/pretrained.zip\n", + "Reusing existing connection to github.com:443.\n", + "HTTP request sent, awaiting response... 302 Found\n", + "Location: https://github-production-release-asset-2e65be.s3.amazonaws.com/263893598/f7f31d80-96df-11ea-8345-261fc35f9849?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20200805%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200805T101614Z&X-Amz-Expires=300&X-Amz-Signature=df7724c28668ebd5dfbcc6a9b51f6afb78193c30119f3a1c3eef678188aabd1e&X-Amz-SignedHeaders=host&actor_id=0&repo_id=263893598&response-content-disposition=attachment%3B%20filename%3Dpretrained.zip&response-content-type=application%2Foctet-stream [following]\n", + "--2020-08-05 06:51:05-- https://github-production-release-asset-2e65be.s3.amazonaws.com/263893598/f7f31d80-96df-11ea-8345-261fc35f9849?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20200805%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200805T101614Z&X-Amz-Expires=300&X-Amz-Signature=df7724c28668ebd5dfbcc6a9b51f6afb78193c30119f3a1c3eef678188aabd1e&X-Amz-SignedHeaders=host&actor_id=0&repo_id=263893598&response-content-disposition=attachment%3B%20filename%3Dpretrained.zip&response-content-type=application%2Foctet-stream\n", + "Resolving github-production-release-asset-2e65be.s3.amazonaws.com (github-production-release-asset-2e65be.s3.amazonaws.com)... 52.216.18.24\n", + "Connecting to github-production-release-asset-2e65be.s3.amazonaws.com (github-production-release-asset-2e65be.s3.amazonaws.com)|52.216.18.24|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 383640573 (366M) [application/octet-stream]\n", + "Saving to: ‘pretrained.zip’\n", + "\n", + "pretrained.zip 100%[===================>] 365,87M 6,62MB/s in 56s \n", + "\n", + "2020-08-05 06:52:03 (6,48 MB/s) - ‘pretrained.zip’ saved [383640573/383640573]\n", + "\n", + "Archive: pretrained.zip\n", + " creating: encoder/saved_models/\n", + " inflating: encoder/saved_models/pretrained.pt \n", + " creating: synthesizer/saved_models/\n", + " creating: synthesizer/saved_models/logs-pretrained/\n", + " creating: synthesizer/saved_models/logs-pretrained/taco_pretrained/\n", + " extracting: synthesizer/saved_models/logs-pretrained/taco_pretrained/checkpoint \n", + " inflating: synthesizer/saved_models/logs-pretrained/taco_pretrained/tacotron_model.ckpt-278000.data-00000-of-00001 \n", + " inflating: synthesizer/saved_models/logs-pretrained/taco_pretrained/tacotron_model.ckpt-278000.index \n", + " inflating: synthesizer/saved_models/logs-pretrained/taco_pretrained/tacotron_model.ckpt-278000.meta \n", + " creating: vocoder/saved_models/\n", + " creating: vocoder/saved_models/pretrained/\n", + " inflating: vocoder/saved_models/pretrained/pretrained.pt \n" + ] + } + ], + "source": [ + "#Download encoder Checkpoint\n", + "!wget https://github.com/Edresson/Real-Time-Voice-Cloning/releases/download/checkpoints/pretrained.zip\n", + "!unzip pretrained.zip" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "from encoder import inference as encoder\n", + "from encoder.params_model import model_embedding_size as speaker_embedding_size\n", + "from pathlib import Path" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Preparing the encoder, the synthesizer and the vocoder...\n", + "Loaded encoder \"pretrained.pt\" trained to step 1564501\n", + "Testing your configuration with small inputs.\n", + "\tTesting the encoder...\n", + "(256,)\n" + ] + } + ], + "source": [ + "print(\"Preparing the encoder, the synthesizer and the vocoder...\")\n", + "encoder.load_model(Path('encoder/saved_models/pretrained.pt'))\n", + "print(\"Testing your configuration with small inputs.\")\n", + "# Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's\n", + "# sampling rate, which may differ.\n", + "# If you're unfamiliar with digital audio, know that it is encoded as an array of floats \n", + "# (or sometimes integers, but mostly floats in this projects) ranging from -1 to 1.\n", + "# The sampling rate is the number of values (samples) recorded per second, it is set to\n", + "# 16000 for the encoder. 
Creating an array of length <sampling_rate> will always correspond \n",
+    "# to an audio of 1 second.\n",
+    "print(\"\\tTesting the encoder...\")\n",
+    "\n",
+    "wav = np.zeros(encoder.sampling_rate) \n",
+    "embed = encoder.embed_utterance(wav)\n",
+    "print(embed.shape)\n",
+    "\n",
+    "# Embeddings are L2-normalized (this isn't important here, but if you want to make your own \n",
+    "# embeddings it will be).\n",
+    "#embed /= np.linalg.norm(embed) # for random embedding\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "SAVE_PATH = '../'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Set constants\n",
+    "DATASETS_NAME = ['vctk'] # list the datasets\n",
+    "DATASETS_PATH = ['../../../../../datasets/VCTK-Corpus-removed-silence/']\n",
+    "DATASETS_METAFILE = ['']\n",
+    "USE_CUDA = True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "  0%|          | 0/44063 [00:00 Date: Wed, 5 Aug 2020 20:19:23 +0200
Subject: [PATCH 55/56] rebase fixes

---
 mozilla_voice_tts/bin/train_encoder.py       |  15 +-
 mozilla_voice_tts/speaker_encoder/loss.py    | 163 ----------------
 mozilla_voice_tts/speaker_encoder/losses.py  |   2 +-
 mozilla_voice_tts/tts/datasets/preprocess.py |  32 ++-
 mozilla_voice_tts/tts/models/tacotron.py     |   5 +-
 mozilla_voice_tts/tts/models/tacotron2.py    |  12 +-
 .../tts/models/tacotron_abstract.py          |   4 +-
 synthesize.py                                | 182 ------------------
 tests/test_tacotron2_model.py                |  59 +-----
 9 files changed, 41 insertions(+), 433 deletions(-)
 delete mode 100644 mozilla_voice_tts/speaker_encoder/loss.py
 delete mode 100644 synthesize.py

diff --git a/mozilla_voice_tts/bin/train_encoder.py b/mozilla_voice_tts/bin/train_encoder.py
index c89469b9..f9bfea7f 100644
--- a/mozilla_voice_tts/bin/train_encoder.py
+++ b/mozilla_voice_tts/bin/train_encoder.py
@@ -10,21 +10,21 @@ import traceback
 import torch
 from torch.utils.data import DataLoader
 
-from mozilla_voice_tts.generic_utils import count_parameters
 from mozilla_voice_tts.speaker_encoder.dataset import MyDataset
 from mozilla_voice_tts.speaker_encoder.generic_utils import save_best_model
-from mozilla_voice_tts.speaker_encoder.losses import GE2ELoss
+from mozilla_voice_tts.speaker_encoder.losses import GE2ELoss, AngleProtoLoss
 from mozilla_voice_tts.speaker_encoder.model import SpeakerEncoder
 from mozilla_voice_tts.speaker_encoder.visual import plot_embeddings
 from mozilla_voice_tts.tts.datasets.preprocess import load_meta_data
-from mozilla_voice_tts.tts.utils.audio import AudioProcessor
 from mozilla_voice_tts.tts.utils.generic_utils import (
     create_experiment_folder, get_git_branch, remove_experiment_folder,
     set_init_dict)
 from mozilla_voice_tts.tts.utils.io import copy_config_file, load_config
-from mozilla_voice_tts.tts.utils.radam import RAdam
-from mozilla_voice_tts.tts.utils.tensorboard_logger import TensorboardLogger
-from mozilla_voice_tts.tts.utils.training import NoamLR, check_update
+from mozilla_voice_tts.utils.audio import AudioProcessor
+from mozilla_voice_tts.utils.generic_utils import count_parameters
+from mozilla_voice_tts.utils.radam import RAdam
+from mozilla_voice_tts.utils.tensorboard_logger import TensorboardLogger
+from mozilla_voice_tts.utils.training import NoamLR, check_update
 
 torch.backends.cudnn.enabled = True
 torch.backends.cudnn.benchmark = True
@@ -146,7 +146,7 @@ def main(args):  # pylint: 
disable=redefined-outer-name elif c.loss == "angleproto": criterion = AngleProtoLoss() else: - raise Exception("The %s not is a loss supported" %c.loss) + raise Exception("The %s not is a loss supported" % c.loss) if args.restore_path: checkpoint = torch.load(args.restore_path) @@ -192,6 +192,7 @@ def main(args): # pylint: disable=redefined-outer-name _, global_step = train(model, criterion, optimizer, scheduler, ap, global_step) + if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument( diff --git a/mozilla_voice_tts/speaker_encoder/loss.py b/mozilla_voice_tts/speaker_encoder/loss.py deleted file mode 100644 index 6f83be63..00000000 --- a/mozilla_voice_tts/speaker_encoder/loss.py +++ /dev/null @@ -1,163 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F - - -# adapted from https://github.com/cvqluu/GE2E-Loss -class GE2ELoss(nn.Module): - def __init__(self, init_w=10.0, init_b=-5.0, loss_method="softmax"): - """ - Implementation of the Generalized End-to-End loss defined in https://arxiv.org/abs/1710.10467 [1] - Accepts an input of size (N, M, D) - where N is the number of speakers in the batch, - M is the number of utterances per speaker, - and D is the dimensionality of the embedding vector (e.g. d-vector) - Args: - - init_w (float): defines the initial value of w in Equation (5) of [1] - - init_b (float): definies the initial value of b in Equation (5) of [1] - """ - super(GE2ELoss, self).__init__() - # pylint: disable=E1102 - self.w = nn.Parameter(torch.tensor(init_w)) - # pylint: disable=E1102 - self.b = nn.Parameter(torch.tensor(init_b)) - self.loss_method = loss_method - - print('Initialised Generalized End-to-End loss') - - assert self.loss_method in ["softmax", "contrast"] - - if self.loss_method == "softmax": - self.embed_loss = self.embed_loss_softmax - if self.loss_method == "contrast": - self.embed_loss = self.embed_loss_contrast - - # pylint: disable=R0201 - def calc_new_centroids(self, dvecs, centroids, spkr, utt): - """ - Calculates the new centroids excluding the reference utterance - """ - excl = torch.cat((dvecs[spkr, :utt], dvecs[spkr, utt + 1 :])) - excl = torch.mean(excl, 0) - new_centroids = [] - for i, centroid in enumerate(centroids): - if i == spkr: - new_centroids.append(excl) - else: - new_centroids.append(centroid) - return torch.stack(new_centroids) - - def calc_cosine_sim(self, dvecs, centroids): - """ - Make the cosine similarity matrix with dims (N,M,N) - """ - cos_sim_matrix = [] - for spkr_idx, speaker in enumerate(dvecs): - cs_row = [] - for utt_idx, utterance in enumerate(speaker): - new_centroids = self.calc_new_centroids( - dvecs, centroids, spkr_idx, utt_idx - ) - # vector based cosine similarity for speed - cs_row.append( - torch.clamp( - torch.mm( - utterance.unsqueeze(1).transpose(0, 1), - new_centroids.transpose(0, 1), - ) - / (torch.norm(utterance) * torch.norm(new_centroids, dim=1)), - 1e-6, - ) - ) - cs_row = torch.cat(cs_row, dim=0) - cos_sim_matrix.append(cs_row) - return torch.stack(cos_sim_matrix) - - # pylint: disable=R0201 - def embed_loss_softmax(self, dvecs, cos_sim_matrix): - """ - Calculates the loss on each embedding $L(e_{ji})$ by taking softmax - """ - N, M, _ = dvecs.shape - L = [] - for j in range(N): - L_row = [] - for i in range(M): - L_row.append(-F.log_softmax(cos_sim_matrix[j, i], 0)[j]) - L_row = torch.stack(L_row) - L.append(L_row) - return torch.stack(L) - - # pylint: disable=R0201 - def embed_loss_contrast(self, dvecs, cos_sim_matrix): - """ - Calculates the loss on 
each embedding $L(e_{ji})$ by contrast loss with closest centroid - """ - N, M, _ = dvecs.shape - L = [] - for j in range(N): - L_row = [] - for i in range(M): - centroids_sigmoids = torch.sigmoid(cos_sim_matrix[j, i]) - excl_centroids_sigmoids = torch.cat( - (centroids_sigmoids[:j], centroids_sigmoids[j + 1 :]) - ) - L_row.append( - 1.0 - - torch.sigmoid(cos_sim_matrix[j, i, j]) - + torch.max(excl_centroids_sigmoids) - ) - L_row = torch.stack(L_row) - L.append(L_row) - return torch.stack(L) - - def forward(self, dvecs): - """ - Calculates the GE2E loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats) - """ - centroids = torch.mean(dvecs, 1) - cos_sim_matrix = self.calc_cosine_sim(dvecs, centroids) - torch.clamp(self.w, 1e-6) - cos_sim_matrix = self.w * cos_sim_matrix + self.b - L = self.embed_loss(dvecs, cos_sim_matrix) - return L.mean() - -# adapted from https://github.com/clovaai/voxceleb_trainer/blob/master/loss/angleproto.py -class AngleProtoLoss(nn.Module): - """ - Implementation of the Angular Prototypical loss defined in https://arxiv.org/abs/2003.11982 - Accepts an input of size (N, M, D) - where N is the number of speakers in the batch, - M is the number of utterances per speaker, - and D is the dimensionality of the embedding vector - Args: - - init_w (float): defines the initial value of w - - init_b (float): definies the initial value of b - """ - def __init__(self, init_w=10.0, init_b=-5.0): - super(AngleProtoLoss, self).__init__() - # pylint: disable=E1102 - self.w = nn.Parameter(torch.tensor(init_w)) - # pylint: disable=E1102 - self.b = nn.Parameter(torch.tensor(init_b)) - self.criterion = torch.nn.CrossEntropyLoss() - self.use_cuda = torch.cuda.is_available() - - print('Initialised Angular Prototypical loss') - - def forward(self, x): - """ - Calculates the AngleProto loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats) - """ - out_anchor = torch.mean(x[:,1:,:],1) - out_positive = x[:,0,:] - num_speakers = out_anchor.size()[0] - - cos_sim_matrix = F.cosine_similarity(out_positive.unsqueeze(-1).expand(-1,-1,num_speakers),out_anchor.unsqueeze(-1).expand(-1,-1,num_speakers).transpose(0,2)) - torch.clamp(self.w, 1e-6) - cos_sim_matrix = cos_sim_matrix * self.w + self.b - label = torch.from_numpy(np.asarray(range(0,num_speakers))) - if self.use_cuda: - label = label.cuda() - L = self.criterion(cos_sim_matrix, label) - return L \ No newline at end of file diff --git a/mozilla_voice_tts/speaker_encoder/losses.py b/mozilla_voice_tts/speaker_encoder/losses.py index 750648e5..35ff73fa 100644 --- a/mozilla_voice_tts/speaker_encoder/losses.py +++ b/mozilla_voice_tts/speaker_encoder/losses.py @@ -157,4 +157,4 @@ class AngleProtoLoss(nn.Module): cos_sim_matrix = cos_sim_matrix * self.w + self.b label = torch.from_numpy(np.asarray(range(0, num_speakers))).to(cos_sim_matrix.device) L = self.criterion(cos_sim_matrix, label) - return L \ No newline at end of file + return L diff --git a/mozilla_voice_tts/tts/datasets/preprocess.py b/mozilla_voice_tts/tts/datasets/preprocess.py index 7865652a..ece3bcb6 100644 --- a/mozilla_voice_tts/tts/datasets/preprocess.py +++ b/mozilla_voice_tts/tts/datasets/preprocess.py @@ -93,9 +93,10 @@ def mozilla_de(root_path, meta_file): def mailabs(root_path, meta_files=None): """Normalizes M-AI-Labs meta data files to TTS format""" - speaker_regex = re.compile("by_book/(male|female)/(?P[^/]+)/") + speaker_regex = re.compile( + "by_book/(male|female)/(?P[^/]+)/") if meta_files is None: - csv_files = 
glob(root_path+"/**/metadata.csv", recursive=True) + csv_files = glob(root_path + "/**/metadata.csv", recursive=True) else: csv_files = meta_files # meta_files = [f.strip() for f in meta_files.split(",")] @@ -115,12 +116,15 @@ def mailabs(root_path, meta_files=None): if meta_files is None: wav_file = os.path.join(folder, 'wavs', cols[0] + '.wav') else: - wav_file = os.path.join(root_path, folder.replace("metadata.csv", ""), 'wavs', cols[0] + '.wav') + wav_file = os.path.join(root_path, + folder.replace("metadata.csv", ""), + 'wavs', cols[0] + '.wav') if os.path.isfile(wav_file): text = cols[1].strip() items.append([text, wav_file, speaker_name]) else: - raise RuntimeError("> File %s does not exist!"%(wav_file)) + raise RuntimeError("> File %s does not exist!" % + (wav_file)) return items @@ -185,7 +189,8 @@ def libri_tts(root_path, meta_files=None): text = cols[1] items.append([text, wav_file, speaker_name]) for item in items: - assert os.path.exists(item[1]), f" [!] wav files don't exist - {item[1]}" + assert os.path.exists( + item[1]), f" [!] wav files don't exist - {item[1]}" return items @@ -197,7 +202,8 @@ def custom_turkish(root_path, meta_file): with open(txt_file, 'r', encoding='utf-8') as ttf: for line in ttf: cols = line.split('|') - wav_file = os.path.join(root_path, 'wavs', cols[0].strip() + '.wav') + wav_file = os.path.join(root_path, 'wavs', + cols[0].strip() + '.wav') if not os.path.exists(wav_file): skipped_files.append(wav_file) continue @@ -206,6 +212,7 @@ def custom_turkish(root_path, meta_file): print(f" [!] {len(skipped_files)} files skipped. They don't exist...") return items + # ToDo: add the dataset link when the dataset is released publicly def brspeech(root_path, meta_file): '''BRSpeech 3.0 beta''' @@ -223,20 +230,25 @@ def brspeech(root_path, meta_file): items.append([text, wav_file, speaker_name]) return items + def vctk(root_path, meta_files=None, wavs_path='wav48'): """homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz""" test_speakers = meta_files items = [] - meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) + meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", + recursive=True) for meta_file in meta_files: - _, speaker_id, txt_file = os.path.relpath(meta_file, root_path).split(os.sep) + _, speaker_id, txt_file = os.path.relpath(meta_file, + root_path).split(os.sep) file_id = txt_file.split('.')[0] - if isinstance(test_speakers, list): # if is list ignore this speakers ids + if isinstance(test_speakers, + list): # if is list ignore this speakers ids if speaker_id in test_speakers: continue with open(meta_file) as file_text: text = file_text.readlines()[0] - wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id+'.wav') + wav_file = os.path.join(root_path, wavs_path, speaker_id, + file_id + '.wav') items.append([text, wav_file, speaker_id]) return items \ No newline at end of file diff --git a/mozilla_voice_tts/tts/models/tacotron.py b/mozilla_voice_tts/tts/models/tacotron.py index ac88133b..1dcf2fc8 100644 --- a/mozilla_voice_tts/tts/models/tacotron.py +++ b/mozilla_voice_tts/tts/models/tacotron.py @@ -6,6 +6,7 @@ from mozilla_voice_tts.tts.layers.gst_layers import GST from mozilla_voice_tts.tts.layers.tacotron import Decoder, Encoder, PostCBHG from mozilla_voice_tts.tts.models.tacotron_abstract import TacotronAbstract + class Tacotron(TacotronAbstract): def __init__(self, num_chars, @@ -42,8 +43,8 @@ class Tacotron(TacotronAbstract): forward_attn, trans_agent, forward_attn_mask, location_attn, 
attn_K, separate_stopnet, bidirectional_decoder, double_decoder_consistency, - ddc_r, encoder_in_features, decoder_in_features, - speaker_embedding_dim, gst, gst_embedding_dim, + ddc_r, encoder_in_features, decoder_in_features, + speaker_embedding_dim, gst, gst_embedding_dim, gst_num_heads, gst_style_tokens) # speaker embedding layers diff --git a/mozilla_voice_tts/tts/models/tacotron2.py b/mozilla_voice_tts/tts/models/tacotron2.py index 9fa640b0..a9ba442c 100644 --- a/mozilla_voice_tts/tts/models/tacotron2.py +++ b/mozilla_voice_tts/tts/models/tacotron2.py @@ -1,15 +1,9 @@ import torch from torch import nn -<<<<<<< HEAD:mozilla_voice_tts/tts/models/tacotron2.py from mozilla_voice_tts.tts.layers.gst_layers import GST from mozilla_voice_tts.tts.layers.tacotron2 import Decoder, Encoder, Postnet from mozilla_voice_tts.tts.models.tacotron_abstract import TacotronAbstract -======= -from TTS.tts.layers.gst_layers import GST -from TTS.tts.layers.tacotron2 import Decoder, Encoder, Postnet -from TTS.tts.models.tacotron_abstract import TacotronAbstract ->>>>>>> bugfix in DDC now DDC work on Tacotron1:TTS/tts/models/tacotron2.py # TODO: match function arguments with tacotron class Tacotron2(TacotronAbstract): @@ -47,8 +41,8 @@ class Tacotron2(TacotronAbstract): forward_attn, trans_agent, forward_attn_mask, location_attn, attn_K, separate_stopnet, bidirectional_decoder, double_decoder_consistency, - ddc_r, encoder_in_features, decoder_in_features, - speaker_embedding_dim, gst, gst_embedding_dim, + ddc_r, encoder_in_features, decoder_in_features, + speaker_embedding_dim, gst, gst_embedding_dim, gst_num_heads, gst_style_tokens) # speaker embedding layer @@ -61,7 +55,7 @@ class Tacotron2(TacotronAbstract): # speaker and gst embeddings is concat in decoder input if self.num_speakers > 1: self.decoder_in_features += speaker_embedding_dim # add speaker embedding dim - + # embedding layer self.embedding = nn.Embedding(num_chars, 512, padding_idx=0) diff --git a/mozilla_voice_tts/tts/models/tacotron_abstract.py b/mozilla_voice_tts/tts/models/tacotron_abstract.py index 0077f3e4..d98d03b7 100644 --- a/mozilla_voice_tts/tts/models/tacotron_abstract.py +++ b/mozilla_voice_tts/tts/models/tacotron_abstract.py @@ -28,8 +28,8 @@ class TacotronAbstract(ABC, nn.Module): bidirectional_decoder=False, double_decoder_consistency=False, ddc_r=None, - encoder_in_features=512, - decoder_in_features=512, + encoder_in_features=512, + decoder_in_features=512, speaker_embedding_dim=None, gst=False, gst_embedding_dim=512, diff --git a/synthesize.py b/synthesize.py deleted file mode 100644 index bd720123..00000000 --- a/synthesize.py +++ /dev/null @@ -1,182 +0,0 @@ -# pylint: disable=redefined-outer-name, unused-argument -import os -import time -import argparse -import torch -import json -import string - -from TTS.utils.synthesis import synthesis -from TTS.utils.generic_utils import setup_model -from TTS.utils.io import load_config -from TTS.utils.text.symbols import make_symbols, symbols, phonemes -from TTS.utils.audio import AudioProcessor - - -def tts(model, - vocoder_model, - C, - VC, - text, - ap, - ap_vocoder, - use_cuda, - batched_vocoder, - speaker_id=None, - figures=False): - t_1 = time.time() - use_vocoder_model = vocoder_model is not None - waveform, alignment, _, postnet_output, stop_tokens, _ = synthesis( - model, text, C, use_cuda, ap, speaker_id, style_wav=C.gst['gst_style_input'], - truncated=False, enable_eos_bos_chars=C.enable_eos_bos_chars, - use_griffin_lim=(not use_vocoder_model), do_trim_silence=True) - - if 
C.model == "Tacotron" and use_vocoder_model: - postnet_output = ap.out_linear_to_mel(postnet_output.T).T - # correct if there is a scale difference b/w two models - if use_vocoder_model: - postnet_output = ap._denormalize(postnet_output) - postnet_output = ap_vocoder._normalize(postnet_output) - vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0) - waveform = vocoder_model.generate( - vocoder_input.cuda() if use_cuda else vocoder_input, - batched=batched_vocoder, - target=8000, - overlap=400) - print(" > Run-time: {}".format(time.time() - t_1)) - return alignment, postnet_output, stop_tokens, waveform - - -if __name__ == "__main__": - - global symbols, phonemes - - parser = argparse.ArgumentParser() - parser.add_argument('text', type=str, help='Text to generate speech.') - parser.add_argument('config_path', - type=str, - help='Path to model config file.') - parser.add_argument( - 'model_path', - type=str, - help='Path to model file.', - ) - parser.add_argument( - 'out_path', - type=str, - help='Path to save final wav file. Wav file will be names as the text given.', - ) - parser.add_argument('--use_cuda', - type=bool, - help='Run model on CUDA.', - default=False) - parser.add_argument( - '--vocoder_path', - type=str, - help= - 'Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).', - default="", - ) - parser.add_argument('--vocoder_config_path', - type=str, - help='Path to vocoder model config file.', - default="") - parser.add_argument( - '--batched_vocoder', - type=bool, - help="If True, vocoder model uses faster batch processing.", - default=True) - parser.add_argument('--speakers_json', - type=str, - help="JSON file for multi-speaker model.", - default="") - parser.add_argument( - '--speaker_id', - type=int, - help="target speaker_id if the model is multi-speaker.", - default=None) - args = parser.parse_args() - - if args.vocoder_path != "": - assert args.use_cuda, " [!] Enable cuda for vocoder." 
- from WaveRNN.models.wavernn import Model as VocoderModel - - # load the config - C = load_config(args.config_path) - C.forward_attn_mask = True - - # load the audio processor - ap = AudioProcessor(**C.audio) - - # if the vocabulary was passed, replace the default - if 'characters' in C.keys(): - symbols, phonemes = make_symbols(**C.characters) - - # load speakers - if args.speakers_json != '': - speakers = json.load(open(args.speakers_json, 'r')) - num_speakers = len(speakers) - else: - num_speakers = 0 - - # load the model - num_chars = len(phonemes) if C.use_phonemes else len(symbols) - model = setup_model(num_chars, num_speakers, C) - cp = torch.load(args.model_path) - model.load_state_dict(cp['model']) - model.eval() - if args.use_cuda: - model.cuda() - model.decoder.set_r(cp['r']) - - # load vocoder model - if args.vocoder_path != "": - VC = load_config(args.vocoder_config_path) - ap_vocoder = AudioProcessor(**VC.audio) - bits = 10 - vocoder_model = VocoderModel(rnn_dims=512, - fc_dims=512, - mode=VC.mode, - mulaw=VC.mulaw, - pad=VC.pad, - upsample_factors=VC.upsample_factors, - feat_dims=VC.audio["num_mels"], - compute_dims=128, - res_out_dims=128, - res_blocks=10, - hop_length=ap.hop_length, - sample_rate=ap.sample_rate, - use_aux_net=True, - use_upsample_net=True) - - check = torch.load(args.vocoder_path) - vocoder_model.load_state_dict(check['model']) - vocoder_model.eval() - if args.use_cuda: - vocoder_model.cuda() - else: - vocoder_model = None - VC = None - ap_vocoder = None - - # synthesize voice - print(" > Text: {}".format(args.text)) - _, _, _, wav = tts(model, - vocoder_model, - C, - VC, - args.text, - ap, - ap_vocoder, - args.use_cuda, - args.batched_vocoder, - speaker_id=args.speaker_id, - figures=False) - - # save the results - file_name = args.text.replace(" ", "_") - file_name = file_name.translate( - str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav' - out_path = os.path.join(args.out_path, file_name) - print(" > Saving output to {}".format(out_path)) - ap.save_wav(wav, out_path) diff --git a/tests/test_tacotron2_model.py b/tests/test_tacotron2_model.py index 0ff79f6e..28d39de5 100644 --- a/tests/test_tacotron2_model.py +++ b/tests/test_tacotron2_model.py @@ -76,61 +76,6 @@ class TacotronTrainTest(unittest.TestCase): count += 1 -class TacotronGSTTrainTest(unittest.TestCase): - def test_train_step(self): - input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) - input_lengths = torch.randint(100, 128, (8, )).long().to(device) - input_lengths = torch.sort(input_lengths, descending=True)[0] - mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device) - mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device) - mel_lengths = torch.randint(20, 30, (8, )).long().to(device) - mel_lengths[0] = 30 - stop_targets = torch.zeros(8, 30, 1).float().to(device) - speaker_ids = torch.randint(0, 5, (8, )).long().to(device) - - for idx in mel_lengths: - stop_targets[:, int(idx.item()):, 0] = 1.0 - - stop_targets = stop_targets.view(input_dummy.shape[0], - stop_targets.size(1) // c.r, -1) - stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() - - criterion = MSELossMasked(seq_len_norm=False).to(device) - criterion_st = nn.BCEWithLogitsLoss().to(device) - model = Tacotron2(num_chars=24, - gst=True, - r=c.r, - num_speakers=5).to(device) - model.train() - model_ref = copy.deepcopy(model) - count = 0 - for param, param_ref in zip(model.parameters(), - model_ref.parameters()): - assert (param - param_ref).sum() == 0, param - 
count += 1 - optimizer = optim.Adam(model.parameters(), lr=c.lr) - for i in range(5): - mel_out, mel_postnet_out, align, stop_tokens = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids) - assert torch.sigmoid(stop_tokens).data.max() <= 1.0 - assert torch.sigmoid(stop_tokens).data.min() >= 0.0 - optimizer.zero_grad() - loss = criterion(mel_out, mel_spec, mel_lengths) - stop_loss = criterion_st(stop_tokens, stop_targets) - loss = loss + criterion(mel_postnet_out, mel_postnet_spec, mel_lengths) + stop_loss - loss.backward() - optimizer.step() - # check parameter changes - count = 0 - for param, param_ref in zip(model.parameters(), - model_ref.parameters()): - # ignore pre-higway layer since it works conditional - # if count not in [145, 59]: - assert (param != param_ref).any( - ), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref) - count += 1 - class MultiSpeakeTacotronTrainTest(unittest.TestCase): @staticmethod def test_train_step(): @@ -185,8 +130,8 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase): count += 1 class TacotronGSTTrainTest(unittest.TestCase): - @staticmethod - def test_train_step(): + #pylint: disable=no-self-use + def test_train_step(self): # with random gst mel style input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 128, (8, )).long().to(device) From 037fc91b2b09ff26dc40cad26f2b505ead90d197 Mon Sep 17 00:00:00 2001 From: erogol Date: Thu, 6 Aug 2020 14:23:17 +0200 Subject: [PATCH 56/56] remove redundant arguments --- mozilla_voice_tts/tts/utils/generic_utils.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/mozilla_voice_tts/tts/utils/generic_utils.py b/mozilla_voice_tts/tts/utils/generic_utils.py index fc35840d..f0b718fa 100644 --- a/mozilla_voice_tts/tts/utils/generic_utils.py +++ b/mozilla_voice_tts/tts/utils/generic_utils.py @@ -253,18 +253,6 @@ def check_config(c): check_argument('use_external_speaker_embedding_file', c, restricted=True, val_type=bool) check_argument('external_speaker_embedding_file', c, restricted=True, val_type=str) check_argument('use_gst', c, restricted=True, val_type=bool) - check_argument('gst_style_input', c, restricted=True, val_type=str) - check_argument('gst', c, restricted=True, val_type=dict) - check_argument('gst_embedding_dim', c['gst'], restricted=True, val_type=int, min_val=1) - check_argument('gst_num_heads', c['gst'], restricted=True, val_type=int, min_val=1) - check_argument('gst_style_tokens', c['gst'], restricted=True, val_type=int, min_val=1) - - check_argument('gst', c, restricted=True, val_type=dict) - check_argument('gst_style_input', c['gst'], restricted=True, val_type=[str, dict]) - check_argument('gst_embedding_dim', c['gst'], restricted=True, val_type=int, min_val=0, max_val=1000) - check_argument('gst_num_heads', c['gst'], restricted=True, val_type=int, min_val=2, max_val=10) - check_argument('gst_style_tokens', c['gst'], restricted=True, val_type=int, min_val=1, max_val=1000) - check_argument('gst', c, restricted=True, val_type=dict) check_argument('gst_style_input', c['gst'], restricted=True, val_type=[str, dict]) check_argument('gst_embedding_dim', c['gst'], restricted=True, val_type=int, min_val=0, max_val=1000)
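
A minimal usage sketch of the refactored speaker-encoder losses referenced by the patches above. This is illustrative only and not part of any commit: it assumes the mozilla_voice_tts package is importable and that GE2ELoss and AngleProtoLoss keep the constructor signatures shown in the removed loss.py; the tensor sizes below are made-up example values.

import torch

from mozilla_voice_tts.speaker_encoder.losses import AngleProtoLoss, GE2ELoss

# Both losses expect embeddings shaped (num_speakers, utterances_per_speaker, embedding_dim),
# i.e. d-vectors produced by SpeakerEncoder and grouped per speaker.
dummy_embeddings = torch.randn(4, 5, 256)

ge2e_criterion = GE2ELoss(init_w=10.0, init_b=-5.0, loss_method="softmax")  # selected by "loss": "ge2e"
angle_proto_criterion = AngleProtoLoss(init_w=10.0, init_b=-5.0)            # selected by "loss": "angleproto"

print(ge2e_criterion(dummy_embeddings))         # scalar loss tensor
print(angle_proto_criterion(dummy_embeddings))  # scalar loss tensor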