diff --git a/TTS/bin/train_vocoder_wavernn.py b/TTS/bin/train_vocoder_wavernn.py index 6bd8e645..cad357dc 100644 --- a/TTS/bin/train_vocoder_wavernn.py +++ b/TTS/bin/train_vocoder_wavernn.py @@ -514,7 +514,7 @@ if __name__ == "__main__": new_fields["restore_path"] = args.restore_path new_fields["github_branch"] = get_git_branch() copy_model_files( - args.config_path, os.path.join(OUT_PATH, "config.json"), new_fields + c, args.config_path, OUT_PATH, new_fields ) os.chmod(AUDIO_PATH, 0o775) os.chmod(OUT_PATH, 0o775) diff --git a/TTS/tts/layers/speedy_speech/encoder.py b/TTS/tts/layers/speedy_speech/encoder.py index 9fd97514..02468626 100644 --- a/TTS/tts/layers/speedy_speech/encoder.py +++ b/TTS/tts/layers/speedy_speech/encoder.py @@ -122,7 +122,7 @@ class Encoder(nn.Module): num_layers=3, dropout_p=0.5) # text encoder - self.encoder = Transformer(self.hidden_channels, **encoder_params) # pylint: disable=unexpected-keyword-arg + self.encoder = RelativePositionTransformer(self.hidden_channels, **encoder_params) # pylint: disable=unexpected-keyword-arg elif encoder_type.lower() == 'residual_conv_bn': self.pre = nn.Sequential( nn.Conv1d(self.in_channels, self.hidden_channels, 1), diff --git a/TTS/tts/models/speedy_speech.py b/TTS/tts/models/speedy_speech.py index 11793fff..2e7d0a5f 100644 --- a/TTS/tts/models/speedy_speech.py +++ b/TTS/tts/models/speedy_speech.py @@ -169,7 +169,6 @@ class SpeedySpeech(nn.Module): dr: [B, T_max] g: [B, C] """ - breakpoint() o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) o_dr_log = self.duration_predictor(o_en_dp.detach(), x_mask) o_de, attn= self._forward_decoder(o_en, o_en_dp, dr, x_mask, y_lengths, g=g) diff --git a/tests/inputs/test_glow_tts.json b/tests/inputs/test_glow_tts.json index eb1a9ac4..ff8a81ea 100644 --- a/tests/inputs/test_glow_tts.json +++ b/tests/inputs/test_glow_tts.json @@ -86,7 +86,19 @@ "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" "seq_len_norm": false, // 
Normalize eash sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths. - "encoder_type": "transformer", + "hidden_channels_encoder": 192, + "hidden_channels_decoder": 192, + "hidden_channels_duration_predictor": 256, + "use_encoder_prenet": true, + "encoder_type": "rel_pos_transformer", + "encoder_params": { + "kernel_size":3, + "dropout_p": 0.1, + "num_layers": 6, + "num_heads": 2, + "hidden_channels_ffn": 768, + "input_length": null + }, // TENSORBOARD and LOGGING "print_step": 25, // Number of steps to log training on console. @@ -107,6 +119,7 @@ "max_seq_len": 500, // DATASET-RELATED: maximum text length "compute_f0": false, // compute f0 values in data-loader "compute_input_seq_cache": true, + "use_noise_augment": true, // PATHS "output_path": "tests/train_outputs/", diff --git a/tests/inputs/test_speedy_speech.json b/tests/inputs/test_speedy_speech.json index 40c2d5d9..2a4b3a45 100644 --- a/tests/inputs/test_speedy_speech.json +++ b/tests/inputs/test_speedy_speech.json @@ -64,6 +64,7 @@ // MODEL PARAMETERS "positional_encoding": true, + "hidden_channels": 128, "encoder_type": "residual_conv_bn", "encoder_params":{ "kernel_size": 4, @@ -71,13 +72,15 @@ "num_conv_blocks": 2, "num_res_blocks": 13 }, - "decoder_residual_conv_bn_params":{ + "decoder_type": "residual_conv_bn", + "decoder_params":{ "kernel_size": 4, "dilations": [1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1], "num_conv_blocks": 2, "num_res_blocks": 17 }, + // TRAINING "batch_size":64, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. 
"eval_batch_size":32, diff --git a/tests/test_glow_tts.py b/tests/test_glow_tts.py index 2b365d67..2d375031 100644 --- a/tests/test_glow_tts.py +++ b/tests/test_glow_tts.py @@ -42,58 +42,62 @@ class GlowTTSTrainTest(unittest.TestCase): criterion = criterion = GlowTTSLoss() # model to train - model = GlowTts(num_chars=32, - hidden_channels=128, - hidden_channels_ffn=32, - hidden_channels_dp=32, - out_channels=80, - num_heads=2, - num_layers_enc=6, - dropout_p=0.1, - num_flow_blocks_dec=12, - kernel_size_dec=5, - dilation_rate=5, - num_block_layers=4, - dropout_p_dec=0., - num_speakers=0, - c_in_channels=0, - num_splits=4, - num_sqz=1, - sigmoid_scale=False, - rel_attn_window_size=None, - input_length=None, - mean_only=False, - hidden_channels_enc=None, - hidden_channels_dec=None, - use_encoder_prenet=False, - encoder_type="transformer").to(device) + model = GlowTts( + num_chars=32, + hidden_channels_enc=128, + hidden_channels_dec=128, + hidden_channels_dp=32, + out_channels=80, + encoder_type='rel_pos_transformer', + encoder_params={ + 'kernel_size': 3, + 'dropout_p': 0.1, + 'num_layers': 6, + 'num_heads': 2, + 'hidden_channels_ffn': 768, # 4 times the hidden_channels + 'input_length': None + }, + use_encoder_prenet=True, + num_flow_blocks_dec=12, + kernel_size_dec=5, + dilation_rate=5, + num_block_layers=4, + dropout_p_dec=0., + num_speakers=0, + c_in_channels=0, + num_splits=4, + num_squeeze=1, + sigmoid_scale=False, + mean_only=False).to(device) # reference model to compare model weights - model_ref = GlowTts(num_chars=32, - hidden_channels=128, - hidden_channels_ffn=32, - hidden_channels_dp=32, - out_channels=80, - num_heads=2, - num_layers_enc=6, - dropout_p=0.1, - num_flow_blocks_dec=12, - kernel_size_dec=5, - dilation_rate=5, - num_block_layers=4, - dropout_p_dec=0., - num_speakers=0, - c_in_channels=0, - num_splits=4, - num_sqz=1, - sigmoid_scale=False, - rel_attn_window_size=None, - input_length=None, - mean_only=False, - hidden_channels_enc=None, - 
hidden_channels_dec=None, - use_encoder_prenet=False, - encoder_type="transformer").to(device) + model_ref = GlowTts( + num_chars=32, + hidden_channels_enc=128, + hidden_channels_dec=128, + hidden_channels_dp=32, + out_channels=80, + encoder_type='rel_pos_transformer', + encoder_params={ + 'kernel_size': 3, + 'dropout_p': 0.1, + 'num_layers': 6, + 'num_heads': 2, + 'hidden_channels_ffn': 768, # 4 times the hidden_channels + 'input_length': None + }, + use_encoder_prenet=True, + num_flow_blocks_dec=12, + kernel_size_dec=5, + dilation_rate=5, + num_block_layers=4, + dropout_p_dec=0., + num_speakers=0, + c_in_channels=0, + num_splits=4, + num_squeeze=1, + sigmoid_scale=False, + mean_only=False).to(device) model.train() print(" > Num parameters for GlowTTS model:%s" %