mirror of https://github.com/coqui-ai/TTS.git
small fixes and test fixes
This commit is contained in:
parent
a6259041d3
commit
d382d759b3
|
@ -514,7 +514,7 @@ if __name__ == "__main__":
|
||||||
new_fields["restore_path"] = args.restore_path
|
new_fields["restore_path"] = args.restore_path
|
||||||
new_fields["github_branch"] = get_git_branch()
|
new_fields["github_branch"] = get_git_branch()
|
||||||
copy_model_files(
|
copy_model_files(
|
||||||
args.config_path, os.path.join(OUT_PATH, "config.json"), new_fields
|
c, args.config_path, OUT_PATH, new_fields
|
||||||
)
|
)
|
||||||
os.chmod(AUDIO_PATH, 0o775)
|
os.chmod(AUDIO_PATH, 0o775)
|
||||||
os.chmod(OUT_PATH, 0o775)
|
os.chmod(OUT_PATH, 0o775)
|
||||||
|
|
|
@ -122,7 +122,7 @@ class Encoder(nn.Module):
|
||||||
num_layers=3,
|
num_layers=3,
|
||||||
dropout_p=0.5)
|
dropout_p=0.5)
|
||||||
# text encoder
|
# text encoder
|
||||||
self.encoder = Transformer(self.hidden_channels, **encoder_params) # pylint: disable=unexpected-keyword-arg
|
self.encoder = RelativePositionTransformer(self.hidden_channels, **encoder_params) # pylint: disable=unexpected-keyword-arg
|
||||||
elif encoder_type.lower() == 'residual_conv_bn':
|
elif encoder_type.lower() == 'residual_conv_bn':
|
||||||
self.pre = nn.Sequential(
|
self.pre = nn.Sequential(
|
||||||
nn.Conv1d(self.in_channels, self.hidden_channels, 1),
|
nn.Conv1d(self.in_channels, self.hidden_channels, 1),
|
||||||
|
|
|
@ -169,7 +169,6 @@ class SpeedySpeech(nn.Module):
|
||||||
dr: [B, T_max]
|
dr: [B, T_max]
|
||||||
g: [B, C]
|
g: [B, C]
|
||||||
"""
|
"""
|
||||||
breakpoint()
|
|
||||||
o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g)
|
o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g)
|
||||||
o_dr_log = self.duration_predictor(o_en_dp.detach(), x_mask)
|
o_dr_log = self.duration_predictor(o_en_dp.detach(), x_mask)
|
||||||
o_de, attn= self._forward_decoder(o_en, o_en_dp, dr, x_mask, y_lengths, g=g)
|
o_de, attn= self._forward_decoder(o_en, o_en_dp, dr, x_mask, y_lengths, g=g)
|
||||||
|
|
|
@ -86,7 +86,19 @@
|
||||||
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
|
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
|
||||||
"seq_len_norm": false, // Normalize each sample's loss by its length to alleviate imbalanced datasets. Use it if your dataset is small or has a skewed distribution of sequence lengths.
|
"seq_len_norm": false, // Normalize each sample's loss by its length to alleviate imbalanced datasets. Use it if your dataset is small or has a skewed distribution of sequence lengths.
|
||||||
|
|
||||||
"encoder_type": "transformer",
|
"hidden_channels_encoder": 192,
|
||||||
|
"hidden_channels_decoder": 192,
|
||||||
|
"hidden_channels_duration_predictor": 256,
|
||||||
|
"use_encoder_prenet": true,
|
||||||
|
"encoder_type": "rel_pos_transformer",
|
||||||
|
"encoder_params": {
|
||||||
|
"kernel_size":3,
|
||||||
|
"dropout_p": 0.1,
|
||||||
|
"num_layers": 6,
|
||||||
|
"num_heads": 2,
|
||||||
|
"hidden_channels_ffn": 768,
|
||||||
|
"input_length": null
|
||||||
|
},
|
||||||
|
|
||||||
// TENSORBOARD and LOGGING
|
// TENSORBOARD and LOGGING
|
||||||
"print_step": 25, // Number of steps to log training on console.
|
"print_step": 25, // Number of steps to log training on console.
|
||||||
|
@ -107,6 +119,7 @@
|
||||||
"max_seq_len": 500, // DATASET-RELATED: maximum text length
|
"max_seq_len": 500, // DATASET-RELATED: maximum text length
|
||||||
"compute_f0": false, // compute f0 values in data-loader
|
"compute_f0": false, // compute f0 values in data-loader
|
||||||
"compute_input_seq_cache": true,
|
"compute_input_seq_cache": true,
|
||||||
|
"use_noise_augment": true,
|
||||||
|
|
||||||
// PATHS
|
// PATHS
|
||||||
"output_path": "tests/train_outputs/",
|
"output_path": "tests/train_outputs/",
|
||||||
|
|
|
@ -64,6 +64,8 @@
|
||||||
|
|
||||||
// MODEL PARAMETERS
|
// MODEL PARAMETERS
|
||||||
"positional_encoding": true,
|
"positional_encoding": true,
|
||||||
|
"hidden_channels": 128,
|
||||||
|
"encoder_type": "residual_conv_bn",
|
||||||
"encoder_type": "residual_conv_bn",
|
"encoder_type": "residual_conv_bn",
|
||||||
"encoder_params":{
|
"encoder_params":{
|
||||||
"kernel_size": 4,
|
"kernel_size": 4,
|
||||||
|
@ -71,13 +73,15 @@
|
||||||
"num_conv_blocks": 2,
|
"num_conv_blocks": 2,
|
||||||
"num_res_blocks": 13
|
"num_res_blocks": 13
|
||||||
},
|
},
|
||||||
"decoder_residual_conv_bn_params":{
|
"decoder_type": "residual_conv_bn",
|
||||||
|
"decoder_params":{
|
||||||
"kernel_size": 4,
|
"kernel_size": 4,
|
||||||
"dilations": [1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1],
|
"dilations": [1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1],
|
||||||
"num_conv_blocks": 2,
|
"num_conv_blocks": 2,
|
||||||
"num_res_blocks": 17
|
"num_res_blocks": 17
|
||||||
},
|
},
|
||||||
|
|
||||||
|
|
||||||
// TRAINING
|
// TRAINING
|
||||||
"batch_size":64, // Batch size for training. Values lower than 32 might make attention hard to learn. It is overwritten by 'gradual_training'.
|
"batch_size":64, // Batch size for training. Values lower than 32 might make attention hard to learn. It is overwritten by 'gradual_training'.
|
||||||
"eval_batch_size":32,
|
"eval_batch_size":32,
|
||||||
|
|
|
@ -42,14 +42,22 @@ class GlowTTSTrainTest(unittest.TestCase):
|
||||||
criterion = criterion = GlowTTSLoss()
|
criterion = criterion = GlowTTSLoss()
|
||||||
|
|
||||||
# model to train
|
# model to train
|
||||||
model = GlowTts(num_chars=32,
|
model = GlowTts(
|
||||||
hidden_channels=128,
|
num_chars=32,
|
||||||
hidden_channels_ffn=32,
|
hidden_channels_enc=128,
|
||||||
|
hidden_channels_dec=128,
|
||||||
hidden_channels_dp=32,
|
hidden_channels_dp=32,
|
||||||
out_channels=80,
|
out_channels=80,
|
||||||
num_heads=2,
|
encoder_type='rel_pos_transformer',
|
||||||
num_layers_enc=6,
|
encoder_params={
|
||||||
dropout_p=0.1,
|
'kernel_size': 3,
|
||||||
|
'dropout_p': 0.1,
|
||||||
|
'num_layers': 6,
|
||||||
|
'num_heads': 2,
|
||||||
|
'hidden_channels_ffn': 768, # 4 times the hidden_channels
|
||||||
|
'input_length': None
|
||||||
|
},
|
||||||
|
use_encoder_prenet=True,
|
||||||
num_flow_blocks_dec=12,
|
num_flow_blocks_dec=12,
|
||||||
kernel_size_dec=5,
|
kernel_size_dec=5,
|
||||||
dilation_rate=5,
|
dilation_rate=5,
|
||||||
|
@ -58,25 +66,27 @@ class GlowTTSTrainTest(unittest.TestCase):
|
||||||
num_speakers=0,
|
num_speakers=0,
|
||||||
c_in_channels=0,
|
c_in_channels=0,
|
||||||
num_splits=4,
|
num_splits=4,
|
||||||
num_sqz=1,
|
num_squeeze=1,
|
||||||
sigmoid_scale=False,
|
sigmoid_scale=False,
|
||||||
rel_attn_window_size=None,
|
mean_only=False).to(device)
|
||||||
input_length=None,
|
|
||||||
mean_only=False,
|
|
||||||
hidden_channels_enc=None,
|
|
||||||
hidden_channels_dec=None,
|
|
||||||
use_encoder_prenet=False,
|
|
||||||
encoder_type="transformer").to(device)
|
|
||||||
|
|
||||||
# reference model to compare model weights
|
# reference model to compare model weights
|
||||||
model_ref = GlowTts(num_chars=32,
|
model_ref = GlowTts(
|
||||||
hidden_channels=128,
|
num_chars=32,
|
||||||
hidden_channels_ffn=32,
|
hidden_channels_enc=128,
|
||||||
|
hidden_channels_dec=128,
|
||||||
hidden_channels_dp=32,
|
hidden_channels_dp=32,
|
||||||
out_channels=80,
|
out_channels=80,
|
||||||
num_heads=2,
|
encoder_type='rel_pos_transformer',
|
||||||
num_layers_enc=6,
|
encoder_params={
|
||||||
dropout_p=0.1,
|
'kernel_size': 3,
|
||||||
|
'dropout_p': 0.1,
|
||||||
|
'num_layers': 6,
|
||||||
|
'num_heads': 2,
|
||||||
|
'hidden_channels_ffn': 768, # 4 times the hidden_channels
|
||||||
|
'input_length': None
|
||||||
|
},
|
||||||
|
use_encoder_prenet=True,
|
||||||
num_flow_blocks_dec=12,
|
num_flow_blocks_dec=12,
|
||||||
kernel_size_dec=5,
|
kernel_size_dec=5,
|
||||||
dilation_rate=5,
|
dilation_rate=5,
|
||||||
|
@ -85,15 +95,9 @@ class GlowTTSTrainTest(unittest.TestCase):
|
||||||
num_speakers=0,
|
num_speakers=0,
|
||||||
c_in_channels=0,
|
c_in_channels=0,
|
||||||
num_splits=4,
|
num_splits=4,
|
||||||
num_sqz=1,
|
num_squeeze=1,
|
||||||
sigmoid_scale=False,
|
sigmoid_scale=False,
|
||||||
rel_attn_window_size=None,
|
mean_only=False).to(device)
|
||||||
input_length=None,
|
|
||||||
mean_only=False,
|
|
||||||
hidden_channels_enc=None,
|
|
||||||
hidden_channels_dec=None,
|
|
||||||
use_encoder_prenet=False,
|
|
||||||
encoder_type="transformer").to(device)
|
|
||||||
|
|
||||||
model.train()
|
model.train()
|
||||||
print(" > Num parameters for GlowTTS model:%s" %
|
print(" > Num parameters for GlowTTS model:%s" %
|
||||||
|
|
Loading…
Reference in New Issue