diff --git a/TTS/tts/utils/generic_utils.py b/TTS/tts/utils/generic_utils.py index ae7fdcbd..3a01f2a2 100644 --- a/TTS/tts/utils/generic_utils.py +++ b/TTS/tts/utils/generic_utils.py @@ -182,13 +182,14 @@ def check_config_tts(c): # loss parameters check_argument('loss_masking', c, restricted=True, val_type=bool) - check_argument('decoder_loss_alpha', c, restricted=True, val_type=float, min_val=0) - check_argument('postnet_loss_alpha', c, restricted=True, val_type=float, min_val=0) - check_argument('postnet_diff_spec_alpha', c, restricted=True, val_type=float, min_val=0) - check_argument('decoder_diff_spec_alpha', c, restricted=True, val_type=float, min_val=0) - check_argument('decoder_ssim_alpha', c, restricted=True, val_type=float, min_val=0) - check_argument('postnet_ssim_alpha', c, restricted=True, val_type=float, min_val=0) - check_argument('ga_alpha', c, restricted=True, val_type=float, min_val=0) + if c['model'].lower() in ['tacotron', 'tacotron2']: + check_argument('decoder_loss_alpha', c, restricted=True, val_type=float, min_val=0) + check_argument('postnet_loss_alpha', c, restricted=True, val_type=float, min_val=0) + check_argument('postnet_diff_spec_alpha', c, restricted=True, val_type=float, min_val=0) + check_argument('decoder_diff_spec_alpha', c, restricted=True, val_type=float, min_val=0) + check_argument('decoder_ssim_alpha', c, restricted=True, val_type=float, min_val=0) + check_argument('postnet_ssim_alpha', c, restricted=True, val_type=float, min_val=0) + check_argument('ga_alpha', c, restricted=True, val_type=float, min_val=0) # validation parameters check_argument('run_eval', c, restricted=True, val_type=bool) diff --git a/TTS/utils/generic_utils.py b/TTS/utils/generic_utils.py index 9ed2e20e..b530c1ec 100644 --- a/TTS/utils/generic_utils.py +++ b/TTS/utils/generic_utils.py @@ -4,6 +4,7 @@ import os import shutil import subprocess import contextlib +import platform import torch @@ -12,10 +13,10 @@ def set_amp_context(mixed_precision): if 
mixed_precision: cm = torch.cuda.amp.autocast() else: - if os.python.version<=3.6: + if tuple(map(int, platform.python_version_tuple()[:2])) < (3, 7): cm = contextlib.suppress() else: - cm = nullcontext() + cm = contextlib.nullcontext() return cm diff --git a/TTS/vocoder/models/wavernn.py b/TTS/vocoder/models/wavernn.py index 99053dfd..8aa84d34 100644 --- a/TTS/vocoder/models/wavernn.py +++ b/TTS/vocoder/models/wavernn.py @@ -271,9 +271,12 @@ class WaveRNN(nn.Module): with torch.no_grad(): if isinstance(mels, np.ndarray): - mels = torch.FloatTensor(mels).unsqueeze(0).to(device) - #mels = torch.FloatTensor(mels).cuda().unsqueeze(0) + mels = torch.FloatTensor(mels).to(device) + + if mels.ndim == 2: + mels = mels.unsqueeze(0) wave_len = (mels.size(-1) - 1) * self.hop_length + mels = self.pad_tensor(mels.transpose( 1, 2), pad=self.pad, side="both") mels, aux = self.upsample(mels.transpose(1, 2)) diff --git a/run_tests.sh b/run_tests.sh index 5cd89564..9abe7803 100755 --- a/run_tests.sh +++ b/run_tests.sh @@ -1,3 +1,4 @@ +set -e TF_CPP_MIN_LOG_LEVEL=3 # tests diff --git a/tests/inputs/test_train_config.json b/tests/inputs/test_train_config.json index 2e2d6d46..5b2dff2d 100644 --- a/tests/inputs/test_train_config.json +++ b/tests/inputs/test_train_config.json @@ -67,7 +67,7 @@ "gradual_training": [[0, 7, 4]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed. "loss_masking": true, // enable / disable loss masking against the sequence padding. "ga_alpha": 10.0, // weight for guided attention loss. If > 0, guided attention is enabled. - "apex_amp_level": null, + "mixed_precision": false, // VALIDATION "run_eval": true, @@ -75,14 +75,15 @@ "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences. 
// LOSS SETTINGS - "loss_masking": false, // enable / disable loss masking against the sequence padding. - "decoder_loss_alpha": 0.5, // decoder loss weight. If > 0, it is enabled - "postnet_loss_alpha": 0.25, // postnet loss weight. If > 0, it is enabled - "ga_alpha": 10.0, // weight for guided attention loss. If > 0, guided attention is enabled. - "decoder_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled + "loss_masking": true, // enable / disable loss masking against the sequence padding. + "decoder_loss_alpha": 0.5, // original decoder loss weight. If > 0, it is enabled + "postnet_loss_alpha": 0.25, // original postnet loss weight. If > 0, it is enabled "postnet_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled - "decoder_ssim_alpha": 0.5, // differential spectral loss weight. If > 0, it is enabled - "postnet_ssim_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled + "decoder_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled + "decoder_ssim_alpha": 0.5, // decoder ssim loss weight. If > 0, it is enabled + "postnet_ssim_alpha": 0.25, // postnet ssim loss weight. If > 0, it is enabled + "ga_alpha": 5.0, // weight for guided attention loss. If > 0, guided attention is enabled. + "stopnet_pos_weight": 15.0, // pos class weight for stopnet loss since there are way more negative samples than positive samples. // OPTIMIZER "noam_schedule": false, // use noam warmup and lr schedule. 
diff --git a/tests/inputs/test_vocoder_wavegrad.json b/tests/inputs/test_vocoder_wavegrad.json index f7da5980..8fa0bbe1 100644 --- a/tests/inputs/test_vocoder_wavegrad.json +++ b/tests/inputs/test_vocoder_wavegrad.json @@ -55,7 +55,8 @@ [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], - [1, 2, 4, 8]] + [1, 2, 4, 8]], + "use_weight_norm": true }, // DATASET diff --git a/tests/inputs/test_vocoder_wavernn_config.json b/tests/inputs/test_vocoder_wavernn_config.json index 28c0f059..9df32fef 100644 --- a/tests/inputs/test_vocoder_wavernn_config.json +++ b/tests/inputs/test_vocoder_wavernn_config.json @@ -35,7 +35,7 @@ }, // Generating / Synthesizing - "batched": true, + "batched": true, "target_samples": 11000, // target number of samples to be generated in each batch entry "overlap_samples": 550, // number of samples for crossfading between batches @@ -53,12 +53,25 @@ "mode": "mold", // mold [string], gauss [string], bits [int] "mulaw": false, // apply mulaw if mode is bits "padding": 2, // pad the input for resnet to see wider input length - + // DATASET //"use_gta": true, // use computed gta features from the tts model "data_path": "tests/data/ljspeech/wavs/", // path containing training wav files "feature_path": null, // path containing computed features from wav files if null compute them + // MODEL PARAMETERS + "wavernn_model_params": { + "rnn_dims": 512, + "fc_dims": 512, + "compute_dims": 128, + "res_out_dims": 128, + "num_res_blocks": 10, + "use_aux_net": true, + "use_upsample_net": true, + "upsample_factors": [4, 8, 8] // this needs to correctly factorise hop_length + }, + "mixed_precision": false, + // TRAINING "batch_size": 4, // Batch size for training. Lower values than 32 might cause hard to learn attention. "epochs": 1, // total number of epochs to train. @@ -86,7 +99,7 @@ // DATA LOADING "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. 
"num_val_loader_workers": 4, // number of evaluation data loader processes. - "eval_split_size": 10, // number of samples for testing + "eval_split_size": 10, // number of samples for testing // PATHS "output_path": "tests/train_outputs/" diff --git a/tests/test_glow-tts_train.sh b/tests/test_glow-tts_train.sh index c8dd3e22..add7292d 100755 --- a/tests/test_glow-tts_train.sh +++ b/tests/test_glow-tts_train.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash - +set -xe BASEDIR=$(dirname "$0") echo "$BASEDIR" # run training diff --git a/tests/test_tacotron_model.py b/tests/test_tacotron_model.py index 8309aa58..c56a6565 100644 --- a/tests/test_tacotron_model.py +++ b/tests/test_tacotron_model.py @@ -294,6 +294,7 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase): mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device) linear_spec = torch.rand(8, 30, c.audio['fft_size']).to(device) mel_lengths = torch.randint(20, 30, (8, )).long().to(device) + mel_lengths[-1] = mel_spec.size(1) stop_targets = torch.zeros(8, 30, 1).float().to(device) speaker_embeddings = torch.rand(8, 55).to(device) diff --git a/tests/test_tacotron_train.sh b/tests/test_tacotron_train.sh index 55379a1e..8138fb75 100755 --- a/tests/test_tacotron_train.sh +++ b/tests/test_tacotron_train.sh @@ -1,5 +1,6 @@ #!/usr/bin/env bash +set -xe BASEDIR=$(dirname "$0") echo "$BASEDIR" # run training diff --git a/tests/test_tts_train.sh b/tests/test_tts_train.sh index ed0871eb..9268ea96 100755 --- a/tests/test_tts_train.sh +++ b/tests/test_tts_train.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash - +set -xe BASEDIR=$(dirname "$0") echo "$BASEDIR" # run training diff --git a/tests/test_vocoder_gan_train.sh b/tests/test_vocoder_gan_train.sh index 474ef9a7..0ed2b599 100755 --- a/tests/test_vocoder_gan_train.sh +++ b/tests/test_vocoder_gan_train.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash - +set -xe BASEDIR=$(dirname "$0") echo "$BASEDIR" # create run dir diff --git a/tests/test_vocoder_wavegrad_train.sh 
b/tests/test_vocoder_wavegrad_train.sh index b5e6e451..33ffe865 100755 --- a/tests/test_vocoder_wavegrad_train.sh +++ b/tests/test_vocoder_wavegrad_train.sh @@ -1,9 +1,9 @@ #!/usr/bin/env bash - +set -xe BASEDIR=$(dirname "$0") echo "$BASEDIR" # create run dir -mkdir $BASEDIR/train_outputs +mkdir -p $BASEDIR/train_outputs # run training CUDA_VISIBLE_DEVICES="" python TTS/bin/train_vocoder_wavegrad.py --config_path $BASEDIR/inputs/test_vocoder_wavegrad.json # find the training folder diff --git a/tests/test_vocoder_wavernn.py b/tests/test_vocoder_wavernn.py index ccd71c56..2464cfa3 100644 --- a/tests/test_vocoder_wavernn.py +++ b/tests/test_vocoder_wavernn.py @@ -27,5 +27,5 @@ def test_wavernn(): dummy_y = torch.rand((80, y_size)) output = model(dummy_x, dummy_m) assert np.all(output.shape == (2, 1280, 4 * 256)), output.shape - output = model.generate(dummy_y, True, 5500, 550, False) + output = model.inference(dummy_y, True, 5500, 550) assert np.all(output.shape == (256 * (y_size - 1),)) diff --git a/tests/test_vocoder_wavernn_train.sh b/tests/test_vocoder_wavernn_train.sh index ffa30d40..40e86012 100755 --- a/tests/test_vocoder_wavernn_train.sh +++ b/tests/test_vocoder_wavernn_train.sh @@ -1,9 +1,9 @@ #!/usr/bin/env bash - +set -xe BASEDIR=$(dirname "$0") echo "$BASEDIR" # create run dir -mkdir $BASEDIR/train_outputs +mkdir -p $BASEDIR/train_outputs # run training CUDA_VISIBLE_DEVICES="" python TTS/bin/train_vocoder_wavernn.py --config_path $BASEDIR/inputs/test_vocoder_wavernn_config.json # find the training folder