coqui-tts/notebooks/ExtractTTSpectrogram.ipynb


This notebook generates mel-spectrograms from a TTS model for use in vocoder training.

In [ ]:
import os
import sys
import torch
import importlib
import numpy as np
from tqdm import tqdm
from torch.utils.data import DataLoader
import soundfile as sf
import pickle
from TTS.tts.datasets.dataset import TTSDataset
from TTS.tts.layers.losses import L1LossMasked
from TTS.utils.audio import AudioProcessor
from TTS.config import load_config
from TTS.tts.utils.visual import plot_spectrogram
from TTS.tts.utils.helpers import sequence_mask
from TTS.tts.models import setup_model
from TTS.tts.utils.text.symbols import make_symbols, symbols, phonemes

%matplotlib inline

# Configure CUDA visibility
os.environ['CUDA_VISIBLE_DEVICES'] = '2'
In [ ]:
# Create output directories and build per-file output paths
def set_filename(wav_path, out_path):
    wav_file = os.path.basename(wav_path)
    file_name = os.path.splitext(wav_file)[0]
    os.makedirs(os.path.join(out_path, "quant"), exist_ok=True)
    os.makedirs(os.path.join(out_path, "mel"), exist_ok=True)
    os.makedirs(os.path.join(out_path, "wav_gl"), exist_ok=True)
    wavq_path = os.path.join(out_path, "quant", file_name)
    mel_path = os.path.join(out_path, "mel", file_name)
    wav_gl_path = os.path.join(out_path, "wav_gl", file_name)
    return file_name, wavq_path, mel_path, wav_gl_path
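
A quick usage sketch (the wav path and output directory below are made up for illustration):

In [ ]:
# Hypothetical paths; set_filename also creates the quant/, mel/ and
# wav_gl/ subfolders under the output directory as a side effect.
print(set_filename("/data/wavs/LJ001-0001.wav", "/tmp/specs"))
# -> ('LJ001-0001', '/tmp/specs/quant/LJ001-0001',
#     '/tmp/specs/mel/LJ001-0001', '/tmp/specs/wav_gl/LJ001-0001')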
In [ ]:
# Paths and configurations
OUT_PATH = "/home/ubuntu/TTS/recipes/ljspeech/LJSpeech-1.1/specs2/"
DATA_PATH = "/home/ubuntu/TTS/recipes/ljspeech/LJSpeech-1.1/"
DATASET = "ljspeech"
METADATA_FILE = "metadata.csv"
CONFIG_PATH = "/home/ubuntu/.local/share/tts/tts_models--en--ljspeech--tacotron2-DDC_ph/config.json"
MODEL_FILE = "/home/ubuntu/.local/share/tts/tts_models--en--ljspeech--tacotron2-DDC_ph/model_file.pth"
BATCH_SIZE = 32

QUANTIZED_WAV = False
QUANTIZE_BIT = None
DRY_RUN = False   # if True, skip writing output files and only compute loss and visuals.

# Check CUDA availability
use_cuda = torch.cuda.is_available()
print(" > CUDA enabled: ", use_cuda)

# Load the configuration
C = load_config(CONFIG_PATH)
C.audio['do_trim_silence'] = False  # IMPORTANT: disable silence trimming so mel specs stay aligned with the wav files
ap = AudioProcessor(bits=QUANTIZE_BIT, **C.audio)
print(C['r'])  # reduction factor: decoder frames predicted per step
In [ ]:
# If the vocabulary was passed, replace the default
if 'characters' in C and C['characters']:
    symbols, phonemes = make_symbols(**C.characters)

# Load the model
num_chars = len(phonemes) if C.use_phonemes else len(symbols)
# TODO: multiple speakers
model = setup_model(C)
model.load_checkpoint(C, MODEL_FILE, eval=True)
In [ ]:
# Load the preprocessor based on the dataset
preprocessor = importlib.import_module("TTS.tts.datasets.formatters")
preprocessor = getattr(preprocessor, DATASET.lower())
meta_data = preprocessor(DATA_PATH, METADATA_FILE)
dataset = TTSDataset(
    C,
    C.text_cleaner,
    False,
    ap,
    meta_data,
    characters=C.get('characters', None),
    use_phonemes=C.use_phonemes,
    phoneme_cache_path=C.phoneme_cache_path,
    enable_eos_bos=C.enable_eos_bos_chars,
)
loader = DataLoader(
    dataset, batch_size=BATCH_SIZE, num_workers=4, collate_fn=dataset.collate_fn, shuffle=False, drop_last=False
)
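
Before running the full extraction, it helps to peek at one batch and confirm the field order, which must match the unpacking used in the loop below (the order here mirrors this notebook's own unpacking, not a general API guarantee):

In [ ]:
# Fetch a single batch and print its tensor shapes.
text_input, text_lengths, _, linear_input, mel_input, mel_lengths, stop_targets, item_idx = next(iter(loader))
print(text_input.shape, text_lengths.shape, mel_input.shape, mel_lengths.shape)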
In [ ]:
# Initialize lists for storing results
file_idxs = []
metadata = []
losses = []
postnet_losses = []
criterion = L1LossMasked(seq_len_norm=C.seq_len_norm)

# Create log file
log_file_path = os.path.join(OUT_PATH, "log.txt")
log_file = open(log_file_path, "w")

Generate model outputs

In [ ]:
# Start processing with a progress bar
with torch.no_grad():
    for data in tqdm(loader, desc="Processing"):
        try:
            # setup input data
            text_input, text_lengths, _, linear_input, mel_input, mel_lengths, stop_targets, item_idx = data

            # dispatch data to GPU
            if use_cuda:
                text_input = text_input.cuda()
                text_lengths = text_lengths.cuda()
                mel_input = mel_input.cuda()
                mel_lengths = mel_lengths.cuda()

            mask = sequence_mask(text_lengths)
            mel_outputs, postnet_outputs, alignments, stop_tokens = model.forward(text_input, text_lengths, mel_input)

            # compute loss
            loss = criterion(mel_outputs, mel_input, mel_lengths)
            loss_postnet = criterion(postnet_outputs, mel_input, mel_lengths)
            losses.append(loss.item())
            postnet_losses.append(loss_postnet.item())

            # compute mel specs from the linear spec if the model is Tacotron
            if C.model == "Tacotron":
                mel_specs = []
                postnet_outputs = postnet_outputs.data.cpu().numpy()
                for b in range(postnet_outputs.shape[0]):
                    postnet_output = postnet_outputs[b]
                    mel_specs.append(torch.FloatTensor(ap.out_linear_to_mel(postnet_output.T).T))
                # stack on CPU and convert to numpy so both branches match downstream
                postnet_outputs = torch.stack(mel_specs).numpy()
            elif C.model == "Tacotron2":
                postnet_outputs = postnet_outputs.detach().cpu().numpy()
            alignments = alignments.detach().cpu().numpy()

            if not DRY_RUN:
                for idx in range(text_input.shape[0]):
                    wav_file_path = item_idx[idx]
                    wav = ap.load_wav(wav_file_path)
                    file_name, wavq_path, mel_path, wav_path = set_filename(wav_file_path, OUT_PATH)
                    file_idxs.append(file_name)

                    # quantize and save wav
                    if QUANTIZED_WAV:
                        wavq = ap.quantize(wav)
                        np.save(wavq_path, wavq)

                    # save TTS mel
                    mel = postnet_outputs[idx]
                    mel_length = mel_lengths[idx]
                    mel = mel[:mel_length, :].T
                    np.save(mel_path, mel)

                    metadata.append([wav_file_path, mel_path])

        except Exception as e:
            log_file.write(f"Error processing data: {str(e)}\n")

    # Calculate and log mean losses
    mean_loss = np.mean(losses)
    mean_postnet_loss = np.mean(postnet_losses)
    log_file.write(f"Mean Loss: {mean_loss}\n")
    log_file.write(f"Mean Postnet Loss: {mean_postnet_loss}\n")

# Close the log file
log_file.close()

# For wavernn
if not DRY_RUN:
    pickle.dump(file_idxs, open(os.path.join(OUT_PATH, "dataset_ids.pkl"), "wb"))

# For pwgan
with open(os.path.join(OUT_PATH, "metadata.txt"), "w") as f:
    for data in metadata:
        f.write(f"{data[0]}|{data[1]}.npy\n")

# Print mean losses
print(f"Mean Loss: {mean_loss}")
print(f"Mean Postnet Loss: {mean_postnet_loss}")

Sanity Check

In [ ]:
idx = 1
ap.melspectrogram(ap.load_wav(item_idx[idx])).shape
In [ ]:
import soundfile as sf
wav, sr = sf.read(item_idx[idx])
mel_postnet = postnet_outputs[idx][:mel_lengths[idx], :]
mel_decoder = mel_outputs[idx][:mel_lengths[idx], :].detach().cpu().numpy()
mel_truth = ap.melspectrogram(wav)
print(mel_truth.shape)
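
If extraction is aligned, the ground-truth frame count should be close to the model's output length for the same item (a rough check; padding and the reduction factor can add a few frames):

In [ ]:
print(f"GT frames: {mel_truth.shape[1]} | model frames: {mel_lengths[idx].item()}")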
In [ ]:
# plot postnet output
print(mel_postnet.shape)
plot_spectrogram(mel_postnet, ap)
In [ ]:
# plot decoder output
print(mel_decoder.shape)
plot_spectrogram(mel_decoder, ap)
In [ ]:
# plot GT spectrogram
print(mel_truth.shape)
plot_spectrogram(mel_truth.T, ap)
In [ ]:
# postnet vs decoder output diff
import matplotlib.pyplot as plt
mel_diff = mel_decoder - mel_postnet
plt.figure(figsize=(16, 10))
plt.imshow(abs(mel_diff).T, aspect="auto", origin="lower")
plt.colorbar()
plt.tight_layout()
In [ ]:
# GT vs decoder output diff
import matplotlib.pyplot as plt
# trim both to the shorter length; GT and model frame counts can differ slightly
mel_diff2 = mel_truth.T[:mel_decoder.shape[0]] - mel_decoder[:mel_truth.shape[1]]
plt.figure(figsize=(16, 10))
plt.imshow(abs(mel_diff2).T, aspect="auto", origin="lower")
plt.colorbar()
plt.tight_layout()
In [ ]:
# GT vs postnet output diff
import matplotlib.pyplot as plt
mel = postnet_outputs[idx]
# trim both to the shorter length, as above
mel_diff2 = mel_truth.T[:mel.shape[0]] - mel[:mel_truth.shape[1]]
plt.figure(figsize=(16, 10))
plt.imshow(abs(mel_diff2).T, aspect="auto", origin="lower")
plt.colorbar()
plt.tight_layout()