coqui-tts/notebooks/Tortoise.ipynb

1.8 MiB

None <html lang="en"> <head> </head>
In [1]:
#@title # Setup
# Imports used through the rest of the notebook.
import torch
import torchaudio
import torch.nn as nn
import torch.nn.functional as F

import IPython

from TTS.tts.models.tortoise import TextToSpeech
from TTS.tts.layers.tortoise.audio_utils import load_audio, load_voice, load_voices

# This will download all the models used by Tortoise from the HuggingFace hub.
tts = TextToSpeech()
2023-04-22 16:58:42.388656: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-22 16:58:43.345225: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
In [2]:
# This is the text that will be spoken.
text = "Joining two modalities results in a surprising increase in generalization! What would happen if we combined them all?" #@param {type:"string"}
#@markdown Show code for multiline text input
# Here's something for the poetically inclined.. (set text=)
"""
Then took the other, as just as fair,
And having perhaps the better claim,
Because it was grassy and wanted wear;
Though as for that the passing there
Had worn them really about the same,"""

# Pick a "preset mode" to determine quality. Options: {"ultra_fast", "fast" (default), "standard", "high_quality"}. See docs in api.py
preset = "fast" #@param ["ultra_fast", "fast", "standard", "high_quality"]
In [3]:
%ls ../TTS/tts/utils/assets/tortoise/voices/
import IPython
IPython.display.Audio(filename='../TTS/tts/utils/assets/tortoise/voices/tom/1.wav')
angie/                freeman/  myself/        tom/            train_grace/
applejack/            geralt/   pat/           train_atkins/   train_kennard/
cond_latent_example/  halle/    pat2/          train_daws/     train_lescault/
daniel/               jlaw/     rainbow/       train_dotrice/  train_mouse/
deniro/               lj/       snakes/        train_dreams/   weaver/
emma/                 mol/      tim_reynolds/  train_empire/   william/
Out[3]:
Your browser does not support the audio element.
In [4]:
#@markdown Pick one of the voices from the output above
voice = 'tom' #@param {type:"string"}

#@markdown Load it and send it through Tortoise.
voice_samples, conditioning_latents = load_voice(voice)
gen = tts.tts_with_preset(text, voice_samples=voice_samples, conditioning_latents=conditioning_latents, 
                          preset=preset)
torchaudio.save('generated.wav', gen.squeeze(0).cpu(), 24000)
IPython.display.Audio('generated.wav')
mode 0
/home/manmay/anaconda3/envs/tts/lib/python3.8/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error.
Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /opt/conda/conda-bld/pytorch_1678402379298/work/aten/src/ATen/native/SpectralOps.cpp:862.)
  return _VF.stft(input, n_fft, hop_length, win_length, window,  # type: ignore[attr-defined]
Generating autoregressive samples..
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [01:05<00:00,  4.11s/it]
Computing best candidates using CLVP
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:05<00:00,  3.14it/s]
Transforming autoregressive outputs into audio..
  0%|          | 0/50 [00:00<?, ?it/s]
Out[4]:
Your browser does not support the audio element.
In [ ]:
 
</html>