mirror of https://github.com/coqui-ai/TTS.git
In [1]:
#@title # Setup
# Imports used through the rest of the notebook.
import torch
import torchaudio
import torch.nn as nn
import torch.nn.functional as F

import IPython

from TTS.tts.models.tortoise import TextToSpeech
from TTS.tts.layers.tortoise.audio_utils import load_audio, load_voice, load_voices

# This will download all the models used by Tortoise from the HuggingFace hub.
tts = TextToSpeech()
2023-04-22 16:58:42.388656: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-22 16:58:43.345225: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
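Tortoise is slow on CPU, so before moving on it can be worth confirming that a GPU is visible to PyTorch. This quick check is not part of the original notebook cells, just plain PyTorch:

# Optional sanity check (plain PyTorch, not part of the Tortoise API).
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))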
In [2]:
# This is the text that will be spoken.
text = "Joining two modalities results in a surprising increase in generalization! What would happen if we combined them all?" #@param {type:"string"}

#@markdown Show code for multiline text input
# Here's something for the poetically inclined.. (set text=)
"""
Then took the other, as just as fair,
And having perhaps the better claim,
Because it was grassy and wanted wear;
Though as for that the passing there
Had worn them really about the same,"""

# Pick a "preset mode" to determine quality. Options: {"ultra_fast", "fast" (default), "standard", "high_quality"}. See docs in api.py.
preset = "fast" #@param ["ultra_fast", "fast", "standard", "high_quality"]
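Note that the poem above sits in a bare string literal, so it has no effect on its own; if you want Tortoise to read it instead of the default sentence, assign it to text yourself, for example:

# Use the multiline sample instead of the single-sentence default.
text = """Then took the other, as just as fair,
And having perhaps the better claim,
Because it was grassy and wanted wear;
Though as for that the passing there
Had worn them really about the same,"""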
In [3]:
%ls ../TTS/tts/utils/assets/tortoise/voices/

import IPython
IPython.display.Audio(filename='../TTS/tts/utils/assets/tortoise/voices/tom/1.wav')
angie/                freeman/  myself/        tom/            train_grace/
applejack/            geralt/   pat/           train_atkins/   train_kennard/
cond_latent_example/  halle/    pat2/          train_daws/     train_lescault/
daniel/               jlaw/     rainbow/       train_dotrice/  train_mouse/
deniro/               lj/       snakes/        train_dreams/   weaver/
emma/                 mol/      tim_reynolds/  train_empire/   william/
Out[3]:
[audio player: ../TTS/tts/utils/assets/tortoise/voices/tom/1.wav]
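If the %ls magic is unavailable (for example when running outside IPython), the same listing can be produced with plain Python; the path below is the one used in the cell above:

import os

voices_dir = '../TTS/tts/utils/assets/tortoise/voices/'
print(sorted(os.listdir(voices_dir)))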
In [4]:
#@markdown Pick one of the voices from the output above
voice = 'tom' #@param {type:"string"}

#@markdown Load it and send it through Tortoise.
voice_samples, conditioning_latents = load_voice(voice)
gen = tts.tts_with_preset(text,
                          voice_samples=voice_samples,
                          conditioning_latents=conditioning_latents,
                          preset=preset)
torchaudio.save('generated.wav', gen.squeeze(0).cpu(), 24000)
IPython.display.Audio('generated.wav')
mode 0
/home/manmay/anaconda3/envs/tts/lib/python3.8/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error. Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /opt/conda/conda-bld/pytorch_1678402379298/work/aten/src/ATen/native/SpectralOps.cpp:862.)
  return _VF.stft(input, n_fft, hop_length, win_length, window,  # type: ignore[attr-defined]
Generating autoregressive samples..
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [01:05<00:00, 4.11s/it]
Computing best candidates using CLVP
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:05<00:00, 3.14it/s]
Transforming autoregressive outputs into audio..
0%| | 0/50 [00:00<?, ?it/s]
Out[4]:
[audio player: generated.wav]
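load_voices (imported in the setup cell) can also take a list of voice names; in upstream Tortoise this blends their conditioning latents into a mixed voice. A possible follow-up cell, sketched under the assumption that the behaviour is the same here, with 'pat' and 'william' used purely as example names from the listing above:

# Sketch: blend two voices (assumes load_voices accepts a list, as in upstream Tortoise).
voice_samples, conditioning_latents = load_voices(['pat', 'william'])
gen = tts.tts_with_preset(text,
                          voice_samples=voice_samples,
                          conditioning_latents=conditioning_latents,
                          preset=preset)
torchaudio.save('generated_combined.wav', gen.squeeze(0).cpu(), 24000)
IPython.display.Audio('generated_combined.wav')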
In [ ]: