diff --git a/notebooks/VITS_d-vector_multilingual_exemple.ipynb b/notebooks/VITS_d-vector_multilingual_exemple.ipynb deleted file mode 100644 index 41713295..00000000 --- a/notebooks/VITS_d-vector_multilingual_exemple.ipynb +++ /dev/null @@ -1,223 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "source": [ - "import IPython\n", - "import torch\n", - "\n", - "from IPython.display import Audio\n", - "\n", - "from TTS.config import load_config\n", - "from TTS.tts.models import setup_model\n", - "from TTS.tts.utils.synthesis import synthesis\n", - "from TTS.utils.audio import AudioProcessor" - ], - "outputs": [], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 2, - "source": [ - "GENERAL_PATH = '/home/julian/workspace/train/VITS-pt-en-fr-lr/vits-August-29-2021_01+20PM-c68d7fa25/'\n", - "MODEL_PATH = GENERAL_PATH + 'best_model.pth.tar'\n", - "CONFIG_PATH = GENERAL_PATH + 'config.json'\n", - "TTS_LANGUAGES = GENERAL_PATH + \"language_ids.json\"\n", - "TTS_SPEAKERS = GENERAL_PATH + \"speakers.json\"\n", - "USE_CUDA = torch.cuda.is_available()" - ], - "outputs": [], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 3, - "source": [ - "# load the config\n", - "C = load_config(CONFIG_PATH)\n", - "\n", - "# load the audio processor\n", - "ap = AudioProcessor(**C.audio)\n", - "\n", - "speaker_embedding = None\n", - "\n", - "C.model_args['d_vector_file'] = TTS_SPEAKERS\n", - "\n", - "model = setup_model(C)\n", - "model.language_manager.set_language_ids_from_file(TTS_LANGUAGES)\n", - "cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))\n", - "model.load_state_dict(cp['model'])\n", - "\n", - "\n", - "model.eval()\n", - "\n", - "if USE_CUDA:\n", - " model = model.cuda()\n", - "\n", - "use_griffin_lim = True" - ], - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - " > Setting up Audio Processor...\n", - " | > sample_rate:16000\n", - " | > resample:False\n", - " | > num_mels:80\n", - " | > min_level_db:-100\n", - " | > frame_shift_ms:None\n", - " | > frame_length_ms:None\n", - " | > ref_level_db:20\n", - " | > fft_size:1024\n", - " | > power:1.5\n", - " | > preemphasis:0.0\n", - " | > griffin_lim_iters:60\n", - " | > signal_norm:False\n", - " | > symmetric_norm:True\n", - " | > mel_fmin:0\n", - " | > mel_fmax:None\n", - " | > spec_gain:1.0\n", - " | > stft_pad_mode:reflect\n", - " | > max_norm:4.0\n", - " | > clip_norm:True\n", - " | > do_trim_silence:True\n", - " | > trim_db:45\n", - " | > do_sound_norm:False\n", - " | > do_amp_to_db_linear:False\n", - " | > do_amp_to_db_mel:True\n", - " | > stats_path:None\n", - " | > base:2.718281828459045\n", - " | > hop_length:256\n", - " | > win_length:1024\n", - " > Using model: vits\n", - " > Speaker manager is loaded with 421 speakers: ED, MLS_10032, MLS_10058, MLS_10065, MLS_10082, MLS_10087, MLS_10177, MLS_103, MLS_10620, MLS_10827, MLS_10957, MLS_112, MLS_11247, MLS_1127, MLS_115, MLS_11743, MLS_11772, MLS_11795, MLS_11822, MLS_11875, MLS_11954, MLS_12205, MLS_123, MLS_1243, MLS_125, MLS_12501, MLS_12512, MLS_12541, MLS_12709, MLS_12713, MLS_12823, MLS_12899, MLS_12968, MLS_12981, MLS_13142, MLS_13177, MLS_1329, MLS_13611, MLS_13634, MLS_13655, MLS_13658, MLS_14, MLS_1474, MLS_1579, MLS_1590, MLS_1591, MLS_1624, MLS_1649, MLS_1664, MLS_1745, MLS_177, MLS_1770, MLS_1798, MLS_1805, MLS_1817, MLS_1840, MLS_1844, MLS_1869, MLS_1887, MLS_1977, MLS_1989, MLS_2033, MLS_204, MLS_2155, MLS_2284, MLS_2297, MLS_2316, MLS_2506, MLS_2544, MLS_2587, MLS_2596, MLS_2607, MLS_27, MLS_2771, MLS_2776, MLS_28, MLS_2825, MLS_2904, MLS_2926, MLS_2946, MLS_30, MLS_3060, MLS_3182, MLS_3190, MLS_3204, MLS_3267, MLS_3270, MLS_3319, MLS_3344, MLS_3370, MLS_3464, MLS_3503, MLS_3595, MLS_3698, MLS_4018, MLS_4174, MLS_4193, MLS_4336, MLS_4396, MLS_4512, MLS_4609, MLS_4650, MLS_4699, MLS_4724, MLS_4744, MLS_4937, MLS_5021, MLS_5077, MLS_52, MLS_5232, MLS_5295, MLS_5525, MLS_5526, MLS_5553, MLS_5595, MLS_5612, MLS_5764, MLS_577, MLS_579, MLS_5830, MLS_5840, MLS_5968, MLS_6070, MLS_6128, MLS_62, MLS_6249, MLS_6318, MLS_6348, MLS_6362, MLS_6381, MLS_66, MLS_6856, MLS_694, MLS_7032, MLS_707, MLS_7142, MLS_7150, MLS_7193, MLS_7200, MLS_7239, MLS_7377, MLS_7423, MLS_7438, MLS_7439, MLS_753, MLS_7591, MLS_7601, MLS_7614, MLS_7679, MLS_78, MLS_7848, MLS_8102, MLS_8128, MLS_8582, MLS_8778, MLS_9121, MLS_9242, MLS_928, MLS_94, MLS_9804, MLS_9854, VCTK_p225, VCTK_p226, VCTK_p227, VCTK_p228, VCTK_p229, VCTK_p230, VCTK_p231, VCTK_p232, VCTK_p233, VCTK_p234, VCTK_p236, VCTK_p237, VCTK_p238, VCTK_p239, VCTK_p240, VCTK_p241, VCTK_p243, VCTK_p244, VCTK_p245, VCTK_p246, VCTK_p247, VCTK_p248, VCTK_p249, VCTK_p250, VCTK_p251, VCTK_p252, VCTK_p253, VCTK_p254, VCTK_p255, VCTK_p256, VCTK_p257, VCTK_p258, VCTK_p259, VCTK_p260, VCTK_p261, VCTK_p262, VCTK_p263, VCTK_p264, VCTK_p265, VCTK_p266, VCTK_p267, VCTK_p268, VCTK_p269, VCTK_p270, VCTK_p271, VCTK_p272, VCTK_p273, VCTK_p274, VCTK_p275, VCTK_p276, VCTK_p277, VCTK_p278, VCTK_p279, VCTK_p280, VCTK_p281, VCTK_p282, VCTK_p283, VCTK_p284, VCTK_p285, VCTK_p286, VCTK_p287, VCTK_p288, VCTK_p292, VCTK_p293, VCTK_p294, VCTK_p295, VCTK_p297, VCTK_p298, VCTK_p299, VCTK_p300, VCTK_p301, VCTK_p302, VCTK_p303, VCTK_p304, VCTK_p305, VCTK_p306, VCTK_p307, VCTK_p308, VCTK_p310, VCTK_p311, VCTK_p312, VCTK_p313, VCTK_p314, VCTK_p316, VCTK_p317, VCTK_p318, VCTK_p323, VCTK_p326, VCTK_p329, VCTK_p330, VCTK_p333, VCTK_p334, VCTK_p335, VCTK_p336, VCTK_p339, VCTK_p340, VCTK_p341, VCTK_p343, VCTK_p345, VCTK_p347, VCTK_p351, VCTK_p360, VCTK_p361, VCTK_p362, VCTK_p363, VCTK_p364, VCTK_p374, VCTK_p376, bernard, elodie, ezwa, gilles_g_le_blanc, nadine_eckert_boulet, openSLR_afr0184, openSLR_afr1919, openSLR_afr2418, openSLR_afr6590, openSLR_afr7130, openSLR_afr7214, openSLR_afr8148, openSLR_afr8924, openSLR_afr8963, openSLR_jvf00264, openSLR_jvf00658, openSLR_jvf01392, openSLR_jvf02059, openSLR_jvf02884, openSLR_jvf03187, openSLR_jvf04679, openSLR_jvf04715, openSLR_jvf04982, openSLR_jvf05540, openSLR_jvf06207, openSLR_jvf06510, openSLR_jvf06941, openSLR_jvf07335, openSLR_jvf07638, openSLR_jvf08002, openSLR_jvf08305, openSLR_jvf08736, openSLR_jvf09039, openSLR_jvm00027, openSLR_jvm01519, openSLR_jvm01932, openSLR_jvm02326, openSLR_jvm03314, openSLR_jvm03424, openSLR_jvm03727, openSLR_jvm04175, openSLR_jvm04285, openSLR_jvm04588, openSLR_jvm05219, openSLR_jvm05522, openSLR_jvm05667, openSLR_jvm05970, openSLR_jvm06080, openSLR_jvm06383, openSLR_jvm07765, openSLR_jvm07875, openSLR_jvm08178, openSLR_jvm09724, openSLR_sso0145, openSLR_sso0493, openSLR_sso0806, openSLR_sso1266, openSLR_sso1367, openSLR_sso1801, openSLR_sso2388, openSLR_sso2910, openSLR_sso4592, openSLR_sso5945, openSLR_sso6499, openSLR_sso7801, openSLR_sso7821, openSLR_sso7876, openSLR_sso7912, openSLR_sso7934, openSLR_sso8596, openSLR_sso8777, openSLR_sso9892, openSLR_suf00297, openSLR_suf00600, openSLR_suf00691, openSLR_suf00994, openSLR_suf01056, openSLR_suf01359, openSLR_suf02092, openSLR_suf02395, openSLR_suf02953, openSLR_suf03712, openSLR_suf03887, openSLR_suf04190, openSLR_suf04646, openSLR_suf04748, openSLR_suf05051, openSLR_suf05507, openSLR_suf06543, openSLR_suf07302, openSLR_suf08338, openSLR_suf08703, openSLR_sum00060, openSLR_sum00454, openSLR_sum01038, openSLR_sum01552, openSLR_sum01596, openSLR_sum01855, openSLR_sum01899, openSLR_sum02716, openSLR_sum03391, openSLR_sum03650, openSLR_sum03694, openSLR_sum04208, openSLR_sum04511, openSLR_sum05186, openSLR_sum06003, openSLR_sum06047, openSLR_sum07842, openSLR_sum08659, openSLR_sum09243, openSLR_sum09637, openSLR_sum09757, openSLR_tsn0045, openSLR_tsn0378, openSLR_tsn0441, openSLR_tsn1483, openSLR_tsn1498, openSLR_tsn1932, openSLR_tsn2839, openSLR_tsn3342, openSLR_tsn3629, openSLR_tsn4506, openSLR_tsn4850, openSLR_tsn5628, openSLR_tsn6116, openSLR_tsn6206, openSLR_tsn6234, openSLR_tsn6459, openSLR_tsn7674, openSLR_tsn7693, openSLR_tsn7866, openSLR_tsn7896, openSLR_tsn8333, openSLR_tsn8512, openSLR_tsn8532, openSLR_tsn8914, openSLR_tsn9061, openSLR_tsn9365, openSLR_xho0050, openSLR_xho0120, openSLR_xho1547, openSLR_xho3616, openSLR_xho4280, openSLR_xho4291, openSLR_xho5378, openSLR_xho5680, openSLR_xho6975, openSLR_xho7590, openSLR_xho7599, openSLR_xho9446, zeckou\n" - ] - } - ], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 4, - "source": [ - "#set speaker\n", - "d_vector = model.speaker_manager.get_mean_d_vector('VCTK_p260')" - ], - "outputs": [], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 5, - "source": [ - "model.language_manager.language_id_mapping" - ], - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "{'af': 0,\n", - " 'en': 1,\n", - " 'fr-fr': 2,\n", - " 'jv': 3,\n", - " 'pt-br': 4,\n", - " 'st': 5,\n", - " 'su': 6,\n", - " 'tn': 7,\n", - " 'xh': 8}" - ] - }, - "metadata": {}, - "execution_count": 5 - } - ], - "metadata": { - "scrolled": true - } - }, - { - "cell_type": "code", - "execution_count": 6, - "source": [ - "# set scales \n", - "model.noise_scale = 0.0 # defines the noise variance applied to the random z vector at inference.\n", - "model.length_scale = 1.0 # scaler for the duration predictor. The larger it is, the slower the speech.\n", - "model.noise_scale_w = 0.0 # defines the noise variance applied to the duration predictor z vector at inference.\n", - "model.inference_noise_scale = 0.5 # defines the noise variance applied to the random z vector at inference.\n", - "model.inference_noise_scale_dp = 0.6 # defines the noise variance applied to the duration predictor z vector at inference." - ], - "outputs": [], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 7, - "source": [ - "text = \"Il m'a fallu beaucoup de temps pour développer une voix, et maintenant que je l'ai, je ne vais pas me taire.\"\n", - "language_id = 2\n", - "wav, alignment, _, _ = synthesis(\n", - " model,\n", - " text,\n", - " C,\n", - " \"cuda\" in str(next(model.parameters()).device),\n", - " ap,\n", - " speaker_id=None,\n", - " d_vector=d_vector,\n", - " style_wav=None,\n", - " language_id=language_id,\n", - " enable_eos_bos_chars=C.enable_eos_bos_chars,\n", - " use_griffin_lim=True,\n", - " do_trim_silence=False,\n", - " ).values()\n", - "IPython.display.display(Audio(wav, rate=ap.sample_rate))" - ], - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {} - } - ], - "metadata": {} - } - ], - "metadata": { - "interpreter": { - "hash": "b925b73899c1545aa2d9bbcf4e8e1df4138a367d2daefc2707570579325ca4c0" - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3.8.10 64-bit ('TTS': conda)" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file