From 2df9bfa78eb338d1b0972c25f4d236403b4e032d Mon Sep 17 00:00:00 2001
From: Enno Hermann <Eginhard@users.noreply.github.com>
Date: Sat, 9 Nov 2024 18:37:08 +0100
Subject: [PATCH] refactor: handle deprecation of torch.cuda.amp.autocast
 (#144)

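torch.cuda.amp.autocast(args...) and torch.cpu.amp.autocast(args...) will be
deprecated; PyTorch recommends torch.autocast("cuda", args...) or
torch.autocast("cpu", args...) instead.

Replace every use of torch.cuda.amp.autocast (including the
`from torch.cuda.amp.autocast_mode import autocast` imports) with
torch.autocast("cuda", ...). No behaviour change is intended.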

https://pytorch.org/docs/stable/amp.html
---
 TTS/encoder/models/lstm.py                   | 2 +-
 TTS/tts/layers/bark/load_model.py            | 9 ++-------
 TTS/tts/layers/tortoise/diffusion_decoder.py | 3 +--
 TTS/tts/models/delightful_tts.py             | 7 +++----
 TTS/tts/models/forward_tts.py                | 3 +--
 TTS/tts/models/glow_tts.py                   | 3 +--
 TTS/tts/models/tacotron.py                   | 3 +--
 TTS/tts/models/tacotron2.py                  | 3 +--
 TTS/tts/models/vits.py                       | 7 +++----
 9 files changed, 14 insertions(+), 26 deletions(-)

diff --git a/TTS/encoder/models/lstm.py b/TTS/encoder/models/lstm.py
index 51852b5b..4e0a7523 100644
--- a/TTS/encoder/models/lstm.py
+++ b/TTS/encoder/models/lstm.py
@@ -86,7 +86,7 @@ class LSTMSpeakerEncoder(BaseEncoder):
             - x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})`
         """
         with torch.no_grad():
-            with torch.cuda.amp.autocast(enabled=False):
+            with torch.autocast("cuda", enabled=False):
                 if self.use_torch_spec:
                     x.squeeze_(1)
                     x = self.torch_spec(x)
diff --git a/TTS/tts/layers/bark/load_model.py b/TTS/tts/layers/bark/load_model.py
index 72eca30a..6b7caab9 100644
--- a/TTS/tts/layers/bark/load_model.py
+++ b/TTS/tts/layers/bark/load_model.py
@@ -12,13 +12,8 @@ from TTS.tts.layers.bark.model import GPT, GPTConfig
 from TTS.tts.layers.bark.model_fine import FineGPT, FineGPTConfig
 from TTS.utils.generic_utils import is_pytorch_at_least_2_4
 
-if (
-    torch.cuda.is_available()
-    and hasattr(torch.cuda, "amp")
-    and hasattr(torch.cuda.amp, "autocast")
-    and torch.cuda.is_bf16_supported()
-):
-    autocast = functools.partial(torch.cuda.amp.autocast, dtype=torch.bfloat16)
+if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
+    autocast = functools.partial(torch.autocast, device_type="cuda", dtype=torch.bfloat16)
 else:
 
     @contextlib.contextmanager
diff --git a/TTS/tts/layers/tortoise/diffusion_decoder.py b/TTS/tts/layers/tortoise/diffusion_decoder.py
index 0d3cf769..f71eaf17 100644
--- a/TTS/tts/layers/tortoise/diffusion_decoder.py
+++ b/TTS/tts/layers/tortoise/diffusion_decoder.py
@@ -5,7 +5,6 @@ from abc import abstractmethod
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from torch import autocast
 
 from TTS.tts.layers.tortoise.arch_utils import AttentionBlock, normalization
 
@@ -385,7 +384,7 @@ class DiffusionTts(nn.Module):
                 unused_params.extend(list(lyr.parameters()))
             else:
                 # First and last blocks will have autocast disabled for improved precision.
-                with autocast(x.device.type, enabled=self.enable_fp16 and i != 0):
+                with torch.autocast(x.device.type, enabled=self.enable_fp16 and i != 0):
                     x = lyr(x, time_emb)
 
         x = x.float()
diff --git a/TTS/tts/models/delightful_tts.py b/TTS/tts/models/delightful_tts.py
index a938a3a4..c6f15a79 100644
--- a/TTS/tts/models/delightful_tts.py
+++ b/TTS/tts/models/delightful_tts.py
@@ -12,7 +12,6 @@ import torchaudio
 from coqpit import Coqpit
 from librosa.filters import mel as librosa_mel_fn
 from torch import nn
-from torch.cuda.amp.autocast_mode import autocast
 from torch.nn import functional as F
 from torch.utils.data import DataLoader
 from torch.utils.data.sampler import WeightedRandomSampler
@@ -952,7 +951,7 @@ class DelightfulTTS(BaseTTSE2E):
                 )
 
                 # compute loss
-                with autocast(enabled=False):  # use float32 for the criterion
+                with torch.autocast("cuda", enabled=False):  # use float32 for the criterion
                     loss_dict = criterion[optimizer_idx](
                         scores_disc_fake=scores_d_fake,
                         scores_disc_real=scores_d_real,
@@ -963,7 +962,7 @@ class DelightfulTTS(BaseTTSE2E):
         if optimizer_idx == 1:
             mel = batch["mel_input"]
             # compute melspec segment
-            with autocast(enabled=False):
+            with torch.autocast("cuda", enabled=False):
                 mel_slice = segment(
                     mel.float(), self.model_outputs_cache["slice_ids"], self.args.spec_segment_size, pad_short=True
                 )
@@ -991,7 +990,7 @@ class DelightfulTTS(BaseTTSE2E):
                 )
 
             # compute losses
-            with autocast(enabled=True):  # use float32 for the criterion
+            with torch.autocast("cuda", enabled=True):  # use float32 for the criterion
                 loss_dict = criterion[optimizer_idx](
                     mel_output=self.model_outputs_cache["acoustic_model_outputs"].transpose(1, 2),
                     mel_target=batch["mel_input"],
diff --git a/TTS/tts/models/forward_tts.py b/TTS/tts/models/forward_tts.py
index e7bc8637..d449e580 100644
--- a/TTS/tts/models/forward_tts.py
+++ b/TTS/tts/models/forward_tts.py
@@ -6,7 +6,6 @@ import torch
 from coqpit import Coqpit
 from monotonic_alignment_search import maximum_path
 from torch import nn
-from torch.cuda.amp.autocast_mode import autocast
 from trainer.io import load_fsspec
 
 from TTS.tts.layers.feed_forward.decoder import Decoder
@@ -744,7 +743,7 @@ class ForwardTTS(BaseTTS):
         if self.use_aligner:
             durations = outputs["o_alignment_dur"]
         # use float32 in AMP
-        with autocast(enabled=False):
+        with torch.autocast("cuda", enabled=False):
             # compute loss
             loss_dict = criterion(
                 decoder_output=outputs["model_outputs"],
diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py
index 5ea69865..5bf47131 100644
--- a/TTS/tts/models/glow_tts.py
+++ b/TTS/tts/models/glow_tts.py
@@ -6,7 +6,6 @@ import torch
 from coqpit import Coqpit
 from monotonic_alignment_search import maximum_path
 from torch import nn
-from torch.cuda.amp.autocast_mode import autocast
 from torch.nn import functional as F
 from trainer.io import load_fsspec
 
@@ -416,7 +415,7 @@ class GlowTTS(BaseTTS):
                 aux_input={"d_vectors": d_vectors, "speaker_ids": speaker_ids},
             )
 
-            with autocast(enabled=False):  # avoid mixed_precision in criterion
+            with torch.autocast("cuda", enabled=False):  # avoid mixed_precision in criterion
                 loss_dict = criterion(
                     outputs["z"].float(),
                     outputs["y_mean"].float(),
diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py
index 400a86d0..5d3efd20 100644
--- a/TTS/tts/models/tacotron.py
+++ b/TTS/tts/models/tacotron.py
@@ -4,7 +4,6 @@ from typing import Dict, List, Tuple, Union
 
 import torch
 from torch import nn
-from torch.cuda.amp.autocast_mode import autocast
 from trainer.trainer_utils import get_optimizer, get_scheduler
 
 from TTS.tts.layers.tacotron.capacitron_layers import CapacitronVAE
@@ -310,7 +309,7 @@ class Tacotron(BaseTacotron):
             alignment_lengths = mel_lengths // self.decoder.r
 
         # compute loss
-        with autocast(enabled=False):  # use float32 for the criterion
+        with torch.autocast("cuda", enabled=False):  # use float32 for the criterion
             loss_dict = criterion(
                 outputs["model_outputs"].float(),
                 outputs["decoder_outputs"].float(),
diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py
index 4b1317f4..2716a397 100644
--- a/TTS/tts/models/tacotron2.py
+++ b/TTS/tts/models/tacotron2.py
@@ -4,7 +4,6 @@ from typing import Dict, List, Union
 
 import torch
 from torch import nn
-from torch.cuda.amp.autocast_mode import autocast
 from trainer.trainer_utils import get_optimizer, get_scheduler
 
 from TTS.tts.layers.tacotron.capacitron_layers import CapacitronVAE
@@ -338,7 +337,7 @@ class Tacotron2(BaseTacotron):
             alignment_lengths = mel_lengths // self.decoder.r
 
         # compute loss
-        with autocast(enabled=False):  # use float32 for the criterion
+        with torch.autocast("cuda", enabled=False):  # use float32 for the criterion
             loss_dict = criterion(
                 outputs["model_outputs"].float(),
                 outputs["decoder_outputs"].float(),
diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py
index af803a0f..432b29f5 100644
--- a/TTS/tts/models/vits.py
+++ b/TTS/tts/models/vits.py
@@ -13,7 +13,6 @@ from coqpit import Coqpit
 from librosa.filters import mel as librosa_mel_fn
 from monotonic_alignment_search import maximum_path
 from torch import nn
-from torch.cuda.amp.autocast_mode import autocast
 from torch.nn import functional as F
 from torch.utils.data import DataLoader
 from torch.utils.data.sampler import WeightedRandomSampler
@@ -1278,7 +1277,7 @@ class Vits(BaseTTS):
             )
 
             # compute loss
-            with autocast(enabled=False):  # use float32 for the criterion
+            with torch.autocast("cuda", enabled=False):  # use float32 for the criterion
                 loss_dict = criterion[optimizer_idx](
                     scores_disc_real,
                     scores_disc_fake,
@@ -1289,7 +1288,7 @@ class Vits(BaseTTS):
             mel = batch["mel"]
 
             # compute melspec segment
-            with autocast(enabled=False):
+            with torch.autocast("cuda", enabled=False):
                 if self.args.encoder_sample_rate:
                     spec_segment_size = self.spec_segment_size * int(self.interpolate_factor)
                 else:
@@ -1316,7 +1315,7 @@ class Vits(BaseTTS):
             )
 
             # compute losses
-            with autocast(enabled=False):  # use float32 for the criterion
+            with torch.autocast("cuda", enabled=False):  # use float32 for the criterion
                 loss_dict = criterion[optimizer_idx](
                     mel_slice_hat=mel_slice.float(),
                     mel_slice=mel_slice_hat.float(),