Update amp custom_fwd, custom_bwd usage for torch 2.4.0 compatibility
mirceamironenco committed Aug 25, 2024
1 parent 3583315 commit 1a8fc1b
Showing 18 changed files with 60 additions and 67 deletions.
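
Background: starting with torch 2.4.0, the torch.cuda.amp.custom_fwd/custom_bwd decorators are deprecated in favor of torch.amp.custom_fwd/custom_bwd, which take an explicit device_type argument. The autocast_custom_fwd/autocast_custom_bwd names imported from fla.utils throughout this commit are compatibility aliases over that API; their actual definitions live in fla/utils.py, which is not part of this diff. A minimal sketch of what such aliases could look like, assuming a packaging-based version check (hypothetical, not the repository's code):

import functools

import torch
from packaging import version

# Hypothetical sketch only; the real aliases are defined in fla/utils.py.
if version.parse(torch.__version__) >= version.parse("2.4"):
    # torch >= 2.4: the decorators live in torch.amp and require a device_type.
    autocast_custom_fwd = functools.partial(torch.amp.custom_fwd, device_type="cuda")
    autocast_custom_bwd = functools.partial(torch.amp.custom_bwd, device_type="cuda")
else:
    # Older torch: fall back to the CUDA-specific decorators.
    autocast_custom_fwd = torch.cuda.amp.custom_fwd
    autocast_custom_bwd = torch.cuda.amp.custom_bwd

With aliases like these, every call site in the diff below only needs to swap the decorator name.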
7 changes: 3 additions & 4 deletions fla/ops/abc/recurrent_fuse.py
@@ -7,9 +7,8 @@
 import torch
 import triton
 import triton.language as tl
-from torch.cuda.amp import custom_bwd, custom_fwd
 
-from fla.utils import contiguous
+from fla.utils import autocast_custom_bwd, autocast_custom_fwd, contiguous
 
 
 @triton.jit
@@ -284,7 +283,7 @@ class FusedRecurrentGatedABCFunction(torch.autograd.Function):
 
     @staticmethod
     @contiguous
-    @custom_fwd
+    @autocast_custom_fwd
     def forward(
         ctx,
         q: torch.Tensor,
@@ -374,7 +373,7 @@ def forward(
 
     @staticmethod
     @contiguous
-    @custom_bwd
+    @autocast_custom_bwd
     def backward(ctx, do, dht=None):
         q, k, v, s, g, qv, hk0, hv0, ok = ctx.saved_tensors
         B, H, T, K, V, M = *q.shape, v.shape[-1], s.shape[-1]
7 changes: 3 additions & 4 deletions fla/ops/based/chunk_fuse.py
@@ -5,9 +5,8 @@
 import torch
 import triton
 import triton.language as tl
-from torch.cuda.amp import custom_bwd, custom_fwd
 
-from fla.utils import contiguous
+from fla.utils import autocast_custom_bwd, autocast_custom_fwd, contiguous
 
 # on-the-fly computation without materializing hidden statets into HBMs
 
@@ -305,7 +304,7 @@ class FusedChunkBasedFunction(torch.autograd.Function):
 
     @staticmethod
     @contiguous
-    @custom_fwd
+    @autocast_custom_fwd
     def forward(ctx, q, k, v, scale=1):
         B, H, T, K, V = *k.shape, v.shape[-1]
 
@@ -338,7 +337,7 @@ def forward(ctx, q, k, v, scale=1):
 
     @staticmethod
     @contiguous
-    @custom_bwd
+    @autocast_custom_bwd
     def backward(ctx, do, dz):
         q, k, v = ctx.saved_tensors
         B, H, T, K, V = *k.shape, v.shape[-1]
7 changes: 3 additions & 4 deletions fla/ops/based/parallel.py
@@ -6,9 +6,8 @@
 import torch
 import triton
 import triton.language as tl
-from torch.cuda.amp import custom_bwd, custom_fwd
 
-from fla.utils import contiguous
+from fla.utils import autocast_custom_bwd, autocast_custom_fwd, contiguous
 
 # Based: An Educational and Effective Sequence Mixer
 # https://hazyresearch.stanford.edu/blog/2023-12-11-zoology2-based
@@ -314,7 +313,7 @@ class ParallelBasedFunction(torch.autograd.Function):
 
     @staticmethod
     @contiguous
-    @custom_fwd
+    @autocast_custom_fwd
     def forward(ctx, q, k, v, scale):
         BTL, BTS = 128, 32
         assert BTL % BTS == 0
@@ -349,7 +348,7 @@ def forward(ctx, q, k, v, scale):
 
     @staticmethod
     @contiguous
-    @custom_bwd
+    @autocast_custom_bwd
     def backward(ctx, do, dz):
         q, k, v = ctx.saved_tensors
         scale = ctx.scale
6 changes: 3 additions & 3 deletions fla/ops/delta_rule/chunk.py
@@ -4,11 +4,11 @@
 import torch
 import triton
 import triton.language as tl
-from torch.cuda.amp import custom_bwd, custom_fwd
 
 from fla.ops.delta_rule.wy_fast import (bwd_prepare_wy_repr,
                                         fwd_prepare_wy_repr, fwd_recompute_w_u)
 from fla.ops.utils import contiguous
+from fla.utils import autocast_custom_bwd, autocast_custom_fwd
 
 
 @triton.autotune(
@@ -491,7 +491,7 @@ class ChunkDeltaRuleFunction(torch.autograd.Function):
 
     @staticmethod
     @contiguous
-    @custom_fwd
+    @autocast_custom_fwd
     def forward(ctx, q, k, v, beta, BT, initial_state, output_final_state, checkpoint_level=1):
         # obtain WY representation. u is actually the new v.
         w, u, A = fwd_prepare_wy_repr(k, v, beta, BT)
@@ -512,7 +512,7 @@ def forward(ctx, q, k, v, beta, BT, initial_state, output_final_state, checkpoin
 
     @staticmethod
     @contiguous
-    @custom_bwd
+    @autocast_custom_bwd
     def backward(ctx, do, d_ht=None):
         q, k, v, beta, A, h, v_new, initial_state = ctx.saved_tensors
         BT = ctx.BT
7 changes: 3 additions & 4 deletions fla/ops/delta_rule/chunk_fuse.py
@@ -5,10 +5,9 @@
 import torch
 import triton
 import triton.language as tl
-from torch.cuda.amp import custom_bwd, custom_fwd
 
 from fla.ops.delta_rule.utils import bwd_prepare_wy_repr, fwd_prepare_wy_repr
-from fla.utils import contiguous
+from fla.utils import autocast_custom_bwd, autocast_custom_fwd, contiguous
 
 
 # on-the-fly computation without materializing hidden statets into HBMs
@@ -327,7 +326,7 @@ class FusedChunkDeltaRuleFunction(torch.autograd.Function):
 
     @staticmethod
     @contiguous
-    @custom_fwd
+    @autocast_custom_fwd
     def forward(ctx, q, k, v, beta, BT, initial_state, output_final_state, checkpoint_level=0):
         # lvl=1 will recompute ``fwd_prepare_wy_repr`` for saving memory.
         assert checkpoint_level in [0, 1]
@@ -345,7 +344,7 @@ def forward(ctx, q, k, v, beta, BT, initial_state, output_final_state, checkpoin
 
     @staticmethod
     @contiguous
-    @custom_bwd
+    @autocast_custom_bwd
     def backward(ctx, do, d_final_state=None):
         q, k_origin, v, v_new, v_new2, d, beta, initial_state = ctx.saved_tensors
         chunk_size = ctx.chunk_size
7 changes: 3 additions & 4 deletions fla/ops/delta_rule/utils.py
@@ -4,10 +4,9 @@
 import triton
 import triton.language as tl
 from einops import rearrange
-from torch.cuda.amp import custom_bwd, custom_fwd
 
 from fla.ops.delta_rule.wy_fast import prepare_wy_repr as prepare_wy_repr2
-from fla.utils import contiguous
+from fla.utils import autocast_custom_bwd, autocast_custom_fwd, contiguous
 
 
 # Inspired by "THE WY REPRESENTATION FOR PRODUCTS OF HOUSEHOLDER MATRICES" https://epubs.siam.org/doi/pdf/10.1137/0908009
@@ -191,7 +190,7 @@ def bwd_prepare_wy_repr(k, v, beta, o_cumdecay, v_new, do, do2, chunk_size):
 
 class WYRepresentationPrepration(torch.autograd.Function):
     @contiguous
-    @custom_fwd
+    @autocast_custom_fwd
     @staticmethod
     def forward(ctx, k, v, beta, chunk_size):
         o_cumdecay, v_new = fwd_prepare_wy_repr(k, v, beta, chunk_size)
@@ -200,7 +199,7 @@ def forward(ctx, k, v, beta, chunk_size):
         return o_cumdecay, v_new
 
     @contiguous
-    @custom_bwd
+    @autocast_custom_bwd
     @staticmethod
     def backward(ctx, do, do2):
         k, v, beta, o_cumdecay, v_new = ctx.saved_tensors
7 changes: 3 additions & 4 deletions fla/ops/delta_rule/wy_fast.py
@@ -4,9 +4,8 @@
 import triton
 import triton.language as tl
 from einops import rearrange
-from torch.cuda.amp import custom_bwd, custom_fwd
 
-from fla.utils import contiguous
+from fla.utils import autocast_custom_bwd, autocast_custom_fwd, contiguous
 
 
 # Inspired by "THE WY REPRESENTATION FOR PRODUCTS OF HOUSEHOLDER MATRICES" https://epubs.siam.org/doi/pdf/10.1137/0908009
@@ -288,7 +287,7 @@ class WYRepresentationPrepration(torch.autograd.Function):
 
     @staticmethod
     @contiguous
-    @custom_fwd
+    @autocast_custom_fwd
     def forward(ctx, k, v, beta, chunk_size=64):
         ctx.BT = chunk_size
         w, u, A = fwd_prepare_wy_repr(k, v, beta, ctx.BT)
@@ -297,7 +296,7 @@ def forward(ctx, k, v, beta, chunk_size=64):
 
     @staticmethod
     @contiguous
-    @custom_bwd
+    @autocast_custom_bwd
     def backward(ctx, dw, du):
         k, v, beta, A = ctx.saved_tensors
         BT = ctx.BT
7 changes: 3 additions & 4 deletions fla/ops/gla/chunk_fuse.py
@@ -12,11 +12,10 @@
 import triton.language as tl
 from einops import rearrange
 from packaging import version
-from torch.cuda.amp import custom_bwd, custom_fwd
 
 from fla.ops.gla.chunk_util import (bwd_decay_global_cumsum, fwd_decay_cumsum,
                                     prepare_qg_kg)
-from fla.utils import contiguous
+from fla.utils import autocast_custom_bwd, autocast_custom_fwd, contiguous
 
 
 @triton.jit
@@ -304,7 +303,7 @@ class FusedChunkGLAFunction(torch.autograd.Function):
 
     @staticmethod
     @contiguous
-    @custom_fwd
+    @autocast_custom_fwd
     def forward(ctx, q, k, v, g, scale, initial_state, output_final_state):
         ctx.g_dtype = g.dtype
         g_original = g
@@ -396,7 +395,7 @@ def forward(ctx, q, k, v, g, scale, initial_state, output_final_state):
 
     @staticmethod
     @contiguous
-    @custom_bwd
+    @autocast_custom_bwd
     def backward(ctx, do, dht=None):
         q, k, v, g_origin, A, initial_state = ctx.saved_tensors
         B, H, T, K, V = *k.shape, v.shape[-1]
7 changes: 3 additions & 4 deletions fla/ops/gla/recurrent_fuse.py
@@ -7,9 +7,8 @@
 import torch
 import triton
 import triton.language as tl
-from torch.cuda.amp import custom_bwd, custom_fwd
 
-from fla.utils import contiguous
+from fla.utils import autocast_custom_bwd, autocast_custom_fwd, contiguous
 
 # on-the-fly computation without materializing hidden statets into HBMs
 
@@ -223,7 +222,7 @@ class FusedRecurrentGLAFunction(torch.autograd.Function):
 
     @staticmethod
     @contiguous
-    @custom_fwd
+    @autocast_custom_fwd
     def forward(ctx, q, k, v, gk, gv, scale=None, initial_state=None, output_final_state=False, reverse=False):
         B, H, T, K, V = *q.shape, v.shape[-1]
         # default scale
@@ -270,7 +269,7 @@ def forward(ctx, q, k, v, gk, gv, scale=None, initial_state=None, output_final_s
 
     @staticmethod
     @contiguous
-    @custom_bwd
+    @autocast_custom_bwd
     def backward(ctx, do, dht=None):
         q, k, v, gk, gv, initial_state, o = ctx.saved_tensors
         batch_size, n_heads, seq_len, K = q.shape
7 changes: 3 additions & 4 deletions fla/ops/linear_attn/chunk.py
@@ -6,10 +6,9 @@
 import torch
 import triton
 import triton.language as tl
-from torch.cuda.amp import custom_bwd, custom_fwd
 
 from fla.ops.linear_attn.utils import normalize_output
-from fla.utils import contiguous
+from fla.utils import autocast_custom_bwd, autocast_custom_fwd, contiguous
 
 
 @triton.jit
@@ -238,7 +237,7 @@ class ChunkLinearAttentionFunction(torch.autograd.Function):
 
     @staticmethod
     @contiguous
-    @custom_fwd
+    @autocast_custom_fwd
     def forward(ctx, q, k, v, scale, initial_state, output_final_state):
         B, H, T, K, V = *q.shape, v.shape[-1]
         BT = 64
@@ -282,7 +281,7 @@ def forward(ctx, q, k, v, scale, initial_state, output_final_state):
 
     @staticmethod
     @contiguous
-    @custom_bwd
+    @autocast_custom_bwd
     def backward(ctx, do, dht=None):
         q, k, v, h = ctx.saved_tensors
 
7 changes: 3 additions & 4 deletions fla/ops/linear_attn/chunk_fuse.py
@@ -7,10 +7,9 @@
 import triton
 import triton.language as tl
 from packaging import version
-from torch.cuda.amp import custom_bwd, custom_fwd
 
 from fla.ops.linear_attn.utils import normalize_output
-from fla.utils import contiguous
+from fla.utils import autocast_custom_bwd, autocast_custom_fwd, contiguous
 
 
 @triton.jit
@@ -208,7 +207,7 @@ class FusedChunkLinearAttentionFunction(torch.autograd.Function):
 
     @staticmethod
     @contiguous
-    @custom_fwd
+    @autocast_custom_fwd
     def forward(ctx, q, k, v, scale, initial_state, output_final_state):
         B, H, T, K, V = *k.shape, v.shape[-1]
         BT = 64
@@ -255,7 +254,7 @@ def forward(ctx, q, k, v, scale, initial_state, output_final_state):
 
     @staticmethod
     @contiguous
-    @custom_bwd
+    @autocast_custom_bwd
     def backward(ctx, do, dht=None):
         q, k, v, initial_state = ctx.saved_tensors
         B, H, T, K, V = *k.shape, v.shape[-1]
7 changes: 3 additions & 4 deletions fla/ops/rebased/parallel.py
@@ -4,9 +4,8 @@
 import torch
 import triton
 import triton.language as tl
-from torch.cuda.amp import custom_bwd, custom_fwd
 
-from fla.utils import contiguous
+from fla.utils import autocast_custom_bwd, autocast_custom_fwd, contiguous
 
 # Rebased: Linear Transformers with Learnable Kernel Functions are Better In-Context Models
 # https://github.com/corl-team/rebased/blob/main/flash_linear_attention/fla/ops/triton/rebased_fast/parallel.py
@@ -339,7 +338,7 @@ class ParallelBasedFunction(torch.autograd.Function):
 
     @staticmethod
     @contiguous
-    @custom_fwd
+    @autocast_custom_fwd
     def forward(ctx, q, k, v, scale):
         BTL, BTS = 128, 32
         assert BTL % BTS == 0
@@ -374,7 +373,7 @@ def forward(ctx, q, k, v, scale):
 
     @staticmethod
     @contiguous
-    @custom_bwd
+    @autocast_custom_bwd
     def backward(ctx, do, dz):
         q, k, v = ctx.saved_tensors
         scale = ctx.scale
7 changes: 3 additions & 4 deletions fla/ops/retention/chunk.py
@@ -6,9 +6,8 @@
 import torch
 import triton
 import triton.language as tl
-from torch.cuda.amp import custom_bwd, custom_fwd
 
-from fla.utils import contiguous
+from fla.utils import autocast_custom_bwd, autocast_custom_fwd, contiguous
 
 
 @triton.autotune(
@@ -375,7 +374,7 @@ class ChunkRetentionFunction(torch.autograd.Function):
 
     @staticmethod
     @contiguous
-    @custom_fwd
+    @autocast_custom_fwd
     def forward(ctx, q, k, v, initial_state, output_final_state, scale, checkpoint_level):
         BT = 64
         h, final_state = chunk_fwd_h_fn(k, v, BT, initial_state, output_final_state)
@@ -388,7 +387,7 @@ def forward(ctx, q, k, v, initial_state, output_final_state, scale, checkpoint_l
 
     @staticmethod
    @contiguous
-    @custom_bwd
+    @autocast_custom_bwd
     def backward(ctx, do, d_ht=None):
         BT, scale = ctx.BT, ctx.scale
         q, k, v, h, initial_state = ctx.saved_tensors
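
For reference, the torch >= 2.4 decorators that the new fla.utils aliases wrap are applied to an arbitrary torch.autograd.Function like this (toy example under that assumption, not code from this repository):

import torch

class Scale(torch.autograd.Function):
    # Toy function illustrating the decorator pattern used throughout this commit;
    # custom_bwd makes backward run under the same autocast state as forward.
    @staticmethod
    @torch.amp.custom_fwd(device_type="cuda")
    def forward(ctx, x, alpha):
        ctx.alpha = alpha
        return x * alpha

    @staticmethod
    @torch.amp.custom_bwd(device_type="cuda")
    def backward(ctx, grad_output):
        return grad_output * ctx.alpha, None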