
Commit 36e6bc7

[mxfp8 moe training] refactor all var names with suffix _mx to _fp8 for clarity
Parent: 0dd17b5

File tree: 1 file changed (+56 −56 lines)

torchao/prototype/moe_training/scaled_grouped_mm.py

Lines changed: 56 additions & 56 deletions
@@ -291,13 +291,13 @@ def forward(
         ctx.out_dtype = out_dtype
         ctx.emulated = emulated

-        # A_mx shape: (M, K)
+        # A_fp8 shape: (M, K)
         # A_scale shape: (M, K//block_size)
-        A_scale, A_mx = to_mx(A, elem_dtype=torch.float8_e4m3fn, block_size=block_size)
+        A_scale, A_fp8 = to_mx(A, elem_dtype=torch.float8_e4m3fn, block_size=block_size)

-        # B_mx shape: (E, N, K)
+        # B_fp8 shape: (E, N, K)
         # B_scale shape: (E, N, K//block_size)
-        B_scales, B_mx = to_mx(
+        B_scales, B_fp8 = to_mx(
             B_t.transpose(-2, -1),
             elem_dtype=torch.float8_e4m3fn,
             block_size=block_size,
@@ -311,9 +311,9 @@ def forward(
             else fbgemm_mxfp8_grouped_mm_2d_3d
         )
         out = mxfp8_2d_3d_grouped_mm(
-            A_mx,
+            A_fp8,
             A_scale,
-            B_mx,
+            B_fp8,
             B_scales,
             offs=offs,
             block_size=block_size,
@@ -328,15 +328,15 @@ def backward(ctx, grad_out: torch.Tensor):
         out_dtype = ctx.out_dtype
         emulated = ctx.emulated

-        # grad_out_mx shape: (M, N)
+        # grad_out_fp8 shape: (M, N)
         # grad_out_scale shape: (M, N//block_size)
-        grad_out_scale, grad_out_mx = to_mx(
+        grad_out_scale, grad_out_fp8 = to_mx(
             grad_out, elem_dtype=torch.float8_e4m3fn, block_size=block_size
         )

-        # B_mx shape: (E, K, N)
+        # B_fp8 shape: (E, K, N)
         # B_scale shape: (E, K, N//block_size)
-        B_scales, B_mx = to_mx(
+        B_scales, B_fp8 = to_mx(
             # TODO: can we support non-contiguous input tensor in to_mx to eliminate this inefficiency?
             B_t.contiguous(),
             elem_dtype=torch.float8_e4m3fn,
@@ -350,43 +350,43 @@ def backward(ctx, grad_out: torch.Tensor):
             else fbgemm_mxfp8_grouped_mm_2d_3d
         )
         grad_A = mxfp8_2d_3d_grouped_mm(
-            grad_out_mx,
+            grad_out_fp8,
             grad_out_scale,
-            B_mx,
+            B_fp8,
             B_scales,
             offs=offs,
             out_dtype=out_dtype,
         )

-        # grad_out_t_mx shape: (N, M)
+        # grad_out_t_fp8 shape: (N, M)
         # grad_out_t_scales shape: (N, M//block_size)
-        grad_out_t_scales, grad_out_t_mx = to_mx(
+        grad_out_t_scales, grad_out_t_fp8 = to_mx(
             # TODO: can we support non-contiguous input tensor in to_mx to eliminate this inefficiency?
             grad_out.transpose(-2, -1).contiguous(),
             elem_dtype=torch.float8_e4m3fn,
             block_size=block_size,
         )

         # Transpose A so we can scale along the M dimension, then un-transpose.
-        # A_t_mx shape: (K, M)
+        # A_t_fp8 shape: (K, M)
         # A_t_scales shape: (K, M//block_size)
-        A_t_scales, A_t_mx = to_mx(
+        A_t_scales, A_t_fp8 = to_mx(
             A.transpose(-2, -1).contiguous(),
             elem_dtype=torch.float8_e4m3fn,
             block_size=block_size,
         )

-        # A_mx shape = (M, K)
-        A_mx = A_t_mx.transpose(-2, -1)
+        # A_fp8 shape = (M, K)
+        A_fp8 = A_t_fp8.transpose(-2, -1)

         # A_scales shape = (M//block_size, K)
         A_scales = A_t_scales.transpose(-2, -1)

         # grad_B_t = scaled grouped mm of (N,M) @ (M,K) = (E,N,K)
         grad_B = _emulated_mxfp8_scaled_grouped_mm_2d_2d(
-            grad_out_t_mx,
+            grad_out_t_fp8,
             grad_out_t_scales,
-            A_mx,
+            A_fp8,
             A_scales,
             offs=offs,
         )
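
Note: in both the forward and backward hunks above, to_mx returns a (scales, fp8_data) pair for a given block size, with one scale per block of block_size values along the last dimension. Below is a minimal sketch of that call pattern with toy shapes; the import path is an assumption, since the import statement sits outside this diff.

# Editor's sketch (not part of the commit): the to_mx call pattern used above.
# Assumption: to_mx is importable from torchao.prototype.mx_formats.mx_tensor.
import torch
from torchao.prototype.mx_formats.mx_tensor import to_mx

block_size = 32
A = torch.randn(128, 256, dtype=torch.bfloat16)

# Scales come first, then the fp8 payload, matching "A_scale, A_fp8 = to_mx(...)" above.
# Shapes mirror the diff comments: A_fp8 is (M, K), A_scale is (M, K // block_size).
A_scale, A_fp8 = to_mx(A, elem_dtype=torch.float8_e4m3fn, block_size=block_size)
assert A_fp8.shape == (128, 256)
assert A_scale.shape == (128, 256 // block_size)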
@@ -398,64 +398,64 @@ def backward(ctx, grad_out: torch.Tensor):


 def _emulated_mxfp8_scaled_grouped_mm_2d_3d(
-    A_mx: torch.Tensor,
+    A_fp8: torch.Tensor,
     A_scale: torch.Tensor,
-    B_mx: torch.Tensor,
+    B_fp8: torch.Tensor,
     B_scale: torch.Tensor,
     offs: Optional[torch.Tensor] = None,
     out_dtype: Optional[torch.dtype] = torch.bfloat16,
     block_size: int = 32,
 ) -> torch.Tensor:
-    assert A_mx.ndim == 2, f"A must be 2D, got {A_mx.ndim}"
-    assert B_mx.ndim == 3, f"B must be 3D, got {B_mx.ndim}"
-    assert A_scale.shape[0] == A_mx.shape[0], (
-        f"A_scale must have same M dim as A_mx, got A={A_mx.shape} and A_scale={A_scale.shape}"
+    assert A_fp8.ndim == 2, f"A must be 2D, got {A_fp8.ndim}"
+    assert B_fp8.ndim == 3, f"B must be 3D, got {B_fp8.ndim}"
+    assert A_scale.shape[0] == A_fp8.shape[0], (
+        f"A_scale must have same M dim as A_fp8, got A={A_fp8.shape} and A_scale={A_scale.shape}"
     )
-    assert A_scale.shape[1] == A_mx.shape[1] // block_size, (
-        f"A_scale dim1 should be size K//block_size, got A={A_mx.shape} and A_scale={A_scale.shape}"
+    assert A_scale.shape[1] == A_fp8.shape[1] // block_size, (
+        f"A_scale dim1 should be size K//block_size, got A={A_fp8.shape} and A_scale={A_scale.shape}"
     )
-    assert B_scale.shape[0] == B_mx.shape[0], (
-        f"B_scale must have same E dim as B_mx, got B={B_mx.shape} and B_scale={B_scale.shape}"
+    assert B_scale.shape[0] == B_fp8.shape[0], (
+        f"B_scale must have same E dim as B_fp8, got B={B_fp8.shape} and B_scale={B_scale.shape}"
     )
-    assert B_scale.shape[1] == B_mx.shape[1], (
-        f"B_scale must have same N dim as B_mx, got B={B_mx.shape} and B_scale={B_scale.shape}"
+    assert B_scale.shape[1] == B_fp8.shape[1], (
+        f"B_scale must have same N dim as B_fp8, got B={B_fp8.shape} and B_scale={B_scale.shape}"
     )
-    assert B_scale.shape[2] == B_mx.shape[2] // block_size, (
-        f"B_scale dim2 should be size K//block_size, got B={B_mx.shape} and B_scale={B_scale.shape}"
+    assert B_scale.shape[2] == B_fp8.shape[2] // block_size, (
+        f"B_scale dim2 should be size K//block_size, got B={B_fp8.shape} and B_scale={B_scale.shape}"
     )

     # Dequantize input
-    # A_mx shape: (M, K)
+    # A_fp8 shape: (M, K)
     # A_scale shape: (M, K//block_size)
-    A_orig_shape = A_mx.shape
+    A_orig_shape = A_fp8.shape

     # Reshape to be able to do per-scaling group multiplication
-    # A_mx shape: (M, K//block_size, block_size)
+    # A_fp8 shape: (M, K//block_size, block_size)
     # A_scale shape: (M, K//block_size, 1)
-    A_mx = A_mx.reshape(*A_mx.shape[:-1], A_mx.shape[-1] // block_size, block_size)
+    A_fp8 = A_fp8.reshape(*A_fp8.shape[:-1], A_fp8.shape[-1] // block_size, block_size)
     A_scale = A_scale.unsqueeze(-1)

     # Rescale and cast to bfloat16
-    A = A_mx.to(torch.bfloat16) * A_scale.to(torch.bfloat16)
+    A = A_fp8.to(torch.bfloat16) * A_scale.to(torch.bfloat16)

     # Reshape back to original shape
     # A shape: (M, K)
     A = A.reshape(A_orig_shape)

     # Dequantize weights
     # Tranpose to get block_size on rightmost dim
-    # B_mx shape: (E, N, K)
+    # B_fp8 shape: (E, N, K)
     # B_scale shape: (E, N, K//block_size)
-    E, N, K = B_mx.shape
+    E, N, K = B_fp8.shape

     # Reshape to be able to do per-scaling group multiplication
-    # B_mx shape: (E, N, K//block_size, block_size)
+    # B_fp8 shape: (E, N, K//block_size, block_size)
     # B_scale shape: (E, N, K//block_size, 1)
-    B_mx = B_mx.reshape(*B_mx.shape[:-1], B_mx.shape[-1] // block_size, block_size)
+    B_fp8 = B_fp8.reshape(*B_fp8.shape[:-1], B_fp8.shape[-1] // block_size, block_size)
     B_scale = B_scale.unsqueeze(-1)

     # Rescale and cast to bfloat16
-    B = B_mx.to(torch.bfloat16) * B_scale.to(torch.bfloat16)
+    B = B_fp8.to(torch.bfloat16) * B_scale.to(torch.bfloat16)

     # Reshape back to original shape
     # B shape: (E, K, N)
@@ -467,27 +467,27 @@ def _emulated_mxfp8_scaled_grouped_mm_2d_3d(


 def _emulated_mxfp8_scaled_grouped_mm_2d_2d(
-    A_mx: torch.Tensor,  # (M, K)
+    A_fp8: torch.Tensor,  # (M, K)
     A_scale: torch.Tensor,  # (M, K//block_size)
-    B_mx: torch.Tensor,  # (K, N)
+    B_fp8: torch.Tensor,  # (K, N)
     B_scale: torch.Tensor,  # (K//block_size, N)
     offs: torch.Tensor,
     out_dtype: Optional[torch.dtype] = torch.bfloat16,
     block_size: int = 32,
 ) -> torch.Tensor:
-    assert A_mx.ndim == 2, "A must be 2D"
-    assert B_mx.ndim == 2, "B must be 2D"
+    assert A_fp8.ndim == 2, "A must be 2D"
+    assert B_fp8.ndim == 2, "B must be 2D"
     A = torch.zeros(
-        A_mx.shape,
+        A_fp8.shape,
         dtype=torch.bfloat16,
-        device=A_mx.device,
-        requires_grad=A_mx.requires_grad,
+        device=A_fp8.device,
+        requires_grad=A_fp8.requires_grad,
     )
     B = torch.zeros(
-        B_mx.shape,
+        B_fp8.shape,
         dtype=torch.bfloat16,
-        device=B_mx.device,
-        requires_grad=B_mx.requires_grad,
+        device=B_fp8.device,
+        requires_grad=B_fp8.requires_grad,
     )

     # Dequantize input per each scaling group
@@ -503,7 +503,7 @@ def _emulated_mxfp8_scaled_grouped_mm_2d_2d(
         # -- Dequantize A tensor
         # A_group shape: (M, group_size)
         # A_scale shape: (M, group_size//block_size)
-        A_group = A_mx[:, group_start_idx:group_end_idx]
+        A_group = A_fp8[:, group_start_idx:group_end_idx]
         A_group_shape = A_group.shape

         # Get scales for this group.
@@ -528,7 +528,7 @@ def _emulated_mxfp8_scaled_grouped_mm_2d_2d(

         # -- Dequantize B tensor
         # B_group shape is (group_size, N)
-        B_group = B_mx[group_start_idx:group_end_idx, :]
+        B_group = B_fp8[group_start_idx:group_end_idx, :]
         B_group_shape = B_group.shape

         # Scales shape is (group_size//block_size, N)
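
The emulated kernels in this diff dequantize by viewing the fp8 payload as blocks of block_size values along the reduction dimension and multiplying each block by its scale in bfloat16. Below is a self-contained sketch of that reshape-and-rescale step with toy shapes and float scales; the real code consumes the scales produced by to_mx.

# Editor's sketch (not part of the commit): block-wise dequantization as done in
# _emulated_mxfp8_scaled_grouped_mm_2d_3d above, with toy shapes and float scales.
import torch

block_size = 32
M, K = 8, 128
A_fp8 = torch.randn(M, K).to(torch.float8_e4m3fn)  # fp8 payload, shape (M, K)
A_scale = torch.rand(M, K // block_size)            # one scale per block of 32 values

# Line each scale up with its block, rescale in bf16, then restore the (M, K) shape.
A_blocks = A_fp8.reshape(M, K // block_size, block_size)
A = A_blocks.to(torch.bfloat16) * A_scale.unsqueeze(-1).to(torch.bfloat16)
A = A.reshape(M, K)

The 2d-3d variant applies the same pattern to B of shape (E, N, K), and the 2d-2d variant does it per expert group, slicing A_fp8 and B_fp8 with the group offsets before rescaling.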
