v4.4 tag release update. (#3032)

2026-05-26 08:00:54 +00:00 · 2026-02-14 12:27:58 +08:00
parent 01687cfba1
commit d4bbf728ca
140 changed files with 41624 additions and 3691 deletions
--- a/examples/python/CuTeDSL/experimental/blackwell/dense_block_scaled_gemm.py
+++ b/examples/python/CuTeDSL/experimental/blackwell/dense_block_scaled_gemm.py
@@ -213,12 +213,10 @@ class BlockScaledDenseGemmKernel:
        )

        # UMMA ACC TMEM Layout
-        # ((MMA_M, MMA_N), REST_MMA_M, REST_MMA_N)
-        acc_shape = tiled_mma.partition_shape_C(mma_tiler_mnk[:2])
        # ((MMA_M, MMA_N), REST_MMA_M, REST_MMA_N, ACC_STAGES)
-        tmem_accs_layout = tiled_mma.make_fragment_C(
-            cute.append(acc_shape, self.num_acc_stages)
-        ).layout
+        tmem_accs_layout = cute_ext.make_tmem_layout_acc(
+            tiled_mma, mma_tiler_mnk, self.num_acc_stages
+        )

        sfa_tmem_layout = blockscaled_utils.make_tmem_layout_sfa(
            tiled_mma,
@@ -318,21 +316,17 @@ class BlockScaledDenseGemmKernel:
            self.use_2cta_instrs,
        )

-        # Performing layout calculations for one stage, in order to anticipate the
-        # required RMEM per thread and for reading from TMEM, and writing into SMEM
-        # tmem_acc: (MMA_M, MMA_N, MMA_REST_M, MMA_REST_N)
-        tmem_acc = tiled_mma.make_fragment_C(acc_shape)
-        # tmem_acc_epi: (EPI_TILE_M, EPI_TILE_N, EPI_REST_M, EPI_REST_N)
-        tmem_acc_epi = cute.flat_divide(tmem_acc[((None, None), 0, 0)], epi_tile)
-        tiled_copy_t2r = tcgen05.make_tmem_copy(
-            copy_atom_t2r, tmem_acc_epi[(None, None, 0, 0)]
-        )
+        # Derive tiled_copy_t2r from the allocated TMEM buffer
+        accumulators = cute.zipped_divide(buffer_tmem_accs, ((epi_tile), 1))
+        acc_epi_div = accumulators[((None, None), 0), 0]
+        tiled_copy_t2r = tcgen05.make_tmem_copy(copy_atom_t2r, acc_epi_div)
        thr_copy_t2r = tiled_copy_t2r.get_slice(tidx)

-        # gC_tile_epi: (EPI_TILE_M, EPI_TILE_N, EPI_REST_M, EPI_REST_N)
+        # Derive per-thread RMEM layout for the T2R epilogue copy
        gC_tile_epi = cute.flat_divide(gC_tile, epi_tile)
-        t2r_rmem_epi = thr_copy_t2r.partition_D(gC_tile_epi[(None, None, 0, 0)])
-        acc_epi_rmem_layout = cute.make_fragment_like(t2r_rmem_epi.layout)
+        acc_epi_rmem_layout = cute_ext.make_t2r_rmem_layout(
+            tiled_copy_t2r, gC_tile_epi, tidx
+        )

        # Allocate RMEM buffers
        buffer_rmem_t2r = cute_ext.allocate(
--- a/examples/python/CuTeDSL/experimental/blackwell/dense_gemm.py
+++ b/examples/python/CuTeDSL/experimental/blackwell/dense_gemm.py
@@ -463,18 +463,9 @@ class DenseGemmKernel:
        # - Has a capacity limit of 512 columns
        # - Requires specific layout patterns matching MMA instructions
        #
-        # partition_shape_C: Computes the accumulator shape based on MMA configuration.
-        # This returns the shape needed to store C = A × B results.
-        #
-        # cute.append(shape, stage): Appends a dimension for staging.
-        # For acc_stage=2: shape becomes (..., 2) for double-buffering.
-        #
-        # make_fragment_C: Creates a tensor descriptor with the appropriate layout
-        # for MMA accumulator storage. The .layout attribute extracts just the layout.
-        acc_shape = tiled_mma.partition_shape_C(mnk_tiler[:2])  # (M_tile, N_tile)
-        tmem_layout = tiled_mma.make_fragment_C(
-            cute.append(acc_shape, acc_stage)  # Add stage dimension
-        ).layout
+        # make_tmem_layout_acc: Derives the TMEM accumulator buffer layout from the
+        # tiled MMA and MNK tiler, with acc_stage accumulator stages.
+        tmem_layout = cute_ext.make_tmem_layout_acc(tiled_mma, mnk_tiler, acc_stage)

        # ========================================================================================
        # STEP 10: ALLOCATE SMEM BUFFERS
@@ -583,9 +574,6 @@ class DenseGemmKernel:
        # We derive the RMEM layout by partitioning the destination and extracting
        # the per-thread layout.
        #
-        # get_slice(tid_x): Gets the per-thread view of the tiled copy.
-        # partition_D: Partitions the destination tensor according to the copy layout.
-        #
        # CUTE ALGEBRA EXPLANATION - flat_divide:
        # ---------------------------------------
        # flat_divide(tensor, tiler) flattens all dimensions:
@@ -594,20 +582,11 @@ class DenseGemmKernel:
        # Unlike zipped_divide which groups tile and rest separately,
        # flat_divide keeps everything flat, which is useful for iteration.
        #
-        # For epilogue: gC_mnl_epi = cute.flat_divide(gD_tile, epi_tile)
-        # This creates a flat view where we can iterate over sub-tiles with indices.
-        thr_copy_t2r = tiled_copy_t2r.get_slice(tid_x)
+        # make_t2r_rmem_layout: Derives the per-thread RMEM buffer layout
+        # produced by a TMEM->RMEM copy for a single epilogue iteration.
        gC_mnl_epi = cute.flat_divide(gD_tile, epi_tile)
-
-        # Partition the output tensor according to the copy layout.
-        # tTR_gC has the thread's view of the output.
-        tTR_gC = thr_copy_t2r.partition_D(gC_mnl_epi)
-
-        # make_fragment_like: Creates a layout matching a given tensor's layout.
-        # This is the standard way to derive RMEM layouts from copy partitions.
-        # The slicing [(None, None, None, 0, 0)] extracts one sub-tile's layout.
-        acc_d_rmem_layout = cute.make_fragment_like(
-            tTR_gC[(None, None, None, 0, 0)].layout
+        acc_d_rmem_layout = cute_ext.make_t2r_rmem_layout(
+            tiled_copy_t2r, gC_mnl_epi, tid_x
        )

        # ========================================================================================
--- a/examples/python/CuTeDSL/experimental/blackwell/dense_gemm_2sm.py
+++ b/examples/python/CuTeDSL/experimental/blackwell/dense_gemm_2sm.py
@@ -200,10 +200,9 @@ def sm100_4x4x1_kernel_builder(
            TMA_STORE_PIPE_DEPTH,
        )

-        acc_shape = tiled_mma.partition_shape_C(mnk_tiler[:2])
-        tmem_layout = tiled_mma.make_fragment_C(
-            cute.append(acc_shape, EPILOGUE_STAGE_DEPTH)
-        ).layout
+        tmem_layout = cute_ext.make_tmem_layout_acc(
+            tiled_mma, mnk_tiler, EPILOGUE_STAGE_DEPTH
+        )

        bufferA = cute_ext.allocate(
            ab_dtype,
@@ -251,11 +250,9 @@ def sm100_4x4x1_kernel_builder(
        tiled_copy_t2r = cute.nvgpu.tcgen05.make_tmem_copy(copy_atom_t2r, acc_epi_div)

        # Calculate the per thread destination size per iteration for output of TMEM and input of SMEM
-        thr_copy_t2r = tiled_copy_t2r.get_slice(tid_x)
        gC_mnl_epi = cute.flat_divide(tDgD, epi_tile)
-        tTR_gC = thr_copy_t2r.partition_D(gC_mnl_epi)
-        acc_d_rmem_layout = cute.make_fragment_like(
-            tTR_gC[(None, None, None, 0, 0)].layout
+        acc_d_rmem_layout = cute_ext.make_t2r_rmem_layout(
+            tiled_copy_t2r, gC_mnl_epi, tid_x
        )

        bufferRAcc = cute_ext.allocate(
--- a/examples/python/CuTeDSL/experimental/blackwell/dense_gemm_cute_pipeline.py
+++ b/examples/python/CuTeDSL/experimental/blackwell/dense_gemm_cute_pipeline.py
--- a/examples/python/CuTeDSL/experimental/blackwell/dense_gemm_ptr_array.py
+++ b/examples/python/CuTeDSL/experimental/blackwell/dense_gemm_ptr_array.py
@@ -198,10 +198,7 @@ class DenseGemmPtrArrayKernel:
        )

        # UMMA ACC TMEM Layout
-        acc_shape = tiled_mma.partition_shape_C(mnk_tiler[:2])
-        tmem_layout = tiled_mma.make_fragment_C(
-            cute.append(acc_shape, acc_stage)
-        ).layout
+        tmem_layout = cute_ext.make_tmem_layout_acc(tiled_mma, mnk_tiler, acc_stage)

        # Allocate UMMA Buffers
        bufferA = cute_ext.allocate(
@@ -251,11 +248,9 @@ class DenseGemmPtrArrayKernel:
        tiled_copy_t2r = cute.nvgpu.tcgen05.make_tmem_copy(copy_atom_t2r, acc_epi_div)

        # Calculate the per thread destination size per iteration for output of TMEM and input of SMEM
-        thr_copy_t2r = tiled_copy_t2r.get_slice(tid_x)
        gC_mnl_epi = cute.flat_divide(gD_tile, epi_tile)
-        tTR_gC = thr_copy_t2r.partition_D(gC_mnl_epi)
-        acc_d_rmem_layout = cute.make_fragment_like(
-            tTR_gC[(None, None, None, 0, 0)].layout
+        acc_d_rmem_layout = cute_ext.make_t2r_rmem_layout(
+            tiled_copy_t2r, gC_mnl_epi, tid_x
        )

        # Allocate RMEM buffers