v4.4 tag release update. (#3032)

This commit is contained in:
Junkai-Wu
2026-02-14 12:27:58 +08:00
committed by GitHub
parent 01687cfba1
commit d4bbf728ca
140 changed files with 41624 additions and 3691 deletions

View File

@@ -213,12 +213,10 @@ class BlockScaledDenseGemmKernel:
)
# UMMA ACC TMEM Layout
# ((MMA_M, MMA_N), REST_MMA_M, REST_MMA_N)
acc_shape = tiled_mma.partition_shape_C(mma_tiler_mnk[:2])
# ((MMA_M, MMA_N), REST_MMA_M, REST_MMA_N, ACC_STAGES)
tmem_accs_layout = tiled_mma.make_fragment_C(
cute.append(acc_shape, self.num_acc_stages)
).layout
tmem_accs_layout = cute_ext.make_tmem_layout_acc(
tiled_mma, mma_tiler_mnk, self.num_acc_stages
)
sfa_tmem_layout = blockscaled_utils.make_tmem_layout_sfa(
tiled_mma,
@@ -318,21 +316,17 @@ class BlockScaledDenseGemmKernel:
self.use_2cta_instrs,
)
# Perform layout calculations for a single stage to determine the RMEM required
# per thread for reading from TMEM and writing into SMEM
# tmem_acc: (MMA_M, MMA_N, MMA_REST_M, MMA_REST_N)
tmem_acc = tiled_mma.make_fragment_C(acc_shape)
# tmem_acc_epi: (EPI_TILE_M, EPI_TILE_N, EPI_REST_M, EPI_REST_N)
tmem_acc_epi = cute.flat_divide(tmem_acc[((None, None), 0, 0)], epi_tile)
tiled_copy_t2r = tcgen05.make_tmem_copy(
copy_atom_t2r, tmem_acc_epi[(None, None, 0, 0)]
)
# Derive tiled_copy_t2r from the allocated TMEM buffer
accumulators = cute.zipped_divide(buffer_tmem_accs, ((epi_tile), 1))
acc_epi_div = accumulators[((None, None), 0), 0]
tiled_copy_t2r = tcgen05.make_tmem_copy(copy_atom_t2r, acc_epi_div)
thr_copy_t2r = tiled_copy_t2r.get_slice(tidx)
# gC_tile_epi: (EPI_TILE_M, EPI_TILE_N, EPI_REST_M, EPI_REST_N)
# Derive per-thread RMEM layout for the T2R epilogue copy
gC_tile_epi = cute.flat_divide(gC_tile, epi_tile)
t2r_rmem_epi = thr_copy_t2r.partition_D(gC_tile_epi[(None, None, 0, 0)])
acc_epi_rmem_layout = cute.make_fragment_like(t2r_rmem_epi.layout)
acc_epi_rmem_layout = cute_ext.make_t2r_rmem_layout(
tiled_copy_t2r, gC_tile_epi, tidx
)
# Allocate RMEM buffers
buffer_rmem_t2r = cute_ext.allocate(

View File

@@ -463,18 +463,9 @@ class DenseGemmKernel:
# - Has a capacity limit of 512 columns
# - Requires specific layout patterns matching MMA instructions
#
# partition_shape_C: Computes the accumulator shape based on MMA configuration.
# This returns the shape needed to store C = A × B results.
#
# cute.append(shape, stage): Appends a dimension for staging.
# For acc_stage=2: shape becomes (..., 2) for double-buffering.
#
# make_fragment_C: Creates a tensor descriptor with the appropriate layout
# for MMA accumulator storage. The .layout attribute extracts just the layout.
acc_shape = tiled_mma.partition_shape_C(mnk_tiler[:2]) # (M_tile, N_tile)
tmem_layout = tiled_mma.make_fragment_C(
cute.append(acc_shape, acc_stage) # Add stage dimension
).layout
# make_tmem_layout_acc: Derives the TMEM accumulator buffer layout from the
# tiled MMA and MNK tiler, with the given number of pipeline stages.
tmem_layout = cute_ext.make_tmem_layout_acc(tiled_mma, mnk_tiler, acc_stage)
# ========================================================================================
# STEP 10: ALLOCATE SMEM BUFFERS
@@ -583,9 +574,6 @@ class DenseGemmKernel:
# We derive the RMEM layout by partitioning the destination and extracting
# the per-thread layout.
#
# get_slice(tid_x): Gets the per-thread view of the tiled copy.
# partition_D: Partitions the destination tensor according to the copy layout.
#
# CUTE ALGEBRA EXPLANATION - flat_divide:
# ---------------------------------------
# flat_divide(tensor, tiler) flattens all dimensions:
@@ -594,20 +582,11 @@ class DenseGemmKernel:
# Unlike zipped_divide, which groups tile and rest separately,
# flat_divide keeps everything flat, which is useful for iteration.
#
# For epilogue: gC_mnl_epi = cute.flat_divide(gD_tile, epi_tile)
# This creates a flat view where we can iterate over sub-tiles with indices.
thr_copy_t2r = tiled_copy_t2r.get_slice(tid_x)
# make_t2r_rmem_layout: Derives the per-thread RMEM buffer layout
# produced by a TMEM->RMEM copy for a single epilogue iteration.
gC_mnl_epi = cute.flat_divide(gD_tile, epi_tile)
# Partition the output tensor according to the copy layout.
# tTR_gC has the thread's view of the output.
tTR_gC = thr_copy_t2r.partition_D(gC_mnl_epi)
# make_fragment_like: Creates a layout matching a given tensor's layout.
# This is the standard way to derive RMEM layouts from copy partitions.
# The slicing [(None, None, None, 0, 0)] extracts one sub-tile's layout.
acc_d_rmem_layout = cute.make_fragment_like(
tTR_gC[(None, None, None, 0, 0)].layout
acc_d_rmem_layout = cute_ext.make_t2r_rmem_layout(
tiled_copy_t2r, gC_mnl_epi, tid_x
)
# ========================================================================================

View File

@@ -200,10 +200,9 @@ def sm100_4x4x1_kernel_builder(
TMA_STORE_PIPE_DEPTH,
)
acc_shape = tiled_mma.partition_shape_C(mnk_tiler[:2])
tmem_layout = tiled_mma.make_fragment_C(
cute.append(acc_shape, EPILOGUE_STAGE_DEPTH)
).layout
tmem_layout = cute_ext.make_tmem_layout_acc(
tiled_mma, mnk_tiler, EPILOGUE_STAGE_DEPTH
)
bufferA = cute_ext.allocate(
ab_dtype,
@@ -251,11 +250,9 @@ def sm100_4x4x1_kernel_builder(
tiled_copy_t2r = cute.nvgpu.tcgen05.make_tmem_copy(copy_atom_t2r, acc_epi_div)
# Calculate the per-thread destination size, per iteration, for data read out of TMEM and written into SMEM
thr_copy_t2r = tiled_copy_t2r.get_slice(tid_x)
gC_mnl_epi = cute.flat_divide(tDgD, epi_tile)
tTR_gC = thr_copy_t2r.partition_D(gC_mnl_epi)
acc_d_rmem_layout = cute.make_fragment_like(
tTR_gC[(None, None, None, 0, 0)].layout
acc_d_rmem_layout = cute_ext.make_t2r_rmem_layout(
tiled_copy_t2r, gC_mnl_epi, tid_x
)
bufferRAcc = cute_ext.allocate(

View File

@@ -198,10 +198,7 @@ class DenseGemmPtrArrayKernel:
)
# UMMA ACC TMEM Layout
acc_shape = tiled_mma.partition_shape_C(mnk_tiler[:2])
tmem_layout = tiled_mma.make_fragment_C(
cute.append(acc_shape, acc_stage)
).layout
tmem_layout = cute_ext.make_tmem_layout_acc(tiled_mma, mnk_tiler, acc_stage)
# Allocate UMMA Buffers
bufferA = cute_ext.allocate(
@@ -251,11 +248,9 @@ class DenseGemmPtrArrayKernel:
tiled_copy_t2r = cute.nvgpu.tcgen05.make_tmem_copy(copy_atom_t2r, acc_epi_div)
# Calculate the per-thread destination size, per iteration, for data read out of TMEM and written into SMEM
thr_copy_t2r = tiled_copy_t2r.get_slice(tid_x)
gC_mnl_epi = cute.flat_divide(gD_tile, epi_tile)
tTR_gC = thr_copy_t2r.partition_D(gC_mnl_epi)
acc_d_rmem_layout = cute.make_fragment_like(
tTR_gC[(None, None, None, 0, 0)].layout
acc_d_rmem_layout = cute_ext.make_t2r_rmem_layout(
tiled_copy_t2r, gC_mnl_epi, tid_x
)
# Allocate RMEM buffers