mirror of
https://github.com/NVIDIA/cutlass.git
synced 2026-05-26 08:00:54 +00:00
v4.4 tag release update. (#3032)
This commit is contained in:
@@ -213,12 +213,10 @@ class BlockScaledDenseGemmKernel:
|
||||
)
|
||||
|
||||
# UMMA ACC TMEM Layout
|
||||
# ((MMA_M, MMA_N), REST_MMA_M, REST_MMA_N)
|
||||
acc_shape = tiled_mma.partition_shape_C(mma_tiler_mnk[:2])
|
||||
# ((MMA_M, MMA_N), REST_MMA_M, REST_MMA_N, ACC_STAGES)
|
||||
tmem_accs_layout = tiled_mma.make_fragment_C(
|
||||
cute.append(acc_shape, self.num_acc_stages)
|
||||
).layout
|
||||
tmem_accs_layout = cute_ext.make_tmem_layout_acc(
|
||||
tiled_mma, mma_tiler_mnk, self.num_acc_stages
|
||||
)
|
||||
|
||||
sfa_tmem_layout = blockscaled_utils.make_tmem_layout_sfa(
|
||||
tiled_mma,
|
||||
@@ -318,21 +316,17 @@ class BlockScaledDenseGemmKernel:
|
||||
self.use_2cta_instrs,
|
||||
)
|
||||
|
||||
# Performing layout calculations for one stage, in order to anticipate the
|
||||
# required RMEM per thread and for reading from TMEM, and writing into SMEM
|
||||
# tmem_acc: (MMA_M, MMA_N, MMA_REST_M, MMA_REST_N)
|
||||
tmem_acc = tiled_mma.make_fragment_C(acc_shape)
|
||||
# tmem_acc_epi: (EPI_TILE_M, EPI_TILE_N, EPI_REST_M, EPI_REST_N)
|
||||
tmem_acc_epi = cute.flat_divide(tmem_acc[((None, None), 0, 0)], epi_tile)
|
||||
tiled_copy_t2r = tcgen05.make_tmem_copy(
|
||||
copy_atom_t2r, tmem_acc_epi[(None, None, 0, 0)]
|
||||
)
|
||||
# Derive tiled_copy_t2r from the allocated TMEM buffer
|
||||
accumulators = cute.zipped_divide(buffer_tmem_accs, ((epi_tile), 1))
|
||||
acc_epi_div = accumulators[((None, None), 0), 0]
|
||||
tiled_copy_t2r = tcgen05.make_tmem_copy(copy_atom_t2r, acc_epi_div)
|
||||
thr_copy_t2r = tiled_copy_t2r.get_slice(tidx)
|
||||
|
||||
# gC_tile_epi: (EPI_TILE_M, EPI_TILE_N, EPI_REST_M, EPI_REST_N)
|
||||
# Derive per-thread RMEM layout for the T2R epilogue copy
|
||||
gC_tile_epi = cute.flat_divide(gC_tile, epi_tile)
|
||||
t2r_rmem_epi = thr_copy_t2r.partition_D(gC_tile_epi[(None, None, 0, 0)])
|
||||
acc_epi_rmem_layout = cute.make_fragment_like(t2r_rmem_epi.layout)
|
||||
acc_epi_rmem_layout = cute_ext.make_t2r_rmem_layout(
|
||||
tiled_copy_t2r, gC_tile_epi, tidx
|
||||
)
|
||||
|
||||
# Allocate RMEM buffers
|
||||
buffer_rmem_t2r = cute_ext.allocate(
|
||||
|
||||
@@ -463,18 +463,9 @@ class DenseGemmKernel:
|
||||
# - Has a capacity limit of 512 columns
|
||||
# - Requires specific layout patterns matching MMA instructions
|
||||
#
|
||||
# partition_shape_C: Computes the accumulator shape based on MMA configuration.
|
||||
# This returns the shape needed to store C = A × B results.
|
||||
#
|
||||
# cute.append(shape, stage): Appends a dimension for staging.
|
||||
# For acc_stage=2: shape becomes (..., 2) for double-buffering.
|
||||
#
|
||||
# make_fragment_C: Creates a tensor descriptor with the appropriate layout
|
||||
# for MMA accumulator storage. The .layout attribute extracts just the layout.
|
||||
acc_shape = tiled_mma.partition_shape_C(mnk_tiler[:2]) # (M_tile, N_tile)
|
||||
tmem_layout = tiled_mma.make_fragment_C(
|
||||
cute.append(acc_shape, acc_stage) # Add stage dimension
|
||||
).layout
|
||||
# make_tmem_layout_acc: Derives the TMEM accumulator buffer layout from the
|
||||
# tiled MMA and MNK tiler, with the given number of pipeline stages.
|
||||
tmem_layout = cute_ext.make_tmem_layout_acc(tiled_mma, mnk_tiler, acc_stage)
|
||||
|
||||
# ========================================================================================
|
||||
# STEP 10: ALLOCATE SMEM BUFFERS
|
||||
@@ -583,9 +574,6 @@ class DenseGemmKernel:
|
||||
# We derive the RMEM layout by partitioning the destination and extracting
|
||||
# the per-thread layout.
|
||||
#
|
||||
# get_slice(tid_x): Gets the per-thread view of the tiled copy.
|
||||
# partition_D: Partitions the destination tensor according to the copy layout.
|
||||
#
|
||||
# CUTE ALGEBRA EXPLANATION - flat_divide:
|
||||
# ---------------------------------------
|
||||
# flat_divide(tensor, tiler) flattens all dimensions:
|
||||
@@ -594,20 +582,11 @@ class DenseGemmKernel:
|
||||
# Unlike zipped_divide which groups tile and rest separately,
|
||||
# flat_divide keeps everything flat, which is useful for iteration.
|
||||
#
|
||||
# For epilogue: gC_mnl_epi = cute.flat_divide(gD_tile, epi_tile)
|
||||
# This creates a flat view where we can iterate over sub-tiles with indices.
|
||||
thr_copy_t2r = tiled_copy_t2r.get_slice(tid_x)
|
||||
# make_t2r_rmem_layout: Derives the per-thread RMEM buffer layout
|
||||
# produced by a TMEM->RMEM copy for a single epilogue iteration.
|
||||
gC_mnl_epi = cute.flat_divide(gD_tile, epi_tile)
|
||||
|
||||
# Partition the output tensor according to the copy layout.
|
||||
# tTR_gC has the thread's view of the output.
|
||||
tTR_gC = thr_copy_t2r.partition_D(gC_mnl_epi)
|
||||
|
||||
# make_fragment_like: Creates a layout matching a given tensor's layout.
|
||||
# This is the standard way to derive RMEM layouts from copy partitions.
|
||||
# The slicing [(None, None, None, 0, 0)] extracts one sub-tile's layout.
|
||||
acc_d_rmem_layout = cute.make_fragment_like(
|
||||
tTR_gC[(None, None, None, 0, 0)].layout
|
||||
acc_d_rmem_layout = cute_ext.make_t2r_rmem_layout(
|
||||
tiled_copy_t2r, gC_mnl_epi, tid_x
|
||||
)
|
||||
|
||||
# ========================================================================================
|
||||
|
||||
@@ -200,10 +200,9 @@ def sm100_4x4x1_kernel_builder(
|
||||
TMA_STORE_PIPE_DEPTH,
|
||||
)
|
||||
|
||||
acc_shape = tiled_mma.partition_shape_C(mnk_tiler[:2])
|
||||
tmem_layout = tiled_mma.make_fragment_C(
|
||||
cute.append(acc_shape, EPILOGUE_STAGE_DEPTH)
|
||||
).layout
|
||||
tmem_layout = cute_ext.make_tmem_layout_acc(
|
||||
tiled_mma, mnk_tiler, EPILOGUE_STAGE_DEPTH
|
||||
)
|
||||
|
||||
bufferA = cute_ext.allocate(
|
||||
ab_dtype,
|
||||
@@ -251,11 +250,9 @@ def sm100_4x4x1_kernel_builder(
|
||||
tiled_copy_t2r = cute.nvgpu.tcgen05.make_tmem_copy(copy_atom_t2r, acc_epi_div)
|
||||
|
||||
# Calculate the per-thread destination size per iteration (output of TMEM, input of SMEM)
|
||||
thr_copy_t2r = tiled_copy_t2r.get_slice(tid_x)
|
||||
gC_mnl_epi = cute.flat_divide(tDgD, epi_tile)
|
||||
tTR_gC = thr_copy_t2r.partition_D(gC_mnl_epi)
|
||||
acc_d_rmem_layout = cute.make_fragment_like(
|
||||
tTR_gC[(None, None, None, 0, 0)].layout
|
||||
acc_d_rmem_layout = cute_ext.make_t2r_rmem_layout(
|
||||
tiled_copy_t2r, gC_mnl_epi, tid_x
|
||||
)
|
||||
|
||||
bufferRAcc = cute_ext.allocate(
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -198,10 +198,7 @@ class DenseGemmPtrArrayKernel:
|
||||
)
|
||||
|
||||
# UMMA ACC TMEM Layout
|
||||
acc_shape = tiled_mma.partition_shape_C(mnk_tiler[:2])
|
||||
tmem_layout = tiled_mma.make_fragment_C(
|
||||
cute.append(acc_shape, acc_stage)
|
||||
).layout
|
||||
tmem_layout = cute_ext.make_tmem_layout_acc(tiled_mma, mnk_tiler, acc_stage)
|
||||
|
||||
# Allocate UMMA Buffers
|
||||
bufferA = cute_ext.allocate(
|
||||
@@ -251,11 +248,9 @@ class DenseGemmPtrArrayKernel:
|
||||
tiled_copy_t2r = cute.nvgpu.tcgen05.make_tmem_copy(copy_atom_t2r, acc_epi_div)
|
||||
|
||||
# Calculate the per-thread destination size per iteration (output of TMEM, input of SMEM)
|
||||
thr_copy_t2r = tiled_copy_t2r.get_slice(tid_x)
|
||||
gC_mnl_epi = cute.flat_divide(gD_tile, epi_tile)
|
||||
tTR_gC = thr_copy_t2r.partition_D(gC_mnl_epi)
|
||||
acc_d_rmem_layout = cute.make_fragment_like(
|
||||
tTR_gC[(None, None, None, 0, 0)].layout
|
||||
acc_d_rmem_layout = cute_ext.make_t2r_rmem_layout(
|
||||
tiled_copy_t2r, gC_mnl_epi, tid_x
|
||||
)
|
||||
|
||||
# Allocate RMEM buffers
|
||||
|
||||
Reference in New Issue
Block a user