From 7c60edcc0a136cf836963262a8f9aad9960d5f7b Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Wed, 1 Apr 2026 08:16:04 -0500 Subject: [PATCH 1/9] cuda.core.experimental -> cuda.core --- python/examples/axes.py | 2 +- python/examples/cpu_activity.py | 2 +- python/examples/exec_tag_sync.py | 2 +- python/examples/skip.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/examples/axes.py b/python/examples/axes.py index ce67238..fa7c6c4 100644 --- a/python/examples/axes.py +++ b/python/examples/axes.py @@ -20,7 +20,7 @@ from typing import Dict, Optional, Tuple import cuda.bench as bench import cuda.cccl.headers as headers -import cuda.core.experimental as core +import cuda.core as core def as_core_Stream(cs: bench.CudaStream) -> core.Stream: diff --git a/python/examples/cpu_activity.py b/python/examples/cpu_activity.py index a492ff7..df1efae 100644 --- a/python/examples/cpu_activity.py +++ b/python/examples/cpu_activity.py @@ -19,7 +19,7 @@ import time import cuda.bench as bench import cuda.cccl.headers as headers -import cuda.core.experimental as core +import cuda.core as core host_sleep_duration = 0.1 diff --git a/python/examples/exec_tag_sync.py b/python/examples/exec_tag_sync.py index b9ab5ef..8d57311 100644 --- a/python/examples/exec_tag_sync.py +++ b/python/examples/exec_tag_sync.py @@ -20,7 +20,7 @@ from typing import Optional import cuda.bench as bench import cuda.cccl.headers as headers -import cuda.core.experimental as core +import cuda.core as core def as_core_Stream(cs: bench.CudaStream) -> core.Stream: diff --git a/python/examples/skip.py b/python/examples/skip.py index cf7ec90..f720003 100644 --- a/python/examples/skip.py +++ b/python/examples/skip.py @@ -18,7 +18,7 @@ import sys import cuda.bench as bench import cuda.cccl.headers as headers -import cuda.core.experimental as core +import cuda.core as core def as_core_Stream(cs: bench.CudaStream) -> core.Stream: From 974eb5ee0fa26645fa007574c68278628fd8a0e5 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Wed, 1 Apr 2026 08:17:12 -0500 Subject: [PATCH 2/9] Replace use of cupy.cuda.ExternalStream with cupy.cuda.Stream.from_external --- python/examples/cupy_extract.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/python/examples/cupy_extract.py b/python/examples/cupy_extract.py index 091141c..d7f2a01 100644 --- a/python/examples/cupy_extract.py +++ b/python/examples/cupy_extract.py @@ -20,21 +20,16 @@ import cuda.bench as bench import cupy as cp -def as_cp_ExternalStream( - cs: bench.CudaStream, dev_id: int | None = -1 -) -> cp.cuda.ExternalStream: - h = cs.addressof() - return cp.cuda.ExternalStream(h, dev_id) +def as_cp_ExternalStream(cs: bench.CudaStream): + return cp.cuda.Stream.from_external(cs) def cupy_extract_by_mask(state: bench.State): n_cols = state.get_int64("numCols") n_rows = state.get_int64("numRows") - dev_id = state.get_device() - cp_s = as_cp_ExternalStream(state.get_stream(), dev_id) + cp_s = as_cp_ExternalStream(state.get_stream()) - state.collect_cupti_metrics() state.add_element_count(n_rows * n_cols, "# Elements") int32_dt = cp.dtype(cp.int32) bool_dt = cp.dtype(cp.bool_) @@ -49,7 +44,7 @@ def cupy_extract_by_mask(state: bench.State): _ = X[mask] def launcher(launch: bench.Launch): - with as_cp_ExternalStream(launch.get_stream(), dev_id): + with as_cp_ExternalStream(launch.get_stream()): _ = X[mask] state.exec(launcher, sync=True) From d8739fc208bcbae327e8e2fa5848500baad76a84 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Wed, 1 Apr 2026 08:17:52 -0500 Subject: [PATCH 3/9] Update to cccl_cooperative_block_reduce example --- python/examples/cccl_cooperative_block_reduce.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/examples/cccl_cooperative_block_reduce.py b/python/examples/cccl_cooperative_block_reduce.py index 0d5d970..c0f0138 100644 --- a/python/examples/cccl_cooperative_block_reduce.py +++ b/python/examples/cccl_cooperative_block_reduce.py @@ -17,7 +17,7 @@ import sys import cuda.bench as bench -import cuda.cccl.cooperative.experimental as coop +import cuda.coop as coop import numba import numpy as np from numba import cuda From 5bdb30f4b606779a3dadd4edad385f4e49d52443 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Wed, 1 Apr 2026 08:18:15 -0500 Subject: [PATCH 4/9] Update to cccl_parallel_segmented_reduce example per changes in API Update namespace changes. Use make_segmented_reduce factory function, and update call signatures. --- .../cccl_parallel_segmented_reduce.py | 55 ++++++++----------- 1 file changed, 22 insertions(+), 33 deletions(-) diff --git a/python/examples/cccl_parallel_segmented_reduce.py b/python/examples/cccl_parallel_segmented_reduce.py index e54a77b..d2140b9 100644 --- a/python/examples/cccl_parallel_segmented_reduce.py +++ b/python/examples/cccl_parallel_segmented_reduce.py @@ -17,36 +17,20 @@ import sys import cuda.bench as bench -import cuda.cccl.parallel.experimental.algorithms as algorithms -import cuda.cccl.parallel.experimental.iterators as iterators -import cuda.core.experimental as core +import cuda.compute.algorithms as algorithms +import cuda.compute.iterators as iterators +import cuda.core as core import cupy as cp import numpy as np - - -class CCCLStream: - "Class to work around https://github.com/NVIDIA/cccl/issues/5144" - - def __init__(self, ptr): - self._ptr = ptr - - def __cuda_stream__(self): - return (0, self._ptr) +from cuda.compute import OpKind def as_core_Stream(cs: bench.CudaStream) -> core.Stream: return core.Stream.from_handle(cs.addressof()) -def as_cccl_Stream(cs: bench.CudaStream) -> CCCLStream: - return CCCLStream(cs.addressof()) - - -def as_cp_ExternalStream( - cs: bench.CudaStream, dev_id: int | None = -1 -) -> cp.cuda.ExternalStream: - h = cs.addressof() - return cp.cuda.ExternalStream(h, dev_id) +def as_cp_ExternalStream(cs: bench.CudaStream) -> cp.cuda.ExternalStream: + return cp.cuda.Stream.from_external(cs) def segmented_reduce(state: bench.State): @@ -56,13 +40,8 @@ def segmented_reduce(state: bench.State): n_rows = n_elems // n_cols state.add_summary("numRows", n_rows) - state.collect_cupti_metrics() - dev_id = state.get_device() - cp_stream = as_cp_ExternalStream(state.get_stream(), dev_id) - - def add_op(a, b): - return a + b + cp_stream = as_cp_ExternalStream(state.get_stream()) def make_scaler(step): def scale(row_id): @@ -85,15 +64,24 @@ def segmented_reduce(state: bench.State): d_input = mat d_output = cp.empty(n_rows, dtype=d_input.dtype) - alg = algorithms.segmented_reduce( + add_op = OpKind.PLUS + + alg = algorithms.make_segmented_reduce( d_input, d_output, start_offsets, end_offsets, add_op, h_init ) - cccl_stream = as_cccl_Stream(state.get_stream()) - + cccl_stream = state.get_stream() # query size of temporary storage and allocate temp_nbytes = alg( - None, d_input, d_output, n_rows, start_offsets, end_offsets, h_init, cccl_stream + None, + d_input, + d_output, + add_op, + n_rows, + start_offsets, + end_offsets, + h_init, + cccl_stream, ) h_init = np.zeros(tuple(), dtype=np.int32) @@ -101,11 +89,12 @@ def segmented_reduce(state: bench.State): temp_storage = cp.empty(temp_nbytes, dtype=cp.uint8) def launcher(launch: bench.Launch): - s = as_cccl_Stream(launch.get_stream()) + s = launch.get_stream() alg( temp_storage, d_input, d_output, + add_op, n_rows, start_offsets, end_offsets, From 3f284b4004b44cc05609a5f48f0a527bb32f613f Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Wed, 1 Apr 2026 08:20:20 -0500 Subject: [PATCH 5/9] Renamed cccl_* examples cccl_parallel_* -> cuda_compute_* cccl_cooperative_* -> cuda_coop_* --- ...allel_segmented_reduce.py => cuda_compute_segmented_reduce.py} | 0 ...cccl_cooperative_block_reduce.py => cuda_coop_block_reduce.py} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename python/examples/{cccl_parallel_segmented_reduce.py => cuda_compute_segmented_reduce.py} (100%) rename python/examples/{cccl_cooperative_block_reduce.py => cuda_coop_block_reduce.py} (100%) diff --git a/python/examples/cccl_parallel_segmented_reduce.py b/python/examples/cuda_compute_segmented_reduce.py similarity index 100% rename from python/examples/cccl_parallel_segmented_reduce.py rename to python/examples/cuda_compute_segmented_reduce.py diff --git a/python/examples/cccl_cooperative_block_reduce.py b/python/examples/cuda_coop_block_reduce.py similarity index 100% rename from python/examples/cccl_cooperative_block_reduce.py rename to python/examples/cuda_coop_block_reduce.py From e4cfddeb878577cf2dd2b6bb0658603ab7592839 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Wed, 1 Apr 2026 08:23:41 -0500 Subject: [PATCH 6/9] Rewrote cutlass_gemm example to use CuteDSL --- python/examples/cutlass_gemm.py | 603 ++++++++++++++++++++++++++++++-- 1 file changed, 570 insertions(+), 33 deletions(-) diff --git a/python/examples/cutlass_gemm.py b/python/examples/cutlass_gemm.py index cd62f39..61ac2b3 100644 --- a/python/examples/cutlass_gemm.py +++ b/python/examples/cutlass_gemm.py @@ -16,13 +16,18 @@ import sys +from typing import Tuple import cuda.bench as bench import cuda.bindings.driver as driver -import cuda.core.experimental as core +import cuda.core as core import cupy as cp import cutlass +import cutlass.cute as cute +import cutlass.pipeline as pipeline +import cutlass.utils as utils import numpy as np +from cutlass.cute.runtime import from_dlpack def as_bindings_Stream(cs: bench.CudaStream) -> driver.CUstream: @@ -33,31 +38,578 @@ def as_core_Stream(cs: bench.CudaStream) -> core.Stream: return core.Stream.from_handle(cs.addressof()) -def make_cp_array( - arr_h: np.ndarray, dev_buf: core.Buffer, dev_id: int | None -) -> cp.ndarray: +def make_tensor(arr_h: np.ndarray, dev_buf: core.Buffer, dev_id: int | None): cp_memview = cp.cuda.UnownedMemory( int(dev_buf.handle), dev_buf.size, dev_buf, -1 if dev_id is None else dev_id ) zero_offset = 0 - return cp.ndarray( - arr_h.shape, - dtype=arr_h.dtype, - memptr=cp.cuda.MemoryPointer(cp_memview, zero_offset), + return from_dlpack( + cp.ndarray( + arr_h.shape, + dtype=arr_h.dtype, + memptr=cp.cuda.MemoryPointer(cp_memview, zero_offset), + ), + assumed_align=16, ) +class SGemm: + """ + Adapted from https://github.com/NVIDIA/cutlass/blob/main/examples/python/CuTeDSL/ampere/sgemm.py + """ + + def __init__( + self, + cta_tiler: Tuple[int, int, int] = (128, 128, 8), + num_stages: int = 3, + num_threads: int = 256, + ): + self._cta_tiler = cta_tiler + self._num_stages = num_stages + self._num_threads = num_threads + assert num_threads > 0, "needs at least one thread" + assert num_threads % 16 == 0, "multiples of 16 required for MMA thread layout" + + self._bM, self._bN, self._bK = self._cta_tiler + assert self._bM % 16 == 0, "multiple of 16 required for tile dimension M" + assert self._bN % 16 == 0, "multiple of 16 required for tile dimension N" + assert self._num_stages >= 3, "num_stages must be greater than or equal to 3" + self.cta_sync_barrier = pipeline.NamedBarrier( + barrier_id=1, num_threads=num_threads + ) + + @cute.jit + def __call__( + self, + mA: cute.Tensor, + mB: cute.Tensor, + mC: cute.Tensor, + epilogue_op: cutlass.Constexpr = lambda x: x, + stream: driver.CUstream = driver.CUstream( + driver.CUstream_flags.CU_STREAM_DEFAULT + ), + ): + self.a_major_mode = utils.LayoutEnum.from_tensor(mA) + self.b_major_mode = utils.LayoutEnum.from_tensor(mB) + self.c_major_mode = utils.LayoutEnum.from_tensor(mC) + + # /////////////////////////////////////////////////////////////////////////////// + # Create layouts for shared memory for A and B: + # - sA/sB is m/n-major to vectorized copies from shared + # memory to registers. This is because the MMA layouts + # for sA/sB are also m/n-major + # - When gA/gB is k-major, pad 4 elements to reduce bank conflicts + # /////////////////////////////////////////////////////////////////////////////// + + padding_a = 4 if self.a_major_mode == utils.LayoutEnum.ROW_MAJOR else 0 + padding_b = 4 if self.b_major_mode == utils.LayoutEnum.ROW_MAJOR else 0 + sA_layout = cute.make_layout( + (self._bM, self._bK, self._num_stages), + stride=(1, (self._bM + padding_a), self._bK * (self._bM + padding_a)), + ) + sB_layout = cute.make_layout( + (self._bN, self._bK, self._num_stages), + stride=(1, (self._bN + padding_b), self._bK * (self._bN + padding_b)), + ) + + # /////////////////////////////////////////////////////////////////////////////// + # Create copy layouts that will be used for asynchronous + # global memory -> shared memory copies: + # - The majorness of tA/tB follows the majorness of gA/gB + # - For k-major, these layouts will copy values one-by-one from + # from global memory, without vectorizing + # - For m/n-major, it will vectorize to a 128bit copy for faster + # data transfer between global and shared memory, as long + # as the alignment of the tensor allows it. Otherwise, it + # defaults to a non-vectorized copy + # /////////////////////////////////////////////////////////////////////////////// + + tA = cute.make_layout( + (self._num_threads // self._bK, self._bK), stride=(self._bK, 1) + ) + tB = cute.make_layout( + (self._num_threads // self._bK, self._bK), stride=(self._bK, 1) + ) + vA = cute.make_layout((1, 1)) + vB = cute.make_layout((1, 1)) + atom_async_copy_A = cute.make_copy_atom( + cute.nvgpu.cpasync.CopyG2SOp(), + mA.element_type, + num_bits_per_copy=mA.element_type.width, + ) + atom_async_copy_B = cute.make_copy_atom( + cute.nvgpu.cpasync.CopyG2SOp(), + mA.element_type, + num_bits_per_copy=mB.element_type.width, + ) + if cutlass.const_expr(self.a_major_mode == utils.LayoutEnum.COL_MAJOR): + num_vectorized = 4 if (mA.layout[0].max_alignment % 16 == 0) else 1 + atom_async_copy_A = cute.make_copy_atom( + cute.nvgpu.cpasync.CopyG2SOp(), + mA.element_type, + num_bits_per_copy=mA.element_type.width * num_vectorized, + ) + major_mode_size = self._bM // num_vectorized + tA = cute.make_layout( + (major_mode_size, self._num_threads // major_mode_size), + stride=(1, major_mode_size), + ) + vA = cute.make_layout((num_vectorized, 1)) + + if cutlass.const_expr(self.b_major_mode == utils.LayoutEnum.COL_MAJOR): + num_vectorized = 4 if (mB.layout[0].max_alignment % 16 == 0) else 1 + atom_async_copy_B = cute.make_copy_atom( + cute.nvgpu.cpasync.CopyG2SOp(), + mA.element_type, + num_bits_per_copy=mB.element_type.width * num_vectorized, + ) + major_mode_size = self._bN // num_vectorized + tB = cute.make_layout( + (major_mode_size, self._num_threads // major_mode_size), + stride=(1, major_mode_size), + ) + vB = cute.make_layout((num_vectorized, 1)) + + tiled_copy_A = cute.make_tiled_copy_tv(atom_async_copy_A, tA, vA) + tiled_copy_B = cute.make_tiled_copy_tv(atom_async_copy_B, tB, vB) + + # /////////////////////////////////////////////////////////////////////////////// + # Create layouts for GEMM: + # We tile an MMA atom across a tensor. `atoms_layout` is the layout + # of atoms in the tiled MMA. (Because we use an `MmaUniversalOp`, + # which has a trivial 1x1x1 MMA trait, `atoms_layout` is also + # simply the thread layout for C.) `permutation_tiler` reorders the + # elements of the tensor that the tiled MMA is applied to. + # Different combinations of `atoms_layout` and `permutation_tiler` + # values can create different MMA thread-value patterns. + # + # Here, the MMA layout is set so that each thread copies four + # consecutive elements from shared memory to registers. + # `permutation_tiler_M/N` maps the elements handled by each thread + # to the permuted element in the tensor. + # For increasing indices in the tensor, the thread ID that reads it is: + # - (without permutation) ==> + # 0 1 2 ... 15 0 1 2 ... 15 0 1 2 ... 15 0 1 2 ... 15 ...... + # - (with permutation) ==> + # 0 0 0 0 1 1 1 1 2 2 2 2 ... 15 15 15 15 0 0 0 0 1 1 1 1 ...... + # /////////////////////////////////////////////////////////////////////////////// + atoms_layout = cute.make_layout( + (self._num_threads // 16, 16, 1), stride=(16, 1, 0) + ) + if cutlass.const_expr(self.c_major_mode == utils.LayoutEnum.COL_MAJOR): + atoms_layout = cute.make_layout( + (16, self._num_threads // 16, 1), stride=(1, 16, 0) + ) + op = cute.nvgpu.MmaUniversalOp(cutlass.Float32) + permutation_tiler_M = cute.make_layout( + (atoms_layout.shape[0], 4), stride=(4, 1) + ) + permutation_tiler_N = cute.make_layout( + (atoms_layout.shape[1], 4), stride=(4, 1) + ) + tiled_mma = cute.make_tiled_mma( + op, + atoms_layout, + permutation_mnk=(permutation_tiler_M, permutation_tiler_N, None), + ) + + # grid_dim: ((m + BLK_M - 1) // BLK_M, (n + BLK_N - 1) // BLK_N, 1) + grid_dim = *cute.ceil_div(mC.shape, (self._bM, self._bN)), 1 + + self.kernel( + mA, + mB, + mC, + sA_layout, + sB_layout, + tiled_copy_A, + tiled_copy_B, + tiled_mma, + epilogue_op, + ).launch( + grid=grid_dim, + block=[cute.size(atoms_layout), 1, 1], + stream=stream, + ) + + @cute.kernel + def kernel( + self, + mA: cute.Tensor, + mB: cute.Tensor, + mC: cute.Tensor, + sA_layout: cute.Layout, + sB_layout: cute.Layout, + tiled_copy_A: cute.TiledCopy, + tiled_copy_B: cute.TiledCopy, + tiled_mma: cute.TiledMma, + epilogue_op: cutlass.Constexpr = lambda x: x, + ): + # Thread and block indices + tidx, tidy, tidz = cute.arch.thread_idx() + bidx, bidy, bidz = cute.arch.block_idx() + tiler_coord = (bidx, bidy, None) + thr_mma = tiled_mma.get_slice(tidx) + + # /////////////////////////////////////////////////////////////////////////////// + # Get the appropriate tiles for this thread block. + # gA: (BLK_M, BLK_K, k), gB: (BLK_N, BLK_K, k), gC: (BLK_M, BLK_N) + # /////////////////////////////////////////////////////////////////////////////// + gA = cute.local_tile( + mA, tiler=self._cta_tiler, coord=tiler_coord, proj=(1, None, 1) + ) + gB = cute.local_tile( + mB, tiler=self._cta_tiler, coord=tiler_coord, proj=(None, 1, 1) + ) + gC = cute.local_tile( + mC, tiler=self._cta_tiler, coord=tiler_coord, proj=(1, 1, None) + ) + + # Move the pointer of gA/gB in the `-k`` direction, making the first + # tile (instead of the last one) irregular in shape when k is irregular. + # We first handle the irregular tile to avoid checking for this + # condition within the mainloop. + residue_k = mA.shape[1] - self._bK * gA.shape[2] + gA = cute.domain_offset((0, residue_k, 0), gA) + gB = cute.domain_offset((0, residue_k, 0), gB) + + # /////////////////////////////////////////////////////////////////////////////// + # Get the appropriate tiles for this thread. + # sA: (BLK_M, BLK_K, PIPE) , sB: (BLK_N, BLK_K, PIPE) + # tAgA: (CPY, CPY_M, CPY_K, k) , tBgB: (CPY, CPY_N, CPY_K, k) + # tAsA: (CPY, CPY_M, CPY_K, PIPE) , tBsB: (CPY, CPY_N, CPY_K, PIPE) + # /////////////////////////////////////////////////////////////////////////////// + # Create shared memory buffer + smem = cutlass.utils.SmemAllocator() + sA = smem.allocate_tensor(mA.element_type, sA_layout, 16) + sB = smem.allocate_tensor(mB.element_type, sB_layout, 16) + thr_copy_A = tiled_copy_A.get_slice(tidx) + thr_copy_B = tiled_copy_B.get_slice(tidx) + tAgA = thr_copy_A.partition_S(gA) + tAsA = thr_copy_A.partition_D(sA) + tBgB = thr_copy_B.partition_S(gB) + tBsB = thr_copy_B.partition_D(sB) + + # /////////////////////////////////////////////////////////////////////////////// + # Predicate: Mark indices that need to copy when the problem shape + # isn't a multiple of the tile shape. If tApA/B[i] is 0, then do not + # do the copy atom associated with index i. + # cA: (BLK_M, BLK_K) => (blk_m, blk_k) + # cB: (BLK_N, BLK_K) => (blk_n, blk_k) + # tAcA: (CPY, CPY_M, CPY_K) => (blk_m, blk_k) + # tBcB: (CPY, CPY_N, CPY_K) => (blk_n, blk_k) + # tApA: (rest_v, CPY_M, CPY_K), stride=(..., ..., 0) + # tBpB: (rest_v, CPY_N, CPY_K), stride=(..., ..., 0) + # CPY = (atom_v, rest_v) + # /////////////////////////////////////////////////////////////////////////////// + # Construct identity layout for sA and sB, used for predication + mcA = cute.make_identity_tensor(mA.shape) + mcB = cute.make_identity_tensor(mB.shape) + cA = cute.local_tile( + mcA, tiler=self._cta_tiler, coord=tiler_coord, proj=(1, None, 1) + ) + cB = cute.local_tile( + mcB, tiler=self._cta_tiler, coord=tiler_coord, proj=(None, 1, 1) + ) + cA = cute.domain_offset((0, residue_k, 0), cA) + cB = cute.domain_offset((0, residue_k, 0), cB) + # Repeat the partitioning with identity layouts + tAcA = thr_copy_A.partition_S(cA) + tBcB = thr_copy_B.partition_S(cB) + # Allocate predicate tensors for m and n + tApA = cute.make_rmem_tensor( + cute.make_layout( + ( + tAsA.shape[0][1], + cute.size(tAsA, mode=[1]), + cute.size(tAsA, mode=[2]), + ), + stride=(cute.size(tAsA, mode=[1]), 1, 0), + ), + cutlass.Boolean, + ) + tBpB = cute.make_rmem_tensor( + cute.make_layout( + ( + tBsB.shape[0][1], + cute.size(tBsB, mode=[1]), + cute.size(tBsB, mode=[2]), + ), + stride=(cute.size(tBsB, mode=[1]), 1, 0), + ), + cutlass.Boolean, + ) + # Allocate predicate tensors for m, n and k for residue k-tile + tApA_residue_k = cute.make_rmem_tensor( + cute.make_layout( + ( + tAsA.shape[0][1], + cute.size(tAsA, mode=[1]), + cute.size(tAsA, mode=[2]), + ), + stride=( + cute.size(tAsA, mode=[1]) * cute.size(tAsA, mode=[2]), + cute.size(tAsA, mode=[2]), + 1, + ), + ), + cutlass.Boolean, + ) + tBpB_residue_k = cute.make_rmem_tensor( + cute.make_layout( + ( + tBsB.shape[0][1], + cute.size(tBsB, mode=[1]), + cute.size(tBsB, mode=[2]), + ), + stride=( + cute.size(tBsB, mode=[1]) * cute.size(tBsB, mode=[2]), + cute.size(tBsB, mode=[2]), + 1, + ), + ), + cutlass.Boolean, + ) + # Set predicates for m/n bounds for mainloop + for rest_v in range(tApA.shape[0]): + for m in range(tApA.shape[1]): + tApA[rest_v, m, 0] = cute.elem_less( + tAcA[(0, rest_v), m, 0, 0][0], mA.shape[0] + ) + for rest_v in range(tBpB.shape[0]): + for n in range(tBpB.shape[1]): + tBpB[rest_v, n, 0] = cute.elem_less( + tBcB[(0, rest_v), n, 0, 0][0], mB.shape[0] + ) + + # Set predicates for m/n/k bounds for residue k tile + for rest_v in range(tApA_residue_k.shape[0]): + for m in range(tApA_residue_k.shape[1]): + for k in range(tApA_residue_k.shape[2]): + coord_A = tAcA[(0, rest_v), m, k, 0] + tApA_residue_k[rest_v, m, k] = cute.elem_less( + (coord_A[0], cutlass.Int32(-1)), (mA.shape[0], coord_A[1]) + ) + for rest_v in range(tBpB_residue_k.shape[0]): + for n in range(tBpB_residue_k.shape[1]): + for k in range(tBpB_residue_k.shape[2]): + coord_B = tBcB[(0, rest_v), n, k, 0] + tBpB_residue_k[rest_v, n, k] = cute.elem_less( + (coord_B[0], cutlass.Int32(-1)), (mB.shape[0], coord_B[1]) + ) + + # /////////////////////////////////////////////////////////////////////////////// + # Prefetch Prologue + # /////////////////////////////////////////////////////////////////////////////// + # Start async loads for 0th k-tile, where we take care of the k-residue + k_pipe_max = cute.size(tAsA, mode=[3]) + k_tile_count = cute.size(tAgA, mode=[3]) + gmem_pipe_read = cutlass.Int32(0) + cute.copy( + tiled_copy_A, + tAgA[None, None, None, gmem_pipe_read], + tAsA[None, None, None, 0], + pred=tApA_residue_k, + ) + cute.copy( + tiled_copy_B, + tBgB[None, None, None, gmem_pipe_read], + tBsB[None, None, None, 0], + pred=tBpB_residue_k, + ) + cute.arch.cp_async_commit_group() + gmem_pipe_read = ( + gmem_pipe_read + 1 + if gmem_pipe_read + 1 < k_tile_count + else cutlass.Int32(0) + ) + # Start async loads for 1st k-tile onwards, no k-residue handling needed + for k_tile in range(1, k_pipe_max - 1): + if k_tile < k_tile_count: + cute.copy( + tiled_copy_A, + tAgA[None, None, None, gmem_pipe_read], + tAsA[None, None, None, k_tile], + pred=tApA, + ) + cute.copy( + tiled_copy_B, + tBgB[None, None, None, gmem_pipe_read], + tBsB[None, None, None, k_tile], + pred=tBpB, + ) + + gmem_pipe_read = ( + gmem_pipe_read + 1 + if gmem_pipe_read + 1 < k_tile_count + else cutlass.Int32(0) + ) + cute.arch.cp_async_commit_group() + + # all tiles have been copied from global memory, so clear the + # predicate tensor + if k_tile_count < k_pipe_max: + for rest_v in range(tApA.shape[0]): + for m in range(tApA.shape[1]): + tApA[rest_v, m, 0] = cutlass.Boolean(0) + for rest_v in range(tBpB.shape[0]): + for n in range(tBpB.shape[1]): + tBpB[rest_v, n, 0] = cutlass.Boolean(0) + + # /////////////////////////////////////////////////////////////////////////////// + # Define A/B partitioning and C accumulators. + # /////////////////////////////////////////////////////////////////////////////// + tCsA = thr_mma.partition_A(sA) + tCsB = thr_mma.partition_B(sB) + tCgC = thr_mma.partition_C(gC) + tCrA = tiled_mma.make_fragment_A(tCsA[None, None, None, 0]) + tCrB = tiled_mma.make_fragment_B(tCsB[None, None, None, 0]) + tCrC = tiled_mma.make_fragment_C(tCgC) + # Clear the accumulator + tCrC.fill(0.0) + + # Current pipe index in smem to read from / write to + smem_pipe_read = cutlass.Int32(0) + smem_pipe_write = cutlass.Int32(k_pipe_max - 1) + + tCsA_p = tCsA[None, None, None, smem_pipe_read] + tCsB_p = tCsB[None, None, None, smem_pipe_read] + + # /////////////////////////////////////////////////////////////////////////////// + # PREFETCH register pipeline + # /////////////////////////////////////////////////////////////////////////////// + k_block_max = cute.size(tCrA, mode=[2]) + + if k_block_max > 1: + # Wait until our first prefetched tile is loaded in + cute.arch.cp_async_wait_group(k_pipe_max - 2) + self.cta_sync_barrier.arrive_and_wait() + # Prefetch the first rmem from the first k-tile + cute.autovec_copy(tCsA_p[None, None, 0], tCrA[None, None, 0]) + cute.autovec_copy(tCsB_p[None, None, 0], tCrB[None, None, 0]) + + # /////////////////////////////////////////////////////////////////////////////// + # Mainloop + # 1. Shared memory pipeline (gmem -> smem): + # The default smem pipeline depth is 3, meaning that for shared + # memory buffers, we allocate three times the size described by the + # CTA tiler. We prefetch 2 of these buffers before entering the main + # loop. Considering only the transfer from global memory to shared + # memory, the general structure of the mainloop is: + # (1) copy k-tile from gmem to smem; + # (2) perform gemm computation on k-tile; + # (3) wait for the next copy to finish. + # The `cute.arch.cp_async_wait_group(num_smem_stages - 2)` command + # waits for the number of unfinished 'copy' to be <= 1. The advantage + # of this approach is that it allows for simultaneous production + # (i.e., step (1)) and consumption (i.e., step (2)) of smem. + # A common misconception is to prefetch N buffers and rewrite + # the pipeline logic to wait on N-1 pending copies. The disadvantage + # of this approach is that it requires fully consuming a buffer in + # order to open an empty buffer for the next copy. + # 2. Register pipeline (smem -> register): + # Similarly, the register pipeline produces i+1, consumes i, and + # produces i+2... Notably, i and i+1 do not use the same register, + # eliminating dependencies on the same register for better parallelism. + # 3. Combining the smem and register pipelines results in the mainloop. + # /////////////////////////////////////////////////////////////////////////////// + + for _ in range(k_tile_count): + for k_block in range(k_block_max, unroll_full=True): + if k_block == k_block_max - 1: + tCsA_p = tCsA[None, None, None, smem_pipe_read] + tCsB_p = tCsB[None, None, None, smem_pipe_read] + cute.arch.cp_async_wait_group(k_pipe_max - 2) + self.cta_sync_barrier.arrive_and_wait() + + # Load A, B from shared memory to registers for k_block + 1 + k_block_next = (k_block + 1) % k_block_max # static + cute.autovec_copy( + tCsA_p[None, None, k_block_next], + tCrA[None, None, k_block_next], + ) + cute.autovec_copy( + tCsB_p[None, None, k_block_next], + tCrB[None, None, k_block_next], + ) + + # Fetch next A: To better interleave global memory access and + # compute instructions, we intentionally use the sequence: + # copy A, perform GEMM, then copy B. + if k_block == 0: + cute.copy( + tiled_copy_A, + tAgA[None, None, None, gmem_pipe_read], + tAsA[None, None, None, smem_pipe_write], + # Use predicates because the m-mode may be irregular + pred=tApA, + ) + + # Thread-level register gemm for k_block + cute.gemm( + tiled_mma, + tCrC, + tCrA[None, None, k_block], + tCrB[None, None, k_block], + tCrC, + ) + + # Fetch next B and update smem pipeline read/write + if k_block == 0: + cute.copy( + tiled_copy_B, + tBgB[None, None, None, gmem_pipe_read], + tBsB[None, None, None, smem_pipe_write], + # Use predicates because the n-mode may be irregular + pred=tBpB, + ) + cute.arch.cp_async_commit_group() + smem_pipe_write = smem_pipe_read + smem_pipe_read = smem_pipe_read + 1 + if smem_pipe_read == k_pipe_max: + smem_pipe_read = cutlass.Int32(0) + # After copying all tiles, we avoid clearing the predicate + # tensor in the `mainloop` to prevent increasing its + # instruction count. Instead, we continue copying the + # first tile, though it won't be used. The 0-th tile is not + # copied due to its irregular shape, which could lead to + # illegal memory accesses. + gmem_pipe_read = ( + gmem_pipe_read + 1 + if gmem_pipe_read + 1 < k_tile_count + else cutlass.Int32(1) + ) + + # /////////////////////////////////////////////////////////////////////////////// + # Epilogue + # Applies the epilogue operation to the accumulated results and copies + # them without vectorization. + # /////////////////////////////////////////////////////////////////////////////// + cute.arch.cp_async_wait_group(0) + self.cta_sync_barrier.arrive_and_wait() + tCrC.store(epilogue_op(tCrC.load())) + + # predicate + cC = cute.make_identity_tensor(gC.shape) + tCpC = thr_mma.partition_C(cC) + predC = cute.make_rmem_tensor(tCrC.layout, cutlass.Boolean) + residue_m = mC.shape[0] - cutlass.Int32(self._bM) * bidx + residue_n = mC.shape[1] - cutlass.Int32(self._bN) * bidy + for i in range(cute.size(tCrC.shape)): + predC[i] = cute.elem_less(tCpC[i], (residue_m, residue_n)) + atom = cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), mC.element_type) + cute.copy(atom, tCrC, tCgC, pred=predC) + return + + def cutlass_gemm(state: bench.State) -> None: n = state.get_int64("N") r = state.get_int64("R") - alpha = state.get_float64("alpha") - - dt = np.float64 + dt = np.float32 A_h = np.random.randn(n, r).astype(dt) B_h = np.copy(A_h.mT) C_h = np.eye(n, dtype=dt) - D_h = np.zeros_like(C_h) if n >= 1024: # allow more time for large inputs @@ -71,34 +623,21 @@ def cutlass_gemm(state: bench.State) -> None: A_d = core.DeviceMemoryResource(dev_id).allocate(A_h.nbytes, core_s) B_d = core.DeviceMemoryResource(dev_id).allocate(B_h.nbytes, core_s) C_d = core.DeviceMemoryResource(dev_id).allocate(C_h.nbytes, core_s) - D_d = core.DeviceMemoryResource(dev_id).allocate(D_h.nbytes, core_s) driver.cuMemcpyAsync(A_d.handle, A_h.ctypes.data, A_h.nbytes, s) driver.cuMemcpyAsync(B_d.handle, B_h.ctypes.data, B_h.nbytes, s) driver.cuMemcpyAsync(C_d.handle, C_h.ctypes.data, C_h.nbytes, s) - driver.cuMemcpyAsync(D_d.handle, D_h.ctypes.data, D_h.nbytes, s) - A_cp = make_cp_array(A_h, A_d, dev_id) - B_cp = make_cp_array(B_h, B_d, dev_id) - C_cp = make_cp_array(C_h, C_d, dev_id) - D_cp = make_cp_array(D_h, D_d, dev_id) + A_cp = make_tensor(A_h, A_d, dev_id) + B_cp = make_tensor(B_h, B_d, dev_id) + C_cp = make_tensor(C_h, C_d, dev_id) - plan = cutlass.op.Gemm( - A=A_cp, - B=B_cp, - C=C_cp, - D=D_cp, - element=dt, - alpha=alpha, - beta=1, - layout=cutlass.LayoutType.RowMajor, - ) - # warm-up to ensure compilation is not timed - plan.run(stream=s) + sgemm = SGemm() + _ = sgemm(A_cp, B_cp, C_cp, stream=s) def launcher(launch: bench.Launch) -> None: s = as_bindings_Stream(launch.get_stream()) - plan.run(stream=s, sync=False) + sgemm(A_cp, B_cp, C_cp, stream=s) state.exec(launcher) @@ -108,6 +647,4 @@ if __name__ == "__main__": gemm_b.add_int64_axis("R", [16, 64, 256]) gemm_b.add_int64_axis("N", [256, 512, 1024, 2048]) - gemm_b.add_float64_axis("alpha", [1e-2]) - bench.run_all_benchmarks(sys.argv) From 93bc59d05c7d073a16e6f24d01ebfb4fbd92c744 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Wed, 1 Apr 2026 08:24:29 -0500 Subject: [PATCH 7/9] Renamed CUTLASS example to reflect that it uses CuteDSL --- python/examples/{cutlass_gemm.py => cute_dsl_sgemm.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename python/examples/{cutlass_gemm.py => cute_dsl_sgemm.py} (100%) diff --git a/python/examples/cutlass_gemm.py b/python/examples/cute_dsl_sgemm.py similarity index 100% rename from python/examples/cutlass_gemm.py rename to python/examples/cute_dsl_sgemm.py From 9f75642387292da7d8c75f009dbbf0238afac1ee Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Thu, 2 Apr 2026 10:29:31 -0500 Subject: [PATCH 8/9] Add patch to cutlass.base_dsl.dsl.BaseDSL to work-around a bug See https://github.com/NVIDIA/cutlass/issues/3142 --- python/examples/cute_dsl_sgemm.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/python/examples/cute_dsl_sgemm.py b/python/examples/cute_dsl_sgemm.py index 61ac2b3..eae3758 100644 --- a/python/examples/cute_dsl_sgemm.py +++ b/python/examples/cute_dsl_sgemm.py @@ -642,7 +642,24 @@ def cutlass_gemm(state: bench.State) -> None: state.exec(launcher) +def patch_cute_dsl(): + def _no_op_diagnostic(self): + return + + try: + import cutlass.base_dsl.dsl as dsl_m + + base_dsl_k = dsl_m.BaseDSL + if hasattr(base_dsl_k, "diagnostic"): + base_dsl_k.diagnostic = _no_op_diagnostic + except (ModuleNotFoundError, AttributeError): + pass + + if __name__ == "__main__": + # see https://github.com/NVIDIA/cutlass/issues/3142 + patch_cute_dsl() + gemm_b = bench.register(cutlass_gemm) gemm_b.add_int64_axis("R", [16, 64, 256]) gemm_b.add_int64_axis("N", [256, 512, 1024, 2048]) From 39730efbc34dae1ff51ec675c69c1c7396d2ace9 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Thu, 2 Apr 2026 10:37:17 -0500 Subject: [PATCH 9/9] Update requirements to reflect packages used by examples --- python/examples/requirements.txt | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/python/examples/requirements.txt b/python/examples/requirements.txt index 868fcc6..bc17464 100644 --- a/python/examples/requirements.txt +++ b/python/examples/requirements.txt @@ -1,8 +1,9 @@ numpy numba -cupy -nvidia-cutlass -cuda-cccl -cuda-core cuda-bindings +cuda-core numba-cuda +cuda-cccl +cupy +nvidia-cute-dsl[cu13] +torch[cu13]