From 7c60edcc0a136cf836963262a8f9aad9960d5f7b Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Wed, 1 Apr 2026 08:16:04 -0500
Subject: [PATCH 1/9] cuda.core.experimental -> cuda.core

---
 python/examples/axes.py          | 2 +-
 python/examples/cpu_activity.py  | 2 +-
 python/examples/exec_tag_sync.py | 2 +-
 python/examples/skip.py          | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/examples/axes.py b/python/examples/axes.py
index ce67238..fa7c6c4 100644
--- a/python/examples/axes.py
+++ b/python/examples/axes.py
@@ -20,7 +20,7 @@ from typing import Dict, Optional, Tuple
 
 import cuda.bench as bench
 import cuda.cccl.headers as headers
-import cuda.core.experimental as core
+import cuda.core as core
 
 
 def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
diff --git a/python/examples/cpu_activity.py b/python/examples/cpu_activity.py
index a492ff7..df1efae 100644
--- a/python/examples/cpu_activity.py
+++ b/python/examples/cpu_activity.py
@@ -19,7 +19,7 @@ import time
 
 import cuda.bench as bench
 import cuda.cccl.headers as headers
-import cuda.core.experimental as core
+import cuda.core as core
 
 host_sleep_duration = 0.1
 
diff --git a/python/examples/exec_tag_sync.py b/python/examples/exec_tag_sync.py
index b9ab5ef..8d57311 100644
--- a/python/examples/exec_tag_sync.py
+++ b/python/examples/exec_tag_sync.py
@@ -20,7 +20,7 @@ from typing import Optional
 
 import cuda.bench as bench
 import cuda.cccl.headers as headers
-import cuda.core.experimental as core
+import cuda.core as core
 
 
 def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
diff --git a/python/examples/skip.py b/python/examples/skip.py
index cf7ec90..f720003 100644
--- a/python/examples/skip.py
+++ b/python/examples/skip.py
@@ -18,7 +18,7 @@ import sys
 
 import cuda.bench as bench
 import cuda.cccl.headers as headers
-import cuda.core.experimental as core
+import cuda.core as core
 
 
 def as_core_Stream(cs: bench.CudaStream) -> core.Stream:

From 974eb5ee0fa26645fa007574c68278628fd8a0e5 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Wed, 1 Apr 2026 08:17:12 -0500
Subject: [PATCH 2/9] Replace use of cupy.cuda.ExternalStream with
 cupy.cuda.Stream.from_external

---
 python/examples/cupy_extract.py | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/python/examples/cupy_extract.py b/python/examples/cupy_extract.py
index 091141c..d7f2a01 100644
--- a/python/examples/cupy_extract.py
+++ b/python/examples/cupy_extract.py
@@ -20,21 +20,16 @@ import cuda.bench as bench
 import cupy as cp
 
 
-def as_cp_ExternalStream(
-    cs: bench.CudaStream, dev_id: int | None = -1
-) -> cp.cuda.ExternalStream:
-    h = cs.addressof()
-    return cp.cuda.ExternalStream(h, dev_id)
+def as_cp_ExternalStream(cs: bench.CudaStream):
+    return cp.cuda.Stream.from_external(cs)  # pass the stream itself, not a handle
 
 
 def cupy_extract_by_mask(state: bench.State):
     n_cols = state.get_int64("numCols")
     n_rows = state.get_int64("numRows")
 
-    dev_id = state.get_device()
-    cp_s = as_cp_ExternalStream(state.get_stream(), dev_id)
+    cp_s = as_cp_ExternalStream(state.get_stream())
 
-    state.collect_cupti_metrics()
     state.add_element_count(n_rows * n_cols, "# Elements")
     int32_dt = cp.dtype(cp.int32)
     bool_dt = cp.dtype(cp.bool_)
@@ -49,7 +44,7 @@ def cupy_extract_by_mask(state: bench.State):
         _ = X[mask]
 
     def launcher(launch: bench.Launch):
-        with as_cp_ExternalStream(launch.get_stream(), dev_id):
+        with as_cp_ExternalStream(launch.get_stream()):
             _ = X[mask]
 
     state.exec(launcher, sync=True)

From d8739fc208bcbae327e8e2fa5848500baad76a84 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Wed, 1 Apr 2026 08:17:52 -0500
Subject: [PATCH 3/9] Update to cccl_cooperative_block_reduce example

---
 python/examples/cccl_cooperative_block_reduce.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/examples/cccl_cooperative_block_reduce.py b/python/examples/cccl_cooperative_block_reduce.py
index 0d5d970..c0f0138 100644
--- a/python/examples/cccl_cooperative_block_reduce.py
+++ b/python/examples/cccl_cooperative_block_reduce.py
@@ -17,7 +17,7 @@
 import sys
 
 import cuda.bench as bench
-import cuda.cccl.cooperative.experimental as coop
+import cuda.coop as coop
 import numba
 import numpy as np
 from numba import cuda

From 5bdb30f4b606779a3dadd4edad385f4e49d52443 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Wed, 1 Apr 2026 08:18:15 -0500
Subject: [PATCH 4/9] Update to cccl_parallel_segmented_reduce example per
 changes in API

Update namespace changes. Use make_segmented_reduce factory function,
and update call signatures.
---
 .../cccl_parallel_segmented_reduce.py         | 55 ++++++++-----------
 1 file changed, 22 insertions(+), 33 deletions(-)

diff --git a/python/examples/cccl_parallel_segmented_reduce.py b/python/examples/cccl_parallel_segmented_reduce.py
index e54a77b..d2140b9 100644
--- a/python/examples/cccl_parallel_segmented_reduce.py
+++ b/python/examples/cccl_parallel_segmented_reduce.py
@@ -17,36 +17,20 @@
 import sys
 
 import cuda.bench as bench
-import cuda.cccl.parallel.experimental.algorithms as algorithms
-import cuda.cccl.parallel.experimental.iterators as iterators
-import cuda.core.experimental as core
+import cuda.compute.algorithms as algorithms
+import cuda.compute.iterators as iterators
+import cuda.core as core
 import cupy as cp
 import numpy as np
-
-
-class CCCLStream:
-    "Class to work around https://github.com/NVIDIA/cccl/issues/5144"
-
-    def __init__(self, ptr):
-        self._ptr = ptr
-
-    def __cuda_stream__(self):
-        return (0, self._ptr)
+from cuda.compute import OpKind
 
 
 def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
     return core.Stream.from_handle(cs.addressof())
 
 
-def as_cccl_Stream(cs: bench.CudaStream) -> CCCLStream:
-    return CCCLStream(cs.addressof())
-
-
-def as_cp_ExternalStream(
-    cs: bench.CudaStream, dev_id: int | None = -1
-) -> cp.cuda.ExternalStream:
-    h = cs.addressof()
-    return cp.cuda.ExternalStream(h, dev_id)
+def as_cp_ExternalStream(cs: bench.CudaStream) -> cp.cuda.ExternalStream:
+    return cp.cuda.Stream.from_external(cs)  # NOTE(review): confirm return type
 
 
 def segmented_reduce(state: bench.State):
@@ -56,13 +40,8 @@ def segmented_reduce(state: bench.State):
     n_rows = n_elems // n_cols
 
     state.add_summary("numRows", n_rows)
-    state.collect_cupti_metrics()
 
-    dev_id = state.get_device()
-    cp_stream = as_cp_ExternalStream(state.get_stream(), dev_id)
-
-    def add_op(a, b):
-        return a + b
+    cp_stream = as_cp_ExternalStream(state.get_stream())
 
     def make_scaler(step):
         def scale(row_id):
@@ -85,15 +64,24 @@ def segmented_reduce(state: bench.State):
         d_input = mat
         d_output = cp.empty(n_rows, dtype=d_input.dtype)
 
-    alg = algorithms.segmented_reduce(
+    add_op = OpKind.PLUS
+
+    alg = algorithms.make_segmented_reduce(
         d_input, d_output, start_offsets, end_offsets, add_op, h_init
     )
 
-    cccl_stream = as_cccl_Stream(state.get_stream())
-
+    cccl_stream = state.get_stream()
     # query size of temporary storage and allocate
     temp_nbytes = alg(
-        None, d_input, d_output, n_rows, start_offsets, end_offsets, h_init, cccl_stream
+        None,
+        d_input,
+        d_output,
+        add_op,
+        n_rows,
+        start_offsets,
+        end_offsets,
+        h_init,
+        cccl_stream,
     )
     h_init = np.zeros(tuple(), dtype=np.int32)
 
@@ -101,11 +89,12 @@ def segmented_reduce(state: bench.State):
         temp_storage = cp.empty(temp_nbytes, dtype=cp.uint8)
 
     def launcher(launch: bench.Launch):
-        s = as_cccl_Stream(launch.get_stream())
+        s = launch.get_stream()
         alg(
             temp_storage,
             d_input,
             d_output,
+            add_op,
             n_rows,
             start_offsets,
             end_offsets,

From 3f284b4004b44cc05609a5f48f0a527bb32f613f Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Wed, 1 Apr 2026 08:20:20 -0500
Subject: [PATCH 5/9] Renamed cccl_* examples

cccl_parallel_* -> cuda_compute_*
cccl_cooperative_* -> cuda_coop_*
---
 ...allel_segmented_reduce.py => cuda_compute_segmented_reduce.py} | 0
 ...cccl_cooperative_block_reduce.py => cuda_coop_block_reduce.py} | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename python/examples/{cccl_parallel_segmented_reduce.py => cuda_compute_segmented_reduce.py} (100%)
 rename python/examples/{cccl_cooperative_block_reduce.py => cuda_coop_block_reduce.py} (100%)

diff --git a/python/examples/cccl_parallel_segmented_reduce.py b/python/examples/cuda_compute_segmented_reduce.py
similarity index 100%
rename from python/examples/cccl_parallel_segmented_reduce.py
rename to python/examples/cuda_compute_segmented_reduce.py
diff --git a/python/examples/cccl_cooperative_block_reduce.py b/python/examples/cuda_coop_block_reduce.py
similarity index 100%
rename from python/examples/cccl_cooperative_block_reduce.py
rename to python/examples/cuda_coop_block_reduce.py

From e4cfddeb878577cf2dd2b6bb0658603ab7592839 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Wed, 1 Apr 2026 08:23:41 -0500
Subject: [PATCH 6/9] Rewrote cutlass_gemm example to use CuteDSL

---
 python/examples/cutlass_gemm.py | 603 ++++++++++++++++++++++++++++++--
 1 file changed, 570 insertions(+), 33 deletions(-)

diff --git a/python/examples/cutlass_gemm.py b/python/examples/cutlass_gemm.py
index cd62f39..61ac2b3 100644
--- a/python/examples/cutlass_gemm.py
+++ b/python/examples/cutlass_gemm.py
@@ -16,13 +16,18 @@
 
 
 import sys
+from typing import Tuple
 
 import cuda.bench as bench
 import cuda.bindings.driver as driver
-import cuda.core.experimental as core
+import cuda.core as core
 import cupy as cp
 import cutlass
+import cutlass.cute as cute
+import cutlass.pipeline as pipeline
+import cutlass.utils as utils
 import numpy as np
+from cutlass.cute.runtime import from_dlpack
 
 
 def as_bindings_Stream(cs: bench.CudaStream) -> driver.CUstream:
@@ -33,31 +38,578 @@ def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
     return core.Stream.from_handle(cs.addressof())
 
 
-def make_cp_array(
-    arr_h: np.ndarray, dev_buf: core.Buffer, dev_id: int | None
-) -> cp.ndarray:
+def make_tensor(arr_h: np.ndarray, dev_buf: core.Buffer, dev_id: int | None):
     cp_memview = cp.cuda.UnownedMemory(
         int(dev_buf.handle), dev_buf.size, dev_buf, -1 if dev_id is None else dev_id
     )
     zero_offset = 0
-    return cp.ndarray(
-        arr_h.shape,
-        dtype=arr_h.dtype,
-        memptr=cp.cuda.MemoryPointer(cp_memview, zero_offset),
+    return from_dlpack(
+        cp.ndarray(
+            arr_h.shape,
+            dtype=arr_h.dtype,
+            memptr=cp.cuda.MemoryPointer(cp_memview, zero_offset),
+        ),
+        assumed_align=16,  # NOTE(review): presumably bytes; confirm buffer alignment
     )
 
 
+class SGemm:
+    """
+    Adapted from https://github.com/NVIDIA/cutlass/blob/main/examples/python/CuTeDSL/ampere/sgemm.py
+    """
+
+    def __init__(
+        self,
+        cta_tiler: Tuple[int, int, int] = (128, 128, 8),
+        num_stages: int = 3,
+        num_threads: int = 256,
+    ):
+        self._cta_tiler = cta_tiler  # unpacked below as (bM, bN, bK)
+        self._num_stages = num_stages  # last extent of the sA/sB layouts in __call__
+        self._num_threads = num_threads
+        assert num_threads > 0, "needs at least one thread"
+        assert num_threads % 16 == 0, "multiples of 16 required for MMA thread layout"
+
+        self._bM, self._bN, self._bK = self._cta_tiler
+        assert self._bM % 16 == 0, "multiple of 16 required for tile dimension M"
+        assert self._bN % 16 == 0, "multiple of 16 required for tile dimension N"
+        assert self._num_stages >= 3, "num_stages must be greater than or equal to 3"
+        self.cta_sync_barrier = pipeline.NamedBarrier(  # waited on inside kernel()
+            barrier_id=1, num_threads=num_threads
+        )
+
+    @cute.jit
+    def __call__(
+        self,
+        mA: cute.Tensor,
+        mB: cute.Tensor,
+        mC: cute.Tensor,
+        epilogue_op: cutlass.Constexpr = lambda x: x,
+        stream: driver.CUstream = driver.CUstream(
+            driver.CUstream_flags.CU_STREAM_DEFAULT
+        ),
+    ):
+        self.a_major_mode = utils.LayoutEnum.from_tensor(mA)
+        self.b_major_mode = utils.LayoutEnum.from_tensor(mB)
+        self.c_major_mode = utils.LayoutEnum.from_tensor(mC)
+
+        # ///////////////////////////////////////////////////////////////////////////////
+        # Create layouts for shared memory for A and B:
+        #   - sA/sB is m/n-major to vectorize copies from shared
+        #       memory to registers. This is because the MMA layouts
+        #       for sA/sB are also m/n-major
+        #   - When gA/gB is k-major, pad 4 elements to reduce bank conflicts
+        # ///////////////////////////////////////////////////////////////////////////////
+
+        padding_a = 4 if self.a_major_mode == utils.LayoutEnum.ROW_MAJOR else 0
+        padding_b = 4 if self.b_major_mode == utils.LayoutEnum.ROW_MAJOR else 0
+        sA_layout = cute.make_layout(
+            (self._bM, self._bK, self._num_stages),
+            stride=(1, (self._bM + padding_a), self._bK * (self._bM + padding_a)),
+        )
+        sB_layout = cute.make_layout(
+            (self._bN, self._bK, self._num_stages),
+            stride=(1, (self._bN + padding_b), self._bK * (self._bN + padding_b)),
+        )
+
+        # ///////////////////////////////////////////////////////////////////////////////
+        # Create copy layouts that will be used for asynchronous
+        # global memory -> shared memory copies:
+        #   - The majorness of tA/tB follows the majorness of gA/gB
+        #   - For k-major, these layouts will copy values one-by-one
+        #       from global memory, without vectorizing
+        #   - For m/n-major, it will vectorize to a 128bit copy for faster
+        #       data transfer between global and shared memory, as long
+        #       as the alignment of the tensor allows it. Otherwise, it
+        #       defaults to a non-vectorized copy
+        # ///////////////////////////////////////////////////////////////////////////////
+
+        tA = cute.make_layout(
+            (self._num_threads // self._bK, self._bK), stride=(self._bK, 1)
+        )
+        tB = cute.make_layout(
+            (self._num_threads // self._bK, self._bK), stride=(self._bK, 1)
+        )
+        vA = cute.make_layout((1, 1))
+        vB = cute.make_layout((1, 1))
+        atom_async_copy_A = cute.make_copy_atom(
+            cute.nvgpu.cpasync.CopyG2SOp(),
+            mA.element_type,
+            num_bits_per_copy=mA.element_type.width,
+        )
+        atom_async_copy_B = cute.make_copy_atom(
+            cute.nvgpu.cpasync.CopyG2SOp(),
+            mB.element_type,
+            num_bits_per_copy=mB.element_type.width,
+        )
+        if cutlass.const_expr(self.a_major_mode == utils.LayoutEnum.COL_MAJOR):
+            num_vectorized = 4 if (mA.layout[0].max_alignment % 16 == 0) else 1
+            atom_async_copy_A = cute.make_copy_atom(
+                cute.nvgpu.cpasync.CopyG2SOp(),
+                mA.element_type,
+                num_bits_per_copy=mA.element_type.width * num_vectorized,
+            )
+            major_mode_size = self._bM // num_vectorized
+            tA = cute.make_layout(
+                (major_mode_size, self._num_threads // major_mode_size),
+                stride=(1, major_mode_size),
+            )
+            vA = cute.make_layout((num_vectorized, 1))
+
+        if cutlass.const_expr(self.b_major_mode == utils.LayoutEnum.COL_MAJOR):
+            num_vectorized = 4 if (mB.layout[0].max_alignment % 16 == 0) else 1
+            atom_async_copy_B = cute.make_copy_atom(
+                cute.nvgpu.cpasync.CopyG2SOp(),
+                mB.element_type,
+                num_bits_per_copy=mB.element_type.width * num_vectorized,
+            )
+            major_mode_size = self._bN // num_vectorized
+            tB = cute.make_layout(
+                (major_mode_size, self._num_threads // major_mode_size),
+                stride=(1, major_mode_size),
+            )
+            vB = cute.make_layout((num_vectorized, 1))
+
+        tiled_copy_A = cute.make_tiled_copy_tv(atom_async_copy_A, tA, vA)
+        tiled_copy_B = cute.make_tiled_copy_tv(atom_async_copy_B, tB, vB)
+
+        # ///////////////////////////////////////////////////////////////////////////////
+        # Create layouts for GEMM:
+        # We tile an MMA atom across a tensor. `atoms_layout` is the layout
+        # of atoms in the tiled MMA. (Because we use an `MmaUniversalOp`,
+        # which has a trivial 1x1x1 MMA trait, `atoms_layout` is also
+        # simply the thread layout for C.) `permutation_tiler` reorders the
+        # elements of the tensor that the tiled MMA is applied to.
+        # Different combinations of `atoms_layout` and `permutation_tiler`
+        # values can create different MMA thread-value patterns.
+        #
+        # Here, the MMA layout is set so that each thread copies four
+        # consecutive elements from shared memory to registers.
+        # `permutation_tiler_M/N` maps the elements handled by each thread
+        # to the permuted element in the tensor.
+        # For increasing indices in the tensor, the thread ID that reads it is:
+        #   - (without permutation) ==>
+        #      0 1 2 ... 15 0 1 2 ... 15 0 1 2 ... 15 0 1 2 ... 15 ......
+        #   - (with permutation) ==>
+        #      0 0 0 0 1 1 1 1 2 2 2 2 ... 15 15 15 15 0 0 0 0 1 1 1 1 ......
+        # ///////////////////////////////////////////////////////////////////////////////
+        atoms_layout = cute.make_layout(
+            (self._num_threads // 16, 16, 1), stride=(16, 1, 0)
+        )
+        if cutlass.const_expr(self.c_major_mode == utils.LayoutEnum.COL_MAJOR):
+            atoms_layout = cute.make_layout(
+                (16, self._num_threads // 16, 1), stride=(1, 16, 0)
+            )
+        op = cute.nvgpu.MmaUniversalOp(cutlass.Float32)
+        permutation_tiler_M = cute.make_layout(
+            (atoms_layout.shape[0], 4), stride=(4, 1)
+        )
+        permutation_tiler_N = cute.make_layout(
+            (atoms_layout.shape[1], 4), stride=(4, 1)
+        )
+        tiled_mma = cute.make_tiled_mma(
+            op,
+            atoms_layout,
+            permutation_mnk=(permutation_tiler_M, permutation_tiler_N, None),
+        )
+
+        # grid_dim: ((m + BLK_M - 1) // BLK_M, (n + BLK_N - 1) // BLK_N, 1)
+        grid_dim = *cute.ceil_div(mC.shape, (self._bM, self._bN)), 1
+
+        self.kernel(
+            mA,
+            mB,
+            mC,
+            sA_layout,
+            sB_layout,
+            tiled_copy_A,
+            tiled_copy_B,
+            tiled_mma,
+            epilogue_op,
+        ).launch(
+            grid=grid_dim,
+            block=[cute.size(atoms_layout), 1, 1],
+            stream=stream,
+        )
+
+    @cute.kernel
+    def kernel(
+        self,
+        mA: cute.Tensor,
+        mB: cute.Tensor,
+        mC: cute.Tensor,
+        sA_layout: cute.Layout,
+        sB_layout: cute.Layout,
+        tiled_copy_A: cute.TiledCopy,
+        tiled_copy_B: cute.TiledCopy,
+        tiled_mma: cute.TiledMma,
+        epilogue_op: cutlass.Constexpr = lambda x: x,
+    ):
+        # Thread and block indices
+        tidx, tidy, tidz = cute.arch.thread_idx()
+        bidx, bidy, bidz = cute.arch.block_idx()
+        tiler_coord = (bidx, bidy, None)
+        thr_mma = tiled_mma.get_slice(tidx)
+
+        # ///////////////////////////////////////////////////////////////////////////////
+        # Get the appropriate tiles for this thread block.
+        # gA: (BLK_M, BLK_K, k), gB: (BLK_N, BLK_K, k), gC: (BLK_M, BLK_N)
+        # ///////////////////////////////////////////////////////////////////////////////
+        gA = cute.local_tile(
+            mA, tiler=self._cta_tiler, coord=tiler_coord, proj=(1, None, 1)
+        )
+        gB = cute.local_tile(
+            mB, tiler=self._cta_tiler, coord=tiler_coord, proj=(None, 1, 1)
+        )
+        gC = cute.local_tile(
+            mC, tiler=self._cta_tiler, coord=tiler_coord, proj=(1, 1, None)
+        )
+
+        # Move the pointer of gA/gB in the `-k` direction, making the first
+        # tile (instead of the last one) irregular in shape when k is irregular.
+        # We first handle the irregular tile to avoid checking for this
+        # condition within the mainloop.
+        residue_k = mA.shape[1] - self._bK * gA.shape[2]
+        gA = cute.domain_offset((0, residue_k, 0), gA)
+        gB = cute.domain_offset((0, residue_k, 0), gB)
+
+        # ///////////////////////////////////////////////////////////////////////////////
+        # Get the appropriate tiles for this thread.
+        # sA:   (BLK_M, BLK_K, PIPE)       , sB:   (BLK_N, BLK_K, PIPE)
+        # tAgA: (CPY, CPY_M, CPY_K, k)     , tBgB: (CPY, CPY_N, CPY_K, k)
+        # tAsA: (CPY, CPY_M, CPY_K, PIPE)  , tBsB: (CPY, CPY_N, CPY_K, PIPE)
+        # ///////////////////////////////////////////////////////////////////////////////
+        # Create shared memory buffer
+        smem = cutlass.utils.SmemAllocator()
+        sA = smem.allocate_tensor(mA.element_type, sA_layout, 16)
+        sB = smem.allocate_tensor(mB.element_type, sB_layout, 16)
+        thr_copy_A = tiled_copy_A.get_slice(tidx)
+        thr_copy_B = tiled_copy_B.get_slice(tidx)
+        tAgA = thr_copy_A.partition_S(gA)
+        tAsA = thr_copy_A.partition_D(sA)
+        tBgB = thr_copy_B.partition_S(gB)
+        tBsB = thr_copy_B.partition_D(sB)
+
+        # ///////////////////////////////////////////////////////////////////////////////
+        # Predicate: Mark indices that need to copy when the problem shape
+        # isn't a multiple of the tile shape. If tApA/B[i] is 0, then do not
+        # do the copy atom associated with index i.
+        # cA:    (BLK_M, BLK_K)      => (blk_m, blk_k)
+        # cB:    (BLK_N, BLK_K)      => (blk_n, blk_k)
+        # tAcA:  (CPY, CPY_M, CPY_K) => (blk_m, blk_k)
+        # tBcB:  (CPY, CPY_N, CPY_K) => (blk_n, blk_k)
+        # tApA: (rest_v, CPY_M, CPY_K), stride=(..., ..., 0)
+        # tBpB: (rest_v, CPY_N, CPY_K), stride=(..., ..., 0)
+        # CPY =  (atom_v, rest_v)
+        # ///////////////////////////////////////////////////////////////////////////////
+        # Construct identity layout for sA and sB, used for predication
+        mcA = cute.make_identity_tensor(mA.shape)
+        mcB = cute.make_identity_tensor(mB.shape)
+        cA = cute.local_tile(
+            mcA, tiler=self._cta_tiler, coord=tiler_coord, proj=(1, None, 1)
+        )
+        cB = cute.local_tile(
+            mcB, tiler=self._cta_tiler, coord=tiler_coord, proj=(None, 1, 1)
+        )
+        cA = cute.domain_offset((0, residue_k, 0), cA)
+        cB = cute.domain_offset((0, residue_k, 0), cB)
+        # Repeat the partitioning with identity layouts
+        tAcA = thr_copy_A.partition_S(cA)
+        tBcB = thr_copy_B.partition_S(cB)
+        # Allocate predicate tensors for m and n
+        tApA = cute.make_rmem_tensor(
+            cute.make_layout(
+                (
+                    tAsA.shape[0][1],
+                    cute.size(tAsA, mode=[1]),
+                    cute.size(tAsA, mode=[2]),
+                ),
+                stride=(cute.size(tAsA, mode=[1]), 1, 0),
+            ),
+            cutlass.Boolean,
+        )
+        tBpB = cute.make_rmem_tensor(
+            cute.make_layout(
+                (
+                    tBsB.shape[0][1],
+                    cute.size(tBsB, mode=[1]),
+                    cute.size(tBsB, mode=[2]),
+                ),
+                stride=(cute.size(tBsB, mode=[1]), 1, 0),
+            ),
+            cutlass.Boolean,
+        )
+        # Allocate predicate tensors for m, n and k for residue k-tile
+        tApA_residue_k = cute.make_rmem_tensor(
+            cute.make_layout(
+                (
+                    tAsA.shape[0][1],
+                    cute.size(tAsA, mode=[1]),
+                    cute.size(tAsA, mode=[2]),
+                ),
+                stride=(
+                    cute.size(tAsA, mode=[1]) * cute.size(tAsA, mode=[2]),
+                    cute.size(tAsA, mode=[2]),
+                    1,
+                ),
+            ),
+            cutlass.Boolean,
+        )
+        tBpB_residue_k = cute.make_rmem_tensor(
+            cute.make_layout(
+                (
+                    tBsB.shape[0][1],
+                    cute.size(tBsB, mode=[1]),
+                    cute.size(tBsB, mode=[2]),
+                ),
+                stride=(
+                    cute.size(tBsB, mode=[1]) * cute.size(tBsB, mode=[2]),
+                    cute.size(tBsB, mode=[2]),
+                    1,
+                ),
+            ),
+            cutlass.Boolean,
+        )
+        # Set predicates for m/n bounds for mainloop
+        for rest_v in range(tApA.shape[0]):
+            for m in range(tApA.shape[1]):
+                tApA[rest_v, m, 0] = cute.elem_less(
+                    tAcA[(0, rest_v), m, 0, 0][0], mA.shape[0]
+                )
+        for rest_v in range(tBpB.shape[0]):
+            for n in range(tBpB.shape[1]):
+                tBpB[rest_v, n, 0] = cute.elem_less(
+                    tBcB[(0, rest_v), n, 0, 0][0], mB.shape[0]
+                )
+
+        # Set predicates for m/n/k bounds for residue k tile
+        for rest_v in range(tApA_residue_k.shape[0]):
+            for m in range(tApA_residue_k.shape[1]):
+                for k in range(tApA_residue_k.shape[2]):
+                    coord_A = tAcA[(0, rest_v), m, k, 0]
+                    tApA_residue_k[rest_v, m, k] = cute.elem_less(
+                        (coord_A[0], cutlass.Int32(-1)), (mA.shape[0], coord_A[1])
+                    )
+        for rest_v in range(tBpB_residue_k.shape[0]):
+            for n in range(tBpB_residue_k.shape[1]):
+                for k in range(tBpB_residue_k.shape[2]):
+                    coord_B = tBcB[(0, rest_v), n, k, 0]
+                    tBpB_residue_k[rest_v, n, k] = cute.elem_less(
+                        (coord_B[0], cutlass.Int32(-1)), (mB.shape[0], coord_B[1])
+                    )
+
+        # ///////////////////////////////////////////////////////////////////////////////
+        # Prefetch Prologue
+        # ///////////////////////////////////////////////////////////////////////////////
+        # Start async loads for 0th k-tile, where we take care of the k-residue
+        k_pipe_max = cute.size(tAsA, mode=[3])
+        k_tile_count = cute.size(tAgA, mode=[3])
+        gmem_pipe_read = cutlass.Int32(0)
+        cute.copy(
+            tiled_copy_A,
+            tAgA[None, None, None, gmem_pipe_read],
+            tAsA[None, None, None, 0],
+            pred=tApA_residue_k,
+        )
+        cute.copy(
+            tiled_copy_B,
+            tBgB[None, None, None, gmem_pipe_read],
+            tBsB[None, None, None, 0],
+            pred=tBpB_residue_k,
+        )
+        cute.arch.cp_async_commit_group()
+        gmem_pipe_read = (
+            gmem_pipe_read + 1
+            if gmem_pipe_read + 1 < k_tile_count
+            else cutlass.Int32(0)
+        )
+        # Start async loads for 1st k-tile onwards, no k-residue handling needed
+        for k_tile in range(1, k_pipe_max - 1):
+            if k_tile < k_tile_count:
+                cute.copy(
+                    tiled_copy_A,
+                    tAgA[None, None, None, gmem_pipe_read],
+                    tAsA[None, None, None, k_tile],
+                    pred=tApA,
+                )
+                cute.copy(
+                    tiled_copy_B,
+                    tBgB[None, None, None, gmem_pipe_read],
+                    tBsB[None, None, None, k_tile],
+                    pred=tBpB,
+                )
+
+            gmem_pipe_read = (
+                gmem_pipe_read + 1
+                if gmem_pipe_read + 1 < k_tile_count
+                else cutlass.Int32(0)
+            )
+            cute.arch.cp_async_commit_group()
+
+        # all tiles have been copied from global memory, so clear the
+        # predicate tensor
+        if k_tile_count < k_pipe_max:
+            for rest_v in range(tApA.shape[0]):
+                for m in range(tApA.shape[1]):
+                    tApA[rest_v, m, 0] = cutlass.Boolean(0)
+            for rest_v in range(tBpB.shape[0]):
+                for n in range(tBpB.shape[1]):
+                    tBpB[rest_v, n, 0] = cutlass.Boolean(0)
+
+        # ///////////////////////////////////////////////////////////////////////////////
+        # Define A/B partitioning and C accumulators.
+        # ///////////////////////////////////////////////////////////////////////////////
+        tCsA = thr_mma.partition_A(sA)
+        tCsB = thr_mma.partition_B(sB)
+        tCgC = thr_mma.partition_C(gC)
+        tCrA = tiled_mma.make_fragment_A(tCsA[None, None, None, 0])
+        tCrB = tiled_mma.make_fragment_B(tCsB[None, None, None, 0])
+        tCrC = tiled_mma.make_fragment_C(tCgC)
+        # Clear the accumulator
+        tCrC.fill(0.0)
+
+        # Current pipe index in smem to read from / write to
+        smem_pipe_read = cutlass.Int32(0)
+        smem_pipe_write = cutlass.Int32(k_pipe_max - 1)
+
+        tCsA_p = tCsA[None, None, None, smem_pipe_read]
+        tCsB_p = tCsB[None, None, None, smem_pipe_read]
+
+        # ///////////////////////////////////////////////////////////////////////////////
+        # PREFETCH register pipeline
+        # ///////////////////////////////////////////////////////////////////////////////
+        k_block_max = cute.size(tCrA, mode=[2])
+
+        if k_block_max > 1:
+            # Wait until our first prefetched tile is loaded in
+            cute.arch.cp_async_wait_group(k_pipe_max - 2)
+            self.cta_sync_barrier.arrive_and_wait()
+            # Prefetch the first rmem from the first k-tile
+            cute.autovec_copy(tCsA_p[None, None, 0], tCrA[None, None, 0])
+            cute.autovec_copy(tCsB_p[None, None, 0], tCrB[None, None, 0])
+
+        # ///////////////////////////////////////////////////////////////////////////////
+        # Mainloop
+        # 1. Shared memory pipeline (gmem -> smem):
+        #    The default smem pipeline depth is 3, meaning that for shared
+        # memory buffers, we allocate three times the size described by the
+        # CTA tiler. We prefetch 2 of these buffers before entering the main
+        # loop. Considering only the transfer from global memory to shared
+        # memory, the general structure of the mainloop is:
+        #   (1) copy k-tile from gmem to smem;
+        #   (2) perform gemm computation on k-tile;
+        #   (3) wait for the next copy to finish.
+        #    The `cute.arch.cp_async_wait_group(num_smem_stages - 2)` command
+        # waits for the number of unfinished 'copy' to be <= 1. The advantage
+        # of this approach is that it allows for simultaneous production
+        # (i.e., step (1)) and consumption (i.e., step (2)) of smem.
+        #    A common misconception is to prefetch N buffers and rewrite
+        # the pipeline logic to wait on N-1 pending copies. The disadvantage
+        # of this approach is that it requires fully consuming a buffer in
+        # order to open an empty buffer for the next copy.
+        # 2. Register pipeline (smem -> register):
+        #    Similarly, the register pipeline produces i+1, consumes i, and
+        # produces i+2... Notably, i and i+1 do not use the same register,
+        # eliminating dependencies on the same register for better parallelism.
+        # 3. Combining the smem and register pipelines results in the mainloop.
+        # ///////////////////////////////////////////////////////////////////////////////
+
+        for _ in range(k_tile_count):
+            for k_block in range(k_block_max, unroll_full=True):
+                if k_block == k_block_max - 1:
+                    tCsA_p = tCsA[None, None, None, smem_pipe_read]
+                    tCsB_p = tCsB[None, None, None, smem_pipe_read]
+                    cute.arch.cp_async_wait_group(k_pipe_max - 2)
+                    self.cta_sync_barrier.arrive_and_wait()
+
+                # Load A, B from shared memory to registers for k_block + 1
+                k_block_next = (k_block + 1) % k_block_max  # static
+                cute.autovec_copy(
+                    tCsA_p[None, None, k_block_next],
+                    tCrA[None, None, k_block_next],
+                )
+                cute.autovec_copy(
+                    tCsB_p[None, None, k_block_next],
+                    tCrB[None, None, k_block_next],
+                )
+
+                # Fetch next A: To better interleave global memory access and
+                # compute instructions, we intentionally use the sequence:
+                # copy A, perform GEMM, then copy B.
+                if k_block == 0:
+                    cute.copy(
+                        tiled_copy_A,
+                        tAgA[None, None, None, gmem_pipe_read],
+                        tAsA[None, None, None, smem_pipe_write],
+                        # Use predicates because the m-mode may be irregular
+                        pred=tApA,
+                    )
+
+                # Thread-level register gemm for k_block
+                cute.gemm(
+                    tiled_mma,
+                    tCrC,
+                    tCrA[None, None, k_block],
+                    tCrB[None, None, k_block],
+                    tCrC,
+                )
+
+                # Fetch next B and update smem pipeline read/write
+                if k_block == 0:
+                    cute.copy(
+                        tiled_copy_B,
+                        tBgB[None, None, None, gmem_pipe_read],
+                        tBsB[None, None, None, smem_pipe_write],
+                        # Use predicates because the n-mode may be irregular
+                        pred=tBpB,
+                    )
+                    cute.arch.cp_async_commit_group()
+                    smem_pipe_write = smem_pipe_read
+                    smem_pipe_read = smem_pipe_read + 1
+                    if smem_pipe_read == k_pipe_max:
+                        smem_pipe_read = cutlass.Int32(0)
+                    # After copying all tiles, we avoid clearing the predicate
+                    # tensor in the `mainloop` to prevent increasing its
+                    # instruction count. Instead, we keep issuing copies of
+                    # k-tile 1, though its data won't be used. k-tile 0 is not
+                    # re-copied because of its irregular shape, which could
+                    # lead to illegal memory accesses.
+                    gmem_pipe_read = (
+                        gmem_pipe_read + 1
+                        if gmem_pipe_read + 1 < k_tile_count
+                        else cutlass.Int32(1)
+                    )
+
+        # ///////////////////////////////////////////////////////////////////////////////
+        # Epilogue
+        # Applies the epilogue operation to the accumulated results and copies
+        # them without vectorization.
+        # ///////////////////////////////////////////////////////////////////////////////
+        cute.arch.cp_async_wait_group(0)
+        self.cta_sync_barrier.arrive_and_wait()
+        tCrC.store(epilogue_op(tCrC.load()))
+
+        # Predicate the store of C against the residual (m, n) extents
+        cC = cute.make_identity_tensor(gC.shape)
+        tCpC = thr_mma.partition_C(cC)
+        predC = cute.make_rmem_tensor(tCrC.layout, cutlass.Boolean)
+        residue_m = mC.shape[0] - cutlass.Int32(self._bM) * bidx
+        residue_n = mC.shape[1] - cutlass.Int32(self._bN) * bidy
+        for i in range(cute.size(tCrC.shape)):
+            predC[i] = cute.elem_less(tCpC[i], (residue_m, residue_n))
+        atom = cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), mC.element_type)
+        cute.copy(atom, tCrC, tCgC, pred=predC)
+        return
+
+
 def cutlass_gemm(state: bench.State) -> None:
     n = state.get_int64("N")
     r = state.get_int64("R")
 
-    alpha = state.get_float64("alpha")
-
-    dt = np.float64
+    dt = np.float32
     A_h = np.random.randn(n, r).astype(dt)
     B_h = np.copy(A_h.mT)
     C_h = np.eye(n, dtype=dt)
-    D_h = np.zeros_like(C_h)
 
     if n >= 1024:
         # allow more time for large inputs
@@ -71,34 +623,21 @@ def cutlass_gemm(state: bench.State) -> None:
     A_d = core.DeviceMemoryResource(dev_id).allocate(A_h.nbytes, core_s)
     B_d = core.DeviceMemoryResource(dev_id).allocate(B_h.nbytes, core_s)
     C_d = core.DeviceMemoryResource(dev_id).allocate(C_h.nbytes, core_s)
-    D_d = core.DeviceMemoryResource(dev_id).allocate(D_h.nbytes, core_s)
 
     driver.cuMemcpyAsync(A_d.handle, A_h.ctypes.data, A_h.nbytes, s)
     driver.cuMemcpyAsync(B_d.handle, B_h.ctypes.data, B_h.nbytes, s)
     driver.cuMemcpyAsync(C_d.handle, C_h.ctypes.data, C_h.nbytes, s)
-    driver.cuMemcpyAsync(D_d.handle, D_h.ctypes.data, D_h.nbytes, s)
 
-    A_cp = make_cp_array(A_h, A_d, dev_id)
-    B_cp = make_cp_array(B_h, B_d, dev_id)
-    C_cp = make_cp_array(C_h, C_d, dev_id)
-    D_cp = make_cp_array(D_h, D_d, dev_id)
+    A_cp = make_tensor(A_h, A_d, dev_id)
+    B_cp = make_tensor(B_h, B_d, dev_id)
+    C_cp = make_tensor(C_h, C_d, dev_id)
 
-    plan = cutlass.op.Gemm(
-        A=A_cp,
-        B=B_cp,
-        C=C_cp,
-        D=D_cp,
-        element=dt,
-        alpha=alpha,
-        beta=1,
-        layout=cutlass.LayoutType.RowMajor,
-    )
-    # warm-up to ensure compilation is not timed
-    plan.run(stream=s)
+    sgemm = SGemm()
+    _ = sgemm(A_cp, B_cp, C_cp, stream=s)
 
     def launcher(launch: bench.Launch) -> None:
         s = as_bindings_Stream(launch.get_stream())
-        plan.run(stream=s, sync=False)
+        sgemm(A_cp, B_cp, C_cp, stream=s)
 
     state.exec(launcher)
 
@@ -108,6 +647,4 @@ if __name__ == "__main__":
     gemm_b.add_int64_axis("R", [16, 64, 256])
     gemm_b.add_int64_axis("N", [256, 512, 1024, 2048])
 
-    gemm_b.add_float64_axis("alpha", [1e-2])
-
     bench.run_all_benchmarks(sys.argv)

From 93bc59d05c7d073a16e6f24d01ebfb4fbd92c744 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Wed, 1 Apr 2026 08:24:29 -0500
Subject: [PATCH 7/9] Renamed CUTLASS example to reflect that it uses CuteDSL

---
 python/examples/{cutlass_gemm.py => cute_dsl_sgemm.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename python/examples/{cutlass_gemm.py => cute_dsl_sgemm.py} (100%)

diff --git a/python/examples/cutlass_gemm.py b/python/examples/cute_dsl_sgemm.py
similarity index 100%
rename from python/examples/cutlass_gemm.py
rename to python/examples/cute_dsl_sgemm.py

From 9f75642387292da7d8c75f009dbbf0238afac1ee Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Thu, 2 Apr 2026 10:29:31 -0500
Subject: [PATCH 8/9] Add patch to cutlass.base_dsl.dsl.BaseDSL to work-around
 a bug

See https://github.com/NVIDIA/cutlass/issues/3142
---
 python/examples/cute_dsl_sgemm.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/python/examples/cute_dsl_sgemm.py b/python/examples/cute_dsl_sgemm.py
index 61ac2b3..eae3758 100644
--- a/python/examples/cute_dsl_sgemm.py
+++ b/python/examples/cute_dsl_sgemm.py
@@ -642,7 +642,24 @@ def cutlass_gemm(state: bench.State) -> None:
     state.exec(launcher)
 
 
+def patch_cute_dsl():
+    def _no_op_diagnostic(self):
+        return
+
+    try:
+        import cutlass.base_dsl.dsl as dsl_m
+
+        base_dsl_k = dsl_m.BaseDSL
+        if hasattr(base_dsl_k, "diagnostic"):
+            base_dsl_k.diagnostic = _no_op_diagnostic
+    except (ModuleNotFoundError, AttributeError):
+        pass
+
+
 if __name__ == "__main__":
+    # see https://github.com/NVIDIA/cutlass/issues/3142
+    patch_cute_dsl()
+
     gemm_b = bench.register(cutlass_gemm)
     gemm_b.add_int64_axis("R", [16, 64, 256])
     gemm_b.add_int64_axis("N", [256, 512, 1024, 2048])

From 39730efbc34dae1ff51ec675c69c1c7396d2ace9 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Thu, 2 Apr 2026 10:37:17 -0500
Subject: [PATCH 9/9] Update requirements to reflect packages used by examples

---
 python/examples/requirements.txt | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/python/examples/requirements.txt b/python/examples/requirements.txt
index 868fcc6..bc17464 100644
--- a/python/examples/requirements.txt
+++ b/python/examples/requirements.txt
@@ -1,8 +1,9 @@
 numpy
 numba
-cupy
-nvidia-cutlass
-cuda-cccl
-cuda-core
 cuda-bindings
+cuda-core
 numba-cuda
+cuda-cccl
+cupy
+nvidia-cute-dsl[cu13]
+torch[cu13]