# Copyright 2025 NVIDIA Corporation
#
#  Licensed under the Apache License, Version 2.0 with the LLVM exception
#  (the "License"); you may not use this file except in compliance with
#  the License.
#
#  You may obtain a copy of the License at
#
#      http://llvm.org/foundation/relicensing/LICENSE.txt
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.


import sys
from typing import Tuple

import cuda.bench as bench
import cuda.bindings.driver as driver
import cuda.core as core
import cupy as cp
import cutlass
import cutlass.cute as cute
import cutlass.pipeline as pipeline
import cutlass.utils as utils
import numpy as np
from cutlass.cute.runtime import from_dlpack


def as_bindings_Stream(cs: bench.CudaStream) -> driver.CUstream:
    return driver.CUstream(cs.addressof())


def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
    return core.Stream.from_handle(cs.addressof())


def make_tensor(arr_h: np.ndarray, dev_buf: core.Buffer, dev_id: int | None):
    cp_memview = cp.cuda.UnownedMemory(
        int(dev_buf.handle), dev_buf.size, dev_buf, -1 if dev_id is None else dev_id
    )
    zero_offset = 0
    return from_dlpack(
        cp.ndarray(
            arr_h.shape,
            dtype=arr_h.dtype,
            memptr=cp.cuda.MemoryPointer(cp_memview, zero_offset),
        ),
        assumed_align=16,
    )


class SGemm:
    """
    Adapted from https://github.com/NVIDIA/cutlass/blob/main/examples/python/CuTeDSL/ampere/sgemm.py
    """

    def __init__(
        self,
        cta_tiler: Tuple[int, int, int] = (128, 128, 8),
        num_stages: int = 3,
        num_threads: int = 256,
    ):
        self._cta_tiler = cta_tiler
        self._num_stages = num_stages
        self._num_threads = num_threads
        assert num_threads > 0, "needs at least one thread"
        assert num_threads % 16 == 0, "multiples of 16 required for MMA thread layout"

        self._bM, self._bN, self._bK = self._cta_tiler
        assert self._bM % 16 == 0, "multiple of 16 required for tile dimension M"
        assert self._bN % 16 == 0, "multiple of 16 required for tile dimension N"
        assert self._num_stages >= 3, "num_stages must be greater than or equal to 3"
        self.cta_sync_barrier = pipeline.NamedBarrier(
            barrier_id=1, num_threads=num_threads
        )

    @cute.jit
    def __call__(
        self,
        mA: cute.Tensor,
        mB: cute.Tensor,
        mC: cute.Tensor,
        epilogue_op: cutlass.Constexpr = lambda x: x,
        stream: driver.CUstream = driver.CUstream(
            driver.CUstream_flags.CU_STREAM_DEFAULT
        ),
    ):
        self.a_major_mode = utils.LayoutEnum.from_tensor(mA)
        self.b_major_mode = utils.LayoutEnum.from_tensor(mB)
        self.c_major_mode = utils.LayoutEnum.from_tensor(mC)

        # ///////////////////////////////////////////////////////////////////////////////
        # Create layouts for shared memory for A and B:
        #   - sA/sB is m/n-major to vectorized copies from shared
        #       memory to registers. This is because the MMA layouts
        #       for sA/sB are also m/n-major
        #   - When gA/gB is k-major, pad 4 elements to reduce bank conflicts
        # ///////////////////////////////////////////////////////////////////////////////

        padding_a = 4 if self.a_major_mode == utils.LayoutEnum.ROW_MAJOR else 0
        padding_b = 4 if self.b_major_mode == utils.LayoutEnum.ROW_MAJOR else 0
        sA_layout = cute.make_layout(
            (self._bM, self._bK, self._num_stages),
            stride=(1, (self._bM + padding_a), self._bK * (self._bM + padding_a)),
        )
        sB_layout = cute.make_layout(
            (self._bN, self._bK, self._num_stages),
            stride=(1, (self._bN + padding_b), self._bK * (self._bN + padding_b)),
        )

        # ///////////////////////////////////////////////////////////////////////////////
        # Create copy layouts that will be used for asynchronous
        # global memory -> shared memory copies:
        #   - The majorness of tA/tB follows the majorness of gA/gB
        #   - For k-major, these layouts will copy values one-by-one from
        #       from global memory, without vectorizing
        #   - For m/n-major, it will vectorize to a 128bit copy for faster
        #       data transfer between global and shared memory, as long
        #       as the alignment of the tensor allows it. Otherwise, it
        #       defaults to a non-vectorized copy
        # ///////////////////////////////////////////////////////////////////////////////

        tA = cute.make_layout(
            (self._num_threads // self._bK, self._bK), stride=(self._bK, 1)
        )
        tB = cute.make_layout(
            (self._num_threads // self._bK, self._bK), stride=(self._bK, 1)
        )
        vA = cute.make_layout((1, 1))
        vB = cute.make_layout((1, 1))
        atom_async_copy_A = cute.make_copy_atom(
            cute.nvgpu.cpasync.CopyG2SOp(),
            mA.element_type,
            num_bits_per_copy=mA.element_type.width,
        )
        atom_async_copy_B = cute.make_copy_atom(
            cute.nvgpu.cpasync.CopyG2SOp(),
            mA.element_type,
            num_bits_per_copy=mB.element_type.width,
        )
        if cutlass.const_expr(self.a_major_mode == utils.LayoutEnum.COL_MAJOR):
            num_vectorized = 4 if (mA.layout[0].max_alignment % 16 == 0) else 1
            atom_async_copy_A = cute.make_copy_atom(
                cute.nvgpu.cpasync.CopyG2SOp(),
                mA.element_type,
                num_bits_per_copy=mA.element_type.width * num_vectorized,
            )
            major_mode_size = self._bM // num_vectorized
            tA = cute.make_layout(
                (major_mode_size, self._num_threads // major_mode_size),
                stride=(1, major_mode_size),
            )
            vA = cute.make_layout((num_vectorized, 1))

        if cutlass.const_expr(self.b_major_mode == utils.LayoutEnum.COL_MAJOR):
            num_vectorized = 4 if (mB.layout[0].max_alignment % 16 == 0) else 1
            atom_async_copy_B = cute.make_copy_atom(
                cute.nvgpu.cpasync.CopyG2SOp(),
                mA.element_type,
                num_bits_per_copy=mB.element_type.width * num_vectorized,
            )
            major_mode_size = self._bN // num_vectorized
            tB = cute.make_layout(
                (major_mode_size, self._num_threads // major_mode_size),
                stride=(1, major_mode_size),
            )
            vB = cute.make_layout((num_vectorized, 1))

        tiled_copy_A = cute.make_tiled_copy_tv(atom_async_copy_A, tA, vA)
        tiled_copy_B = cute.make_tiled_copy_tv(atom_async_copy_B, tB, vB)

        # ///////////////////////////////////////////////////////////////////////////////
        # Create layouts for GEMM:
        # We tile an MMA atom across a tensor. `atoms_layout` is the layout
        # of atoms in the tiled MMA. (Because we use an `MmaUniversalOp`,
        # which has a trivial 1x1x1 MMA trait, `atoms_layout` is also
        # simply the thread layout for C.) `permutation_tiler` reorders the
        # elements of the tensor that the tiled MMA is applied to.
        # Different combinations of `atoms_layout` and `permutation_tiler`
        # values can create different MMA thread-value patterns.
        #
        # Here, the MMA layout is set so that each thread copies four
        # consecutive elements from shared memory to registers.
        # `permutation_tiler_M/N` maps the elements handled by each thread
        # to the permuted element in the tensor.
        # For increasing indices in the tensor, the thread ID that reads it is:
        #   - (without permutation) ==>
        #      0 1 2 ... 15 0 1 2 ... 15 0 1 2 ... 15 0 1 2 ... 15 ......
        #   - (with permutation) ==>
        #      0 0 0 0 1 1 1 1 2 2 2 2 ... 15 15 15 15 0 0 0 0 1 1 1 1 ......
        # ///////////////////////////////////////////////////////////////////////////////
        atoms_layout = cute.make_layout(
            (self._num_threads // 16, 16, 1), stride=(16, 1, 0)
        )
        if cutlass.const_expr(self.c_major_mode == utils.LayoutEnum.COL_MAJOR):
            atoms_layout = cute.make_layout(
                (16, self._num_threads // 16, 1), stride=(1, 16, 0)
            )
        op = cute.nvgpu.MmaUniversalOp(cutlass.Float32)
        permutation_tiler_M = cute.make_layout(
            (atoms_layout.shape[0], 4), stride=(4, 1)
        )
        permutation_tiler_N = cute.make_layout(
            (atoms_layout.shape[1], 4), stride=(4, 1)
        )
        tiled_mma = cute.make_tiled_mma(
            op,
            atoms_layout,
            permutation_mnk=(permutation_tiler_M, permutation_tiler_N, None),
        )

        # grid_dim: ((m + BLK_M - 1) // BLK_M, (n + BLK_N - 1) // BLK_N, 1)
        grid_dim = *cute.ceil_div(mC.shape, (self._bM, self._bN)), 1

        self.kernel(
            mA,
            mB,
            mC,
            sA_layout,
            sB_layout,
            tiled_copy_A,
            tiled_copy_B,
            tiled_mma,
            epilogue_op,
        ).launch(
            grid=grid_dim,
            block=[cute.size(atoms_layout), 1, 1],
            stream=stream,
        )

    @cute.kernel
    def kernel(
        self,
        mA: cute.Tensor,
        mB: cute.Tensor,
        mC: cute.Tensor,
        sA_layout: cute.Layout,
        sB_layout: cute.Layout,
        tiled_copy_A: cute.TiledCopy,
        tiled_copy_B: cute.TiledCopy,
        tiled_mma: cute.TiledMma,
        epilogue_op: cutlass.Constexpr = lambda x: x,
    ):
        # Thread and block indices
        tidx, tidy, tidz = cute.arch.thread_idx()
        bidx, bidy, bidz = cute.arch.block_idx()
        tiler_coord = (bidx, bidy, None)
        thr_mma = tiled_mma.get_slice(tidx)

        # ///////////////////////////////////////////////////////////////////////////////
        # Get the appropriate tiles for this thread block.
        # gA: (BLK_M, BLK_K, k), gB: (BLK_N, BLK_K, k), gC: (BLK_M, BLK_N)
        # ///////////////////////////////////////////////////////////////////////////////
        gA = cute.local_tile(
            mA, tiler=self._cta_tiler, coord=tiler_coord, proj=(1, None, 1)
        )
        gB = cute.local_tile(
            mB, tiler=self._cta_tiler, coord=tiler_coord, proj=(None, 1, 1)
        )
        gC = cute.local_tile(
            mC, tiler=self._cta_tiler, coord=tiler_coord, proj=(1, 1, None)
        )

        # Move the pointer of gA/gB in the `-k`` direction, making the first
        # tile (instead of the last one) irregular in shape when k is irregular.
        # We first handle the irregular tile to avoid checking for this
        # condition within the mainloop.
        residue_k = mA.shape[1] - self._bK * gA.shape[2]
        gA = cute.domain_offset((0, residue_k, 0), gA)
        gB = cute.domain_offset((0, residue_k, 0), gB)

        # ///////////////////////////////////////////////////////////////////////////////
        # Get the appropriate tiles for this thread.
        # sA:   (BLK_M, BLK_K, PIPE)       , sB:   (BLK_N, BLK_K, PIPE)
        # tAgA: (CPY, CPY_M, CPY_K, k)     , tBgB: (CPY, CPY_N, CPY_K, k)
        # tAsA: (CPY, CPY_M, CPY_K, PIPE)  , tBsB: (CPY, CPY_N, CPY_K, PIPE)
        # ///////////////////////////////////////////////////////////////////////////////
        # Create shared memory buffer
        smem = cutlass.utils.SmemAllocator()
        sA = smem.allocate_tensor(mA.element_type, sA_layout, 16)
        sB = smem.allocate_tensor(mB.element_type, sB_layout, 16)
        thr_copy_A = tiled_copy_A.get_slice(tidx)
        thr_copy_B = tiled_copy_B.get_slice(tidx)
        tAgA = thr_copy_A.partition_S(gA)
        tAsA = thr_copy_A.partition_D(sA)
        tBgB = thr_copy_B.partition_S(gB)
        tBsB = thr_copy_B.partition_D(sB)

        # ///////////////////////////////////////////////////////////////////////////////
        # Predicate: Mark indices that need to copy when the problem shape
        # isn't a multiple of the tile shape. If tApA/B[i] is 0, then do not
        # do the copy atom associated with index i.
        # cA:    (BLK_M, BLK_K)      => (blk_m, blk_k)
        # cB:    (BLK_N, BLK_K)      => (blk_n, blk_k)
        # tAcA:  (CPY, CPY_M, CPY_K) => (blk_m, blk_k)
        # tBcB:  (CPY, CPY_N, CPY_K) => (blk_n, blk_k)
        # tApA: (rest_v, CPY_M, CPY_K), stride=(..., ..., 0)
        # tBpB: (rest_v, CPY_N, CPY_K), stride=(..., ..., 0)
        # CPY =  (atom_v, rest_v)
        # ///////////////////////////////////////////////////////////////////////////////
        # Construct identity layout for sA and sB, used for predication
        mcA = cute.make_identity_tensor(mA.shape)
        mcB = cute.make_identity_tensor(mB.shape)
        cA = cute.local_tile(
            mcA, tiler=self._cta_tiler, coord=tiler_coord, proj=(1, None, 1)
        )
        cB = cute.local_tile(
            mcB, tiler=self._cta_tiler, coord=tiler_coord, proj=(None, 1, 1)
        )
        cA = cute.domain_offset((0, residue_k, 0), cA)
        cB = cute.domain_offset((0, residue_k, 0), cB)
        # Repeat the partitioning with identity layouts
        tAcA = thr_copy_A.partition_S(cA)
        tBcB = thr_copy_B.partition_S(cB)
        # Allocate predicate tensors for m and n
        tApA = cute.make_rmem_tensor(
            cute.make_layout(
                (
                    tAsA.shape[0][1],
                    cute.size(tAsA, mode=[1]),
                    cute.size(tAsA, mode=[2]),
                ),
                stride=(cute.size(tAsA, mode=[1]), 1, 0),
            ),
            cutlass.Boolean,
        )
        tBpB = cute.make_rmem_tensor(
            cute.make_layout(
                (
                    tBsB.shape[0][1],
                    cute.size(tBsB, mode=[1]),
                    cute.size(tBsB, mode=[2]),
                ),
                stride=(cute.size(tBsB, mode=[1]), 1, 0),
            ),
            cutlass.Boolean,
        )
        # Allocate predicate tensors for m, n and k for residue k-tile
        tApA_residue_k = cute.make_rmem_tensor(
            cute.make_layout(
                (
                    tAsA.shape[0][1],
                    cute.size(tAsA, mode=[1]),
                    cute.size(tAsA, mode=[2]),
                ),
                stride=(
                    cute.size(tAsA, mode=[1]) * cute.size(tAsA, mode=[2]),
                    cute.size(tAsA, mode=[2]),
                    1,
                ),
            ),
            cutlass.Boolean,
        )
        tBpB_residue_k = cute.make_rmem_tensor(
            cute.make_layout(
                (
                    tBsB.shape[0][1],
                    cute.size(tBsB, mode=[1]),
                    cute.size(tBsB, mode=[2]),
                ),
                stride=(
                    cute.size(tBsB, mode=[1]) * cute.size(tBsB, mode=[2]),
                    cute.size(tBsB, mode=[2]),
                    1,
                ),
            ),
            cutlass.Boolean,
        )
        # Set predicates for m/n bounds for mainloop
        for rest_v in range(tApA.shape[0]):
            for m in range(tApA.shape[1]):
                tApA[rest_v, m, 0] = cute.elem_less(
                    tAcA[(0, rest_v), m, 0, 0][0], mA.shape[0]
                )
        for rest_v in range(tBpB.shape[0]):
            for n in range(tBpB.shape[1]):
                tBpB[rest_v, n, 0] = cute.elem_less(
                    tBcB[(0, rest_v), n, 0, 0][0], mB.shape[0]
                )

        # Set predicates for m/n/k bounds for residue k tile
        for rest_v in range(tApA_residue_k.shape[0]):
            for m in range(tApA_residue_k.shape[1]):
                for k in range(tApA_residue_k.shape[2]):
                    coord_A = tAcA[(0, rest_v), m, k, 0]
                    tApA_residue_k[rest_v, m, k] = cute.elem_less(
                        (coord_A[0], cutlass.Int32(-1)), (mA.shape[0], coord_A[1])
                    )
        for rest_v in range(tBpB_residue_k.shape[0]):
            for n in range(tBpB_residue_k.shape[1]):
                for k in range(tBpB_residue_k.shape[2]):
                    coord_B = tBcB[(0, rest_v), n, k, 0]
                    tBpB_residue_k[rest_v, n, k] = cute.elem_less(
                        (coord_B[0], cutlass.Int32(-1)), (mB.shape[0], coord_B[1])
                    )

        # ///////////////////////////////////////////////////////////////////////////////
        # Prefetch Prologue
        # ///////////////////////////////////////////////////////////////////////////////
        # Start async loads for 0th k-tile, where we take care of the k-residue
        k_pipe_max = cute.size(tAsA, mode=[3])
        k_tile_count = cute.size(tAgA, mode=[3])
        gmem_pipe_read = cutlass.Int32(0)
        cute.copy(
            tiled_copy_A,
            tAgA[None, None, None, gmem_pipe_read],
            tAsA[None, None, None, 0],
            pred=tApA_residue_k,
        )
        cute.copy(
            tiled_copy_B,
            tBgB[None, None, None, gmem_pipe_read],
            tBsB[None, None, None, 0],
            pred=tBpB_residue_k,
        )
        cute.arch.cp_async_commit_group()
        gmem_pipe_read = (
            gmem_pipe_read + 1
            if gmem_pipe_read + 1 < k_tile_count
            else cutlass.Int32(0)
        )
        # Start async loads for 1st k-tile onwards, no k-residue handling needed
        for k_tile in range(1, k_pipe_max - 1):
            if k_tile < k_tile_count:
                cute.copy(
                    tiled_copy_A,
                    tAgA[None, None, None, gmem_pipe_read],
                    tAsA[None, None, None, k_tile],
                    pred=tApA,
                )
                cute.copy(
                    tiled_copy_B,
                    tBgB[None, None, None, gmem_pipe_read],
                    tBsB[None, None, None, k_tile],
                    pred=tBpB,
                )

            gmem_pipe_read = (
                gmem_pipe_read + 1
                if gmem_pipe_read + 1 < k_tile_count
                else cutlass.Int32(0)
            )
            cute.arch.cp_async_commit_group()

        # all tiles have been copied from global memory, so clear the
        # predicate tensor
        if k_tile_count < k_pipe_max:
            for rest_v in range(tApA.shape[0]):
                for m in range(tApA.shape[1]):
                    tApA[rest_v, m, 0] = cutlass.Boolean(0)
            for rest_v in range(tBpB.shape[0]):
                for n in range(tBpB.shape[1]):
                    tBpB[rest_v, n, 0] = cutlass.Boolean(0)

        # ///////////////////////////////////////////////////////////////////////////////
        # Define A/B partitioning and C accumulators.
        # ///////////////////////////////////////////////////////////////////////////////
        tCsA = thr_mma.partition_A(sA)
        tCsB = thr_mma.partition_B(sB)
        tCgC = thr_mma.partition_C(gC)
        tCrA = tiled_mma.make_fragment_A(tCsA[None, None, None, 0])
        tCrB = tiled_mma.make_fragment_B(tCsB[None, None, None, 0])
        tCrC = tiled_mma.make_fragment_C(tCgC)
        # Clear the accumulator
        tCrC.fill(0.0)

        # Current pipe index in smem to read from / write to
        smem_pipe_read = cutlass.Int32(0)
        smem_pipe_write = cutlass.Int32(k_pipe_max - 1)

        tCsA_p = tCsA[None, None, None, smem_pipe_read]
        tCsB_p = tCsB[None, None, None, smem_pipe_read]

        # ///////////////////////////////////////////////////////////////////////////////
        # PREFETCH register pipeline
        # ///////////////////////////////////////////////////////////////////////////////
        k_block_max = cute.size(tCrA, mode=[2])

        if k_block_max > 1:
            # Wait until our first prefetched tile is loaded in
            cute.arch.cp_async_wait_group(k_pipe_max - 2)
            self.cta_sync_barrier.arrive_and_wait()
            # Prefetch the first rmem from the first k-tile
            cute.autovec_copy(tCsA_p[None, None, 0], tCrA[None, None, 0])
            cute.autovec_copy(tCsB_p[None, None, 0], tCrB[None, None, 0])

        # ///////////////////////////////////////////////////////////////////////////////
        # Mainloop
        # 1. Shared memory pipeline (gmem -> smem):
        #    The default smem pipeline depth is 3, meaning that for shared
        # memory buffers, we allocate three times the size described by the
        # CTA tiler. We prefetch 2 of these buffers before entering the main
        # loop. Considering only the transfer from global memory to shared
        # memory, the general structure of the mainloop is:
        #   (1) copy k-tile from gmem to smem;
        #   (2) perform gemm computation on k-tile;
        #   (3) wait for the next copy to finish.
        #    The `cute.arch.cp_async_wait_group(num_smem_stages - 2)` command
        # waits for the number of unfinished 'copy' to be <= 1. The advantage
        # of this approach is that it allows for simultaneous production
        # (i.e., step (1)) and consumption (i.e., step (2)) of smem.
        #    A common misconception is to prefetch N buffers and rewrite
        # the pipeline logic to wait on N-1 pending copies. The disadvantage
        # of this approach is that it requires fully consuming a buffer in
        # order to open an empty buffer for the next copy.
        # 2. Register pipeline (smem -> register):
        #    Similarly, the register pipeline produces i+1, consumes i, and
        # produces i+2... Notably, i and i+1 do not use the same register,
        # eliminating dependencies on the same register for better parallelism.
        # 3. Combining the smem and register pipelines results in the mainloop.
        # ///////////////////////////////////////////////////////////////////////////////

        for _ in range(k_tile_count):
            for k_block in range(k_block_max, unroll_full=True):
                if k_block == k_block_max - 1:
                    tCsA_p = tCsA[None, None, None, smem_pipe_read]
                    tCsB_p = tCsB[None, None, None, smem_pipe_read]
                    cute.arch.cp_async_wait_group(k_pipe_max - 2)
                    self.cta_sync_barrier.arrive_and_wait()

                # Load A, B from shared memory to registers for k_block + 1
                k_block_next = (k_block + 1) % k_block_max  # static
                cute.autovec_copy(
                    tCsA_p[None, None, k_block_next],
                    tCrA[None, None, k_block_next],
                )
                cute.autovec_copy(
                    tCsB_p[None, None, k_block_next],
                    tCrB[None, None, k_block_next],
                )

                # Fetch next A: To better interleave global memory access and
                # compute instructions, we intentionally use the sequence:
                # copy A, perform GEMM, then copy B.
                if k_block == 0:
                    cute.copy(
                        tiled_copy_A,
                        tAgA[None, None, None, gmem_pipe_read],
                        tAsA[None, None, None, smem_pipe_write],
                        # Use predicates because the m-mode may be irregular
                        pred=tApA,
                    )

                # Thread-level register gemm for k_block
                cute.gemm(
                    tiled_mma,
                    tCrC,
                    tCrA[None, None, k_block],
                    tCrB[None, None, k_block],
                    tCrC,
                )

                # Fetch next B and update smem pipeline read/write
                if k_block == 0:
                    cute.copy(
                        tiled_copy_B,
                        tBgB[None, None, None, gmem_pipe_read],
                        tBsB[None, None, None, smem_pipe_write],
                        # Use predicates because the n-mode may be irregular
                        pred=tBpB,
                    )
                    cute.arch.cp_async_commit_group()
                    smem_pipe_write = smem_pipe_read
                    smem_pipe_read = smem_pipe_read + 1
                    if smem_pipe_read == k_pipe_max:
                        smem_pipe_read = cutlass.Int32(0)
                    # After copying all tiles, we avoid clearing the predicate
                    # tensor in the `mainloop` to prevent increasing its
                    # instruction count. Instead, we continue copying the
                    # first tile, though it won't be used. The 0-th tile is not
                    # copied due to its irregular shape, which could lead to
                    # illegal memory accesses.
                    gmem_pipe_read = (
                        gmem_pipe_read + 1
                        if gmem_pipe_read + 1 < k_tile_count
                        else cutlass.Int32(1)
                    )

        # ///////////////////////////////////////////////////////////////////////////////
        # Epilogue
        # Applies the epilogue operation to the accumulated results and copies
        # them without vectorization.
        # ///////////////////////////////////////////////////////////////////////////////
        cute.arch.cp_async_wait_group(0)
        self.cta_sync_barrier.arrive_and_wait()
        tCrC.store(epilogue_op(tCrC.load()))

        # predicate
        cC = cute.make_identity_tensor(gC.shape)
        tCpC = thr_mma.partition_C(cC)
        predC = cute.make_rmem_tensor(tCrC.layout, cutlass.Boolean)
        residue_m = mC.shape[0] - cutlass.Int32(self._bM) * bidx
        residue_n = mC.shape[1] - cutlass.Int32(self._bN) * bidy
        for i in range(cute.size(tCrC.shape)):
            predC[i] = cute.elem_less(tCpC[i], (residue_m, residue_n))
        atom = cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), mC.element_type)
        cute.copy(atom, tCrC, tCgC, pred=predC)
        return


def cutlass_gemm(state: bench.State) -> None:
    n = state.get_int64("N")
    r = state.get_int64("R")

    dt = np.float32
    A_h = np.random.randn(n, r).astype(dt)
    B_h = np.copy(A_h.mT)
    C_h = np.eye(n, dtype=dt)

    if n >= 1024:
        # allow more time for large inputs
        state.set_timeout(360)

    dev_id = state.get_device()
    cs = state.get_stream()
    s = as_bindings_Stream(cs)
    core_s = as_core_Stream(cs)

    A_d = core.DeviceMemoryResource(dev_id).allocate(A_h.nbytes, core_s)
    B_d = core.DeviceMemoryResource(dev_id).allocate(B_h.nbytes, core_s)
    C_d = core.DeviceMemoryResource(dev_id).allocate(C_h.nbytes, core_s)

    driver.cuMemcpyAsync(A_d.handle, A_h.ctypes.data, A_h.nbytes, s)
    driver.cuMemcpyAsync(B_d.handle, B_h.ctypes.data, B_h.nbytes, s)
    driver.cuMemcpyAsync(C_d.handle, C_h.ctypes.data, C_h.nbytes, s)

    A_cp = make_tensor(A_h, A_d, dev_id)
    B_cp = make_tensor(B_h, B_d, dev_id)
    C_cp = make_tensor(C_h, C_d, dev_id)

    sgemm = SGemm()
    _ = sgemm(A_cp, B_cp, C_cp, stream=s)

    def launcher(launch: bench.Launch) -> None:
        s = as_bindings_Stream(launch.get_stream())
        sgemm(A_cp, B_cp, C_cp, stream=s)

    state.exec(launcher)


if __name__ == "__main__":
    gemm_b = bench.register(cutlass_gemm)
    gemm_b.add_int64_axis("R", [16, 64, 256])
    gemm_b.add_int64_axis("N", [256, 512, 1024, 2048])

    bench.run_all_benchmarks(sys.argv)