diff --git a/python/examples/axes.py b/python/examples/axes.py
index ce67238..fa7c6c4 100644
--- a/python/examples/axes.py
+++ b/python/examples/axes.py
@@ -20,7 +20,7 @@ from typing import Dict, Optional, Tuple
 
 import cuda.bench as bench
 import cuda.cccl.headers as headers
-import cuda.core.experimental as core
+import cuda.core as core
 
 
 def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
diff --git a/python/examples/cpu_activity.py b/python/examples/cpu_activity.py
index a492ff7..df1efae 100644
--- a/python/examples/cpu_activity.py
+++ b/python/examples/cpu_activity.py
@@ -19,7 +19,7 @@ import time
 
 import cuda.bench as bench
 import cuda.cccl.headers as headers
-import cuda.core.experimental as core
+import cuda.core as core
 
 
 host_sleep_duration = 0.1
diff --git a/python/examples/cccl_parallel_segmented_reduce.py b/python/examples/cuda_compute_segmented_reduce.py
similarity index 69%
rename from python/examples/cccl_parallel_segmented_reduce.py
rename to python/examples/cuda_compute_segmented_reduce.py
index e54a77b..d2140b9 100644
--- a/python/examples/cccl_parallel_segmented_reduce.py
+++ b/python/examples/cuda_compute_segmented_reduce.py
@@ -17,36 +17,20 @@
 import sys
 
 import cuda.bench as bench
-import cuda.cccl.parallel.experimental.algorithms as algorithms
-import cuda.cccl.parallel.experimental.iterators as iterators
-import cuda.core.experimental as core
+import cuda.compute.algorithms as algorithms
+import cuda.compute.iterators as iterators
+import cuda.core as core
 import cupy as cp
 import numpy as np
-
-
-class CCCLStream:
-    "Class to work around https://github.com/NVIDIA/cccl/issues/5144"
-
-    def __init__(self, ptr):
-        self._ptr = ptr
-
-    def __cuda_stream__(self):
-        return (0, self._ptr)
+from cuda.compute import OpKind
 
 
 def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
     return core.Stream.from_handle(cs.addressof())
 
 
-def as_cccl_Stream(cs: bench.CudaStream) -> CCCLStream:
-    return CCCLStream(cs.addressof())
-
-
-def as_cp_ExternalStream(
-    cs: bench.CudaStream, dev_id: int | None = -1
-) -> cp.cuda.ExternalStream:
-    h = cs.addressof()
-    return cp.cuda.ExternalStream(h, dev_id)
+def as_cp_ExternalStream(cs: bench.CudaStream) -> cp.cuda.ExternalStream:
+    return cp.cuda.Stream.from_external(cs)
 
 
 def segmented_reduce(state: bench.State):
@@ -56,13 +40,8 @@ def segmented_reduce(state: bench.State):
     n_rows = n_elems // n_cols
     state.add_summary("numRows", n_rows)
 
-    state.collect_cupti_metrics()
-
     dev_id = state.get_device()
-    cp_stream = as_cp_ExternalStream(state.get_stream(), dev_id)
-
-    def add_op(a, b):
-        return a + b
+    cp_stream = as_cp_ExternalStream(state.get_stream())
 
     def make_scaler(step):
         def scale(row_id):
@@ -85,15 +64,24 @@ def segmented_reduce(state: bench.State):
     d_input = mat
     d_output = cp.empty(n_rows, dtype=d_input.dtype)
 
-    alg = algorithms.segmented_reduce(
+    add_op = OpKind.PLUS
+
+    alg = algorithms.make_segmented_reduce(
         d_input, d_output, start_offsets, end_offsets, add_op, h_init
     )
 
-    cccl_stream = as_cccl_Stream(state.get_stream())
-
+    cccl_stream = state.get_stream()
     # query size of temporary storage and allocate
     temp_nbytes = alg(
-        None, d_input, d_output, n_rows, start_offsets, end_offsets, h_init, cccl_stream
+        None,
+        d_input,
+        d_output,
+        add_op,
+        n_rows,
+        start_offsets,
+        end_offsets,
+        h_init,
+        cccl_stream,
     )
 
     h_init = np.zeros(tuple(), dtype=np.int32)
@@ -101,11 +89,12 @@ def segmented_reduce(state: bench.State):
     temp_storage = cp.empty(temp_nbytes, dtype=cp.uint8)
 
     def launcher(launch: bench.Launch):
-        s = as_cccl_Stream(launch.get_stream())
+        s = launch.get_stream()
         alg(
             temp_storage,
             d_input,
             d_output,
+            add_op,
             n_rows,
             start_offsets,
             end_offsets,
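
Reviewer note: the renamed example above moves from the experimental
cuda.cccl.parallel namespace to cuda.compute, swaps the Python add_op
callable for the built-in OpKind.PLUS, and now passes the op to every
invocation of the built algorithm. A minimal standalone sketch of that
two-phase flow, with names and argument order taken from this patch
(passing None for the stream is assumed to select the default stream):

import cupy as cp
import numpy as np

import cuda.compute.algorithms as algorithms
from cuda.compute import OpKind

d_input = cp.arange(6, dtype=cp.int32)
start_offsets = cp.asarray([0, 3], dtype=cp.int64)
end_offsets = cp.asarray([3, 6], dtype=cp.int64)
d_output = cp.empty(2, dtype=d_input.dtype)
h_init = np.zeros(tuple(), dtype=np.int32)
add_op = OpKind.PLUS

alg = algorithms.make_segmented_reduce(
    d_input, d_output, start_offsets, end_offsets, add_op, h_init
)
# first call with temp_storage=None only queries the scratch size
temp_nbytes = alg(
    None, d_input, d_output, add_op, 2, start_offsets, end_offsets, h_init, None
)
temp_storage = cp.empty(temp_nbytes, dtype=cp.uint8)
# second call performs the reduction; d_output becomes [3, 12]
alg(
    temp_storage, d_input, d_output, add_op, 2, start_offsets, end_offsets, h_init, None
)
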
diff --git a/python/examples/cccl_cooperative_block_reduce.py b/python/examples/cuda_coop_block_reduce.py
similarity index 98%
rename from python/examples/cccl_cooperative_block_reduce.py
rename to python/examples/cuda_coop_block_reduce.py
index 0d5d970..c0f0138 100644
--- a/python/examples/cccl_cooperative_block_reduce.py
+++ b/python/examples/cuda_coop_block_reduce.py
@@ -17,7 +17,7 @@
 import sys
 
 import cuda.bench as bench
-import cuda.cccl.cooperative.experimental as coop
+import cuda.coop as coop
 import numba
 import numpy as np
 from numba import cuda
diff --git a/python/examples/cupy_extract.py b/python/examples/cupy_extract.py
index 091141c..d7f2a01 100644
--- a/python/examples/cupy_extract.py
+++ b/python/examples/cupy_extract.py
@@ -20,21 +20,16 @@ import cuda.bench as bench
 import cupy as cp
 
 
-def as_cp_ExternalStream(
-    cs: bench.CudaStream, dev_id: int | None = -1
-) -> cp.cuda.ExternalStream:
-    h = cs.addressof()
-    return cp.cuda.ExternalStream(h, dev_id)
+def as_cp_ExternalStream(cs: bench.CudaStream):
+    return cp.cuda.Stream.from_external(cs)
 
 
 def cupy_extract_by_mask(state: bench.State):
     n_cols = state.get_int64("numCols")
     n_rows = state.get_int64("numRows")
 
-    dev_id = state.get_device()
-    cp_s = as_cp_ExternalStream(state.get_stream(), dev_id)
+    cp_s = as_cp_ExternalStream(state.get_stream())
 
-    state.collect_cupti_metrics()
     state.add_element_count(n_rows * n_cols, "# Elements")
     int32_dt = cp.dtype(cp.int32)
     bool_dt = cp.dtype(cp.bool_)
@@ -49,7 +44,7 @@ def cupy_extract_by_mask(state: bench.State):
         _ = X[mask]
 
     def launcher(launch: bench.Launch):
-        with as_cp_ExternalStream(launch.get_stream(), dev_id):
+        with as_cp_ExternalStream(launch.get_stream()):
             _ = X[mask]
 
     state.exec(launcher, sync=True)
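
Reviewer note: cupy_extract.py now hands the bench.CudaStream straight to
cp.cuda.Stream.from_external, dropping the dev_id plumbing. This works
because bench.CudaStream evidently implements the __cuda_stream__ protocol
that the removed CCCLStream class used to provide by hand. A toy sketch of
that protocol (RawStream is hypothetical and not part of this patch; it
assumes cp.cuda.Stream.from_external accepts any __cuda_stream__ provider):

import cupy as cp


class RawStream:
    """Minimal wrapper exposing a raw stream handle via __cuda_stream__."""

    def __init__(self, ptr: int):
        self._ptr = ptr

    def __cuda_stream__(self):
        # (version, handle) pair, matching the removed CCCLStream workaround
        return (0, self._ptr)


native = cp.cuda.Stream(non_blocking=True)
borrowed = cp.cuda.Stream.from_external(RawStream(native.ptr))
with borrowed:
    _ = cp.arange(10).sum()  # work is enqueued on the borrowed stream
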
diff --git a/python/examples/cute_dsl_sgemm.py b/python/examples/cute_dsl_sgemm.py
new file mode 100644
index 0000000..eae3758
--- /dev/null
+++ b/python/examples/cute_dsl_sgemm.py
@@ -0,0 +1,667 @@
+# Copyright 2025 NVIDIA Corporation
+#
+# Licensed under the Apache License, Version 2.0 with the LLVM exception
+# (the "License"); you may not use this file except in compliance with
+# the License.
+#
+# You may obtain a copy of the License at
+#
+# http://llvm.org/foundation/relicensing/LICENSE.txt
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import sys
+from typing import Tuple
+
+import cuda.bench as bench
+import cuda.bindings.driver as driver
+import cuda.core as core
+import cupy as cp
+import cutlass
+import cutlass.cute as cute
+import cutlass.pipeline as pipeline
+import cutlass.utils as utils
+import numpy as np
+from cutlass.cute.runtime import from_dlpack
+
+
+def as_bindings_Stream(cs: bench.CudaStream) -> driver.CUstream:
+    return driver.CUstream(cs.addressof())
+
+
+def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
+    return core.Stream.from_handle(cs.addressof())
+
+
+def make_tensor(arr_h: np.ndarray, dev_buf: core.Buffer, dev_id: int | None):
+    cp_memview = cp.cuda.UnownedMemory(
+        int(dev_buf.handle), dev_buf.size, dev_buf, -1 if dev_id is None else dev_id
+    )
+    zero_offset = 0
+    return from_dlpack(
+        cp.ndarray(
+            arr_h.shape,
+            dtype=arr_h.dtype,
+            memptr=cp.cuda.MemoryPointer(cp_memview, zero_offset),
+        ),
+        assumed_align=16,
+    )
+
+
+class SGemm:
+    """
+    Adapted from https://github.com/NVIDIA/cutlass/blob/main/examples/python/CuTeDSL/ampere/sgemm.py
+    """
+
+    def __init__(
+        self,
+        cta_tiler: Tuple[int, int, int] = (128, 128, 8),
+        num_stages: int = 3,
+        num_threads: int = 256,
+    ):
+        self._cta_tiler = cta_tiler
+        self._num_stages = num_stages
+        self._num_threads = num_threads
+        assert num_threads > 0, "needs at least one thread"
+        assert num_threads % 16 == 0, "multiples of 16 required for MMA thread layout"
+
+        self._bM, self._bN, self._bK = self._cta_tiler
+        assert self._bM % 16 == 0, "multiple of 16 required for tile dimension M"
+        assert self._bN % 16 == 0, "multiple of 16 required for tile dimension N"
+        assert self._num_stages >= 3, "num_stages must be greater than or equal to 3"
+        self.cta_sync_barrier = pipeline.NamedBarrier(
+            barrier_id=1, num_threads=num_threads
+        )
+
+    @cute.jit
+    def __call__(
+        self,
+        mA: cute.Tensor,
+        mB: cute.Tensor,
+        mC: cute.Tensor,
+        epilogue_op: cutlass.Constexpr = lambda x: x,
+        stream: driver.CUstream = driver.CUstream(
+            driver.CUstream_flags.CU_STREAM_DEFAULT
+        ),
+    ):
+        self.a_major_mode = utils.LayoutEnum.from_tensor(mA)
+        self.b_major_mode = utils.LayoutEnum.from_tensor(mB)
+        self.c_major_mode = utils.LayoutEnum.from_tensor(mC)
+
+        # ///////////////////////////////////////////////////////////////////////////////
+        # Create layouts for shared memory for A and B:
+        # - sA/sB is m/n-major to vectorize copies from shared
+        #   memory to registers. This is because the MMA layouts
+        #   for sA/sB are also m/n-major
+        # - When gA/gB is k-major, pad 4 elements to reduce bank conflicts
+        # ///////////////////////////////////////////////////////////////////////////////
+
+        padding_a = 4 if self.a_major_mode == utils.LayoutEnum.ROW_MAJOR else 0
+        padding_b = 4 if self.b_major_mode == utils.LayoutEnum.ROW_MAJOR else 0
+        sA_layout = cute.make_layout(
+            (self._bM, self._bK, self._num_stages),
+            stride=(1, (self._bM + padding_a), self._bK * (self._bM + padding_a)),
+        )
+        sB_layout = cute.make_layout(
+            (self._bN, self._bK, self._num_stages),
+            stride=(1, (self._bN + padding_b), self._bK * (self._bN + padding_b)),
+        )
+
+        # ///////////////////////////////////////////////////////////////////////////////
+        # Create copy layouts that will be used for asynchronous
+        # global memory -> shared memory copies:
+        # - The majorness of tA/tB follows the majorness of gA/gB
+        # - For k-major, these layouts will copy values one-by-one
+        #   from global memory, without vectorizing
+        # - For m/n-major, it will vectorize to a 128-bit copy for faster
+        #   data transfer between global and shared memory, as long
+        #   as the alignment of the tensor allows it. Otherwise, it
+        #   defaults to a non-vectorized copy
+        # ///////////////////////////////////////////////////////////////////////////////
+
+        tA = cute.make_layout(
+            (self._num_threads // self._bK, self._bK), stride=(self._bK, 1)
+        )
+        tB = cute.make_layout(
+            (self._num_threads // self._bK, self._bK), stride=(self._bK, 1)
+        )
+        vA = cute.make_layout((1, 1))
+        vB = cute.make_layout((1, 1))
+        atom_async_copy_A = cute.make_copy_atom(
+            cute.nvgpu.cpasync.CopyG2SOp(),
+            mA.element_type,
+            num_bits_per_copy=mA.element_type.width,
+        )
+        atom_async_copy_B = cute.make_copy_atom(
+            cute.nvgpu.cpasync.CopyG2SOp(),
+            mB.element_type,
+            num_bits_per_copy=mB.element_type.width,
+        )
+        if cutlass.const_expr(self.a_major_mode == utils.LayoutEnum.COL_MAJOR):
+            num_vectorized = 4 if (mA.layout[0].max_alignment % 16 == 0) else 1
+            atom_async_copy_A = cute.make_copy_atom(
+                cute.nvgpu.cpasync.CopyG2SOp(),
+                mA.element_type,
+                num_bits_per_copy=mA.element_type.width * num_vectorized,
+            )
+            major_mode_size = self._bM // num_vectorized
+            tA = cute.make_layout(
+                (major_mode_size, self._num_threads // major_mode_size),
+                stride=(1, major_mode_size),
+            )
+            vA = cute.make_layout((num_vectorized, 1))
+
+        if cutlass.const_expr(self.b_major_mode == utils.LayoutEnum.COL_MAJOR):
+            num_vectorized = 4 if (mB.layout[0].max_alignment % 16 == 0) else 1
+            atom_async_copy_B = cute.make_copy_atom(
+                cute.nvgpu.cpasync.CopyG2SOp(),
+                mB.element_type,
+                num_bits_per_copy=mB.element_type.width * num_vectorized,
+            )
+            major_mode_size = self._bN // num_vectorized
+            tB = cute.make_layout(
+                (major_mode_size, self._num_threads // major_mode_size),
+                stride=(1, major_mode_size),
+            )
+            vB = cute.make_layout((num_vectorized, 1))
+
+        tiled_copy_A = cute.make_tiled_copy_tv(atom_async_copy_A, tA, vA)
+        tiled_copy_B = cute.make_tiled_copy_tv(atom_async_copy_B, tB, vB)
+
+        # ///////////////////////////////////////////////////////////////////////////////
+        # Create layouts for GEMM:
+        # We tile an MMA atom across a tensor. `atoms_layout` is the layout
+        # of atoms in the tiled MMA. (Because we use an `MmaUniversalOp`,
+        # which has a trivial 1x1x1 MMA trait, `atoms_layout` is also
+        # simply the thread layout for C.) `permutation_tiler` reorders the
+        # elements of the tensor that the tiled MMA is applied to.
+        # Different combinations of `atoms_layout` and `permutation_tiler`
+        # values can create different MMA thread-value patterns.
+        #
+        # Here, the MMA layout is set so that each thread copies four
+        # consecutive elements from shared memory to registers.
+        # `permutation_tiler_M/N` maps the elements handled by each thread
+        # to the permuted element in the tensor.
+        # For increasing indices in the tensor, the thread ID that reads it is:
+        # - (without permutation) ==>
+        #   0 1 2 ... 15 0 1 2 ... 15 0 1 2 ... 15 0 1 2 ... 15 ......
+        # - (with permutation) ==>
+        #   0 0 0 0 1 1 1 1 2 2 2 2 ... 15 15 15 15 0 0 0 0 1 1 1 1 ......
+        # ///////////////////////////////////////////////////////////////////////////////
+        atoms_layout = cute.make_layout(
+            (self._num_threads // 16, 16, 1), stride=(16, 1, 0)
+        )
+        if cutlass.const_expr(self.c_major_mode == utils.LayoutEnum.COL_MAJOR):
+            atoms_layout = cute.make_layout(
+                (16, self._num_threads // 16, 1), stride=(1, 16, 0)
+            )
+        op = cute.nvgpu.MmaUniversalOp(cutlass.Float32)
+        permutation_tiler_M = cute.make_layout(
+            (atoms_layout.shape[0], 4), stride=(4, 1)
+        )
+        permutation_tiler_N = cute.make_layout(
+            (atoms_layout.shape[1], 4), stride=(4, 1)
+        )
+        tiled_mma = cute.make_tiled_mma(
+            op,
+            atoms_layout,
+            permutation_mnk=(permutation_tiler_M, permutation_tiler_N, None),
+        )
+
+        # grid_dim: ((m + BLK_M - 1) // BLK_M, (n + BLK_N - 1) // BLK_N, 1)
+        grid_dim = *cute.ceil_div(mC.shape, (self._bM, self._bN)), 1
+
+        self.kernel(
+            mA,
+            mB,
+            mC,
+            sA_layout,
+            sB_layout,
+            tiled_copy_A,
+            tiled_copy_B,
+            tiled_mma,
+            epilogue_op,
+        ).launch(
+            grid=grid_dim,
+            block=[cute.size(atoms_layout), 1, 1],
+            stream=stream,
+        )
+
+    @cute.kernel
+    def kernel(
+        self,
+        mA: cute.Tensor,
+        mB: cute.Tensor,
+        mC: cute.Tensor,
+        sA_layout: cute.Layout,
+        sB_layout: cute.Layout,
+        tiled_copy_A: cute.TiledCopy,
+        tiled_copy_B: cute.TiledCopy,
+        tiled_mma: cute.TiledMma,
+        epilogue_op: cutlass.Constexpr = lambda x: x,
+    ):
+        # Thread and block indices
+        tidx, tidy, tidz = cute.arch.thread_idx()
+        bidx, bidy, bidz = cute.arch.block_idx()
+        tiler_coord = (bidx, bidy, None)
+        thr_mma = tiled_mma.get_slice(tidx)
+
+        # ///////////////////////////////////////////////////////////////////////////////
+        # Get the appropriate tiles for this thread block.
+        # gA: (BLK_M, BLK_K, k), gB: (BLK_N, BLK_K, k), gC: (BLK_M, BLK_N)
+        # ///////////////////////////////////////////////////////////////////////////////
+        gA = cute.local_tile(
+            mA, tiler=self._cta_tiler, coord=tiler_coord, proj=(1, None, 1)
+        )
+        gB = cute.local_tile(
+            mB, tiler=self._cta_tiler, coord=tiler_coord, proj=(None, 1, 1)
+        )
+        gC = cute.local_tile(
+            mC, tiler=self._cta_tiler, coord=tiler_coord, proj=(1, 1, None)
+        )
+
+        # Move the pointer of gA/gB in the `-k` direction, making the first
+        # tile (instead of the last one) irregular in shape when k is irregular.
+        # We first handle the irregular tile to avoid checking for this
+        # condition within the mainloop.
+        residue_k = mA.shape[1] - self._bK * gA.shape[2]
+        gA = cute.domain_offset((0, residue_k, 0), gA)
+        gB = cute.domain_offset((0, residue_k, 0), gB)
+
+        # ///////////////////////////////////////////////////////////////////////////////
+        # Get the appropriate tiles for this thread.
+        # sA: (BLK_M, BLK_K, PIPE) , sB: (BLK_N, BLK_K, PIPE)
+        # tAgA: (CPY, CPY_M, CPY_K, k) , tBgB: (CPY, CPY_N, CPY_K, k)
+        # tAsA: (CPY, CPY_M, CPY_K, PIPE) , tBsB: (CPY, CPY_N, CPY_K, PIPE)
+        # ///////////////////////////////////////////////////////////////////////////////
+        # Create shared memory buffer
+        smem = cutlass.utils.SmemAllocator()
+        sA = smem.allocate_tensor(mA.element_type, sA_layout, 16)
+        sB = smem.allocate_tensor(mB.element_type, sB_layout, 16)
+        thr_copy_A = tiled_copy_A.get_slice(tidx)
+        thr_copy_B = tiled_copy_B.get_slice(tidx)
+        tAgA = thr_copy_A.partition_S(gA)
+        tAsA = thr_copy_A.partition_D(sA)
+        tBgB = thr_copy_B.partition_S(gB)
+        tBsB = thr_copy_B.partition_D(sB)
+
+        # ///////////////////////////////////////////////////////////////////////////////
+        # Predicate: Mark indices that need to copy when the problem shape
+        # isn't a multiple of the tile shape. If tApA/B[i] is 0, then do not
+        # do the copy atom associated with index i.
+        # cA: (BLK_M, BLK_K) => (blk_m, blk_k)
+        # cB: (BLK_N, BLK_K) => (blk_n, blk_k)
+        # tAcA: (CPY, CPY_M, CPY_K) => (blk_m, blk_k)
+        # tBcB: (CPY, CPY_N, CPY_K) => (blk_n, blk_k)
+        # tApA: (rest_v, CPY_M, CPY_K), stride=(..., ..., 0)
+        # tBpB: (rest_v, CPY_N, CPY_K), stride=(..., ..., 0)
+        # CPY = (atom_v, rest_v)
+        # ///////////////////////////////////////////////////////////////////////////////
+        # Construct identity layout for sA and sB, used for predication
+        mcA = cute.make_identity_tensor(mA.shape)
+        mcB = cute.make_identity_tensor(mB.shape)
+        cA = cute.local_tile(
+            mcA, tiler=self._cta_tiler, coord=tiler_coord, proj=(1, None, 1)
+        )
+        cB = cute.local_tile(
+            mcB, tiler=self._cta_tiler, coord=tiler_coord, proj=(None, 1, 1)
+        )
+        cA = cute.domain_offset((0, residue_k, 0), cA)
+        cB = cute.domain_offset((0, residue_k, 0), cB)
+        # Repeat the partitioning with identity layouts
+        tAcA = thr_copy_A.partition_S(cA)
+        tBcB = thr_copy_B.partition_S(cB)
+        # Allocate predicate tensors for m and n
+        tApA = cute.make_rmem_tensor(
+            cute.make_layout(
+                (
+                    tAsA.shape[0][1],
+                    cute.size(tAsA, mode=[1]),
+                    cute.size(tAsA, mode=[2]),
+                ),
+                stride=(cute.size(tAsA, mode=[1]), 1, 0),
+            ),
+            cutlass.Boolean,
+        )
+        tBpB = cute.make_rmem_tensor(
+            cute.make_layout(
+                (
+                    tBsB.shape[0][1],
+                    cute.size(tBsB, mode=[1]),
+                    cute.size(tBsB, mode=[2]),
+                ),
+                stride=(cute.size(tBsB, mode=[1]), 1, 0),
+            ),
+            cutlass.Boolean,
+        )
+        # Allocate predicate tensors for m, n and k for residue k-tile
+        tApA_residue_k = cute.make_rmem_tensor(
+            cute.make_layout(
+                (
+                    tAsA.shape[0][1],
+                    cute.size(tAsA, mode=[1]),
+                    cute.size(tAsA, mode=[2]),
+                ),
+                stride=(
+                    cute.size(tAsA, mode=[1]) * cute.size(tAsA, mode=[2]),
+                    cute.size(tAsA, mode=[2]),
+                    1,
+                ),
+            ),
+            cutlass.Boolean,
+        )
+        tBpB_residue_k = cute.make_rmem_tensor(
+            cute.make_layout(
+                (
+                    tBsB.shape[0][1],
+                    cute.size(tBsB, mode=[1]),
+                    cute.size(tBsB, mode=[2]),
+                ),
+                stride=(
+                    cute.size(tBsB, mode=[1]) * cute.size(tBsB, mode=[2]),
+                    cute.size(tBsB, mode=[2]),
+                    1,
+                ),
+            ),
+            cutlass.Boolean,
+        )
+        # Set predicates for m/n bounds for mainloop
+        for rest_v in range(tApA.shape[0]):
+            for m in range(tApA.shape[1]):
+                tApA[rest_v, m, 0] = cute.elem_less(
+                    tAcA[(0, rest_v), m, 0, 0][0], mA.shape[0]
+                )
+        for rest_v in range(tBpB.shape[0]):
+            for n in range(tBpB.shape[1]):
+                tBpB[rest_v, n, 0] = cute.elem_less(
+                    tBcB[(0, rest_v), n, 0, 0][0], mB.shape[0]
+                )
+
+        # Set predicates for m/n/k bounds for residue k tile
+        for rest_v in range(tApA_residue_k.shape[0]):
+            for m in range(tApA_residue_k.shape[1]):
+                for k in range(tApA_residue_k.shape[2]):
+                    coord_A = tAcA[(0, rest_v), m, k, 0]
+                    tApA_residue_k[rest_v, m, k] = cute.elem_less(
+                        (coord_A[0], cutlass.Int32(-1)), (mA.shape[0], coord_A[1])
+                    )
+        for rest_v in range(tBpB_residue_k.shape[0]):
+            for n in range(tBpB_residue_k.shape[1]):
+                for k in range(tBpB_residue_k.shape[2]):
+                    coord_B = tBcB[(0, rest_v), n, k, 0]
+                    tBpB_residue_k[rest_v, n, k] = cute.elem_less(
+                        (coord_B[0], cutlass.Int32(-1)), (mB.shape[0], coord_B[1])
+                    )
+
+        # ///////////////////////////////////////////////////////////////////////////////
+        # Prefetch Prologue
+        # ///////////////////////////////////////////////////////////////////////////////
+        # Start async loads for 0th k-tile, where we take care of the k-residue
+        k_pipe_max = cute.size(tAsA, mode=[3])
+        k_tile_count = cute.size(tAgA, mode=[3])
+        gmem_pipe_read = cutlass.Int32(0)
+        cute.copy(
+            tiled_copy_A,
+            tAgA[None, None, None, gmem_pipe_read],
+            tAsA[None, None, None, 0],
+            pred=tApA_residue_k,
+        )
+        cute.copy(
+            tiled_copy_B,
+            tBgB[None, None, None, gmem_pipe_read],
+            tBsB[None, None, None, 0],
+            pred=tBpB_residue_k,
+        )
+        cute.arch.cp_async_commit_group()
+        gmem_pipe_read = (
+            gmem_pipe_read + 1
+            if gmem_pipe_read + 1 < k_tile_count
+            else cutlass.Int32(0)
+        )
+        # Start async loads for 1st k-tile onwards, no k-residue handling needed
+        for k_tile in range(1, k_pipe_max - 1):
+            if k_tile < k_tile_count:
+                cute.copy(
+                    tiled_copy_A,
+                    tAgA[None, None, None, gmem_pipe_read],
+                    tAsA[None, None, None, k_tile],
+                    pred=tApA,
+                )
+                cute.copy(
+                    tiled_copy_B,
+                    tBgB[None, None, None, gmem_pipe_read],
+                    tBsB[None, None, None, k_tile],
+                    pred=tBpB,
+                )
+
+            gmem_pipe_read = (
+                gmem_pipe_read + 1
+                if gmem_pipe_read + 1 < k_tile_count
+                else cutlass.Int32(0)
+            )
+            cute.arch.cp_async_commit_group()
+
+        # all tiles have been copied from global memory, so clear the
+        # predicate tensor
+        if k_tile_count < k_pipe_max:
+            for rest_v in range(tApA.shape[0]):
+                for m in range(tApA.shape[1]):
+                    tApA[rest_v, m, 0] = cutlass.Boolean(0)
+            for rest_v in range(tBpB.shape[0]):
+                for n in range(tBpB.shape[1]):
+                    tBpB[rest_v, n, 0] = cutlass.Boolean(0)
+
+        # ///////////////////////////////////////////////////////////////////////////////
+        # Define A/B partitioning and C accumulators.
+        # ///////////////////////////////////////////////////////////////////////////////
+        tCsA = thr_mma.partition_A(sA)
+        tCsB = thr_mma.partition_B(sB)
+        tCgC = thr_mma.partition_C(gC)
+        tCrA = tiled_mma.make_fragment_A(tCsA[None, None, None, 0])
+        tCrB = tiled_mma.make_fragment_B(tCsB[None, None, None, 0])
+        tCrC = tiled_mma.make_fragment_C(tCgC)
+        # Clear the accumulator
+        tCrC.fill(0.0)
+
+        # Current pipe index in smem to read from / write to
+        smem_pipe_read = cutlass.Int32(0)
+        smem_pipe_write = cutlass.Int32(k_pipe_max - 1)
+
+        tCsA_p = tCsA[None, None, None, smem_pipe_read]
+        tCsB_p = tCsB[None, None, None, smem_pipe_read]
+
+        # ///////////////////////////////////////////////////////////////////////////////
+        # PREFETCH register pipeline
+        # ///////////////////////////////////////////////////////////////////////////////
+        k_block_max = cute.size(tCrA, mode=[2])
+
+        if k_block_max > 1:
+            # Wait until our first prefetched tile is loaded in
+            cute.arch.cp_async_wait_group(k_pipe_max - 2)
+            self.cta_sync_barrier.arrive_and_wait()
+            # Prefetch the first rmem from the first k-tile
+            cute.autovec_copy(tCsA_p[None, None, 0], tCrA[None, None, 0])
+            cute.autovec_copy(tCsB_p[None, None, 0], tCrB[None, None, 0])
+
+        # ///////////////////////////////////////////////////////////////////////////////
+        # Mainloop
+        # 1. Shared memory pipeline (gmem -> smem):
+        #    The default smem pipeline depth is 3, meaning that for shared
+        #    memory buffers, we allocate three times the size described by the
+        #    CTA tiler. We prefetch 2 of these buffers before entering the main
+        #    loop. Considering only the transfer from global memory to shared
+        #    memory, the general structure of the mainloop is:
+        #    (1) copy k-tile from gmem to smem;
+        #    (2) perform gemm computation on k-tile;
+        #    (3) wait for the next copy to finish.
+        #    The `cute.arch.cp_async_wait_group(num_smem_stages - 2)` command
+        #    waits for the number of unfinished 'copy' to be <= 1. The advantage
+        #    of this approach is that it allows for simultaneous production
+        #    (i.e., step (1)) and consumption (i.e., step (2)) of smem.
+        #    A common misconception is to prefetch N buffers and rewrite
+        #    the pipeline logic to wait on N-1 pending copies. The disadvantage
+        #    of this approach is that it requires fully consuming a buffer in
+        #    order to open an empty buffer for the next copy.
+        # 2. Register pipeline (smem -> register):
+        #    Similarly, the register pipeline produces i+1, consumes i, and
+        #    produces i+2... Notably, i and i+1 do not use the same register,
+        #    eliminating dependencies on the same register for better parallelism.
+        # 3. Combining the smem and register pipelines results in the mainloop.
+        # ///////////////////////////////////////////////////////////////////////////////
+
+        for _ in range(k_tile_count):
+            for k_block in range(k_block_max, unroll_full=True):
+                if k_block == k_block_max - 1:
+                    tCsA_p = tCsA[None, None, None, smem_pipe_read]
+                    tCsB_p = tCsB[None, None, None, smem_pipe_read]
+                    cute.arch.cp_async_wait_group(k_pipe_max - 2)
+                    self.cta_sync_barrier.arrive_and_wait()
+
+                # Load A, B from shared memory to registers for k_block + 1
+                k_block_next = (k_block + 1) % k_block_max  # static
+                cute.autovec_copy(
+                    tCsA_p[None, None, k_block_next],
+                    tCrA[None, None, k_block_next],
+                )
+                cute.autovec_copy(
+                    tCsB_p[None, None, k_block_next],
+                    tCrB[None, None, k_block_next],
+                )
+
+                # Fetch next A: To better interleave global memory access and
+                # compute instructions, we intentionally use the sequence:
+                # copy A, perform GEMM, then copy B.
+                if k_block == 0:
+                    cute.copy(
+                        tiled_copy_A,
+                        tAgA[None, None, None, gmem_pipe_read],
+                        tAsA[None, None, None, smem_pipe_write],
+                        # Use predicates because the m-mode may be irregular
+                        pred=tApA,
+                    )
+
+                # Thread-level register gemm for k_block
+                cute.gemm(
+                    tiled_mma,
+                    tCrC,
+                    tCrA[None, None, k_block],
+                    tCrB[None, None, k_block],
+                    tCrC,
+                )
+
+                # Fetch next B and update smem pipeline read/write
+                if k_block == 0:
+                    cute.copy(
+                        tiled_copy_B,
+                        tBgB[None, None, None, gmem_pipe_read],
+                        tBsB[None, None, None, smem_pipe_write],
+                        # Use predicates because the n-mode may be irregular
+                        pred=tBpB,
+                    )
+                    cute.arch.cp_async_commit_group()
+                    smem_pipe_write = smem_pipe_read
+                    smem_pipe_read = smem_pipe_read + 1
+                    if smem_pipe_read == k_pipe_max:
+                        smem_pipe_read = cutlass.Int32(0)
+                    # After copying all tiles, we avoid clearing the predicate
+                    # tensor in the `mainloop` to prevent increasing its
+                    # instruction count. Instead, we continue copying the
+                    # first tile, though it won't be used. The 0-th tile is not
+                    # copied due to its irregular shape, which could lead to
+                    # illegal memory accesses.
+                    gmem_pipe_read = (
+                        gmem_pipe_read + 1
+                        if gmem_pipe_read + 1 < k_tile_count
+                        else cutlass.Int32(1)
+                    )
+
+        # ///////////////////////////////////////////////////////////////////////////////
+        # Epilogue
+        # Applies the epilogue operation to the accumulated results and copies
+        # them without vectorization.
+        # ///////////////////////////////////////////////////////////////////////////////
+        cute.arch.cp_async_wait_group(0)
+        self.cta_sync_barrier.arrive_and_wait()
+        tCrC.store(epilogue_op(tCrC.load()))
+
+        # predicate
+        cC = cute.make_identity_tensor(gC.shape)
+        tCpC = thr_mma.partition_C(cC)
+        predC = cute.make_rmem_tensor(tCrC.layout, cutlass.Boolean)
+        residue_m = mC.shape[0] - cutlass.Int32(self._bM) * bidx
+        residue_n = mC.shape[1] - cutlass.Int32(self._bN) * bidy
+        for i in range(cute.size(tCrC.shape)):
+            predC[i] = cute.elem_less(tCpC[i], (residue_m, residue_n))
+        atom = cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), mC.element_type)
+        cute.copy(atom, tCrC, tCgC, pred=predC)
+        return
+
+
+def cutlass_gemm(state: bench.State) -> None:
+    n = state.get_int64("N")
+    r = state.get_int64("R")
+
+    dt = np.float32
+    A_h = np.random.randn(n, r).astype(dt)
+    B_h = np.copy(A_h.mT)
+    C_h = np.eye(n, dtype=dt)
+
+    if n >= 1024:
+        # allow more time for large inputs
+        state.set_timeout(360)
+
+    dev_id = state.get_device()
+    cs = state.get_stream()
+    s = as_bindings_Stream(cs)
+    core_s = as_core_Stream(cs)
+
+    A_d = core.DeviceMemoryResource(dev_id).allocate(A_h.nbytes, core_s)
+    B_d = core.DeviceMemoryResource(dev_id).allocate(B_h.nbytes, core_s)
+    C_d = core.DeviceMemoryResource(dev_id).allocate(C_h.nbytes, core_s)
+
+    driver.cuMemcpyAsync(A_d.handle, A_h.ctypes.data, A_h.nbytes, s)
+    driver.cuMemcpyAsync(B_d.handle, B_h.ctypes.data, B_h.nbytes, s)
+    driver.cuMemcpyAsync(C_d.handle, C_h.ctypes.data, C_h.nbytes, s)
+
+    A_cp = make_tensor(A_h, A_d, dev_id)
+    B_cp = make_tensor(B_h, B_d, dev_id)
+    C_cp = make_tensor(C_h, C_d, dev_id)
+
+    sgemm = SGemm()
+    _ = sgemm(A_cp, B_cp, C_cp, stream=s)
+
+    def launcher(launch: bench.Launch) -> None:
+        s = as_bindings_Stream(launch.get_stream())
+        sgemm(A_cp, B_cp, C_cp, stream=s)
+
+    state.exec(launcher)
+
+
+def patch_cute_dsl():
+    def _no_op_diagnostic(self):
+        return
+
+    try:
+        import cutlass.base_dsl.dsl as dsl_m
+
+        base_dsl_k = dsl_m.BaseDSL
+        if hasattr(base_dsl_k, "diagnostic"):
+            base_dsl_k.diagnostic = _no_op_diagnostic
+    except (ModuleNotFoundError, AttributeError):
+        pass
+
+
+if __name__ == "__main__":
+    # see https://github.com/NVIDIA/cutlass/issues/3142
+    patch_cute_dsl()
+
+    gemm_b = bench.register(cutlass_gemm)
+    gemm_b.add_int64_axis("R", [16, 64, 256])
+    gemm_b.add_int64_axis("N", [256, 512, 1024, 2048])
+
+    bench.run_all_benchmarks(sys.argv)
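
Reviewer note: because B_h is built as the transpose of A_h, the SGemm
result can be validated on the host once, before timing. A sketch of such a
check, assuming the identity epilogue_op leaves C holding the accumulated
A @ B, as the epilogue copy-out suggests (check_sgemm is hypothetical and
not part of this patch):

import numpy as np
import cuda.bindings.driver as driver


def check_sgemm(A_h, B_h, C_d, s) -> None:
    C_out = np.empty((A_h.shape[0], B_h.shape[1]), dtype=A_h.dtype)
    # copy the device result back and compare against a NumPy reference
    driver.cuMemcpyAsync(C_out.ctypes.data, C_d.handle, C_out.nbytes, s)
    driver.cuStreamSynchronize(s)
    np.testing.assert_allclose(C_out, A_h @ B_h, rtol=1e-3, atol=1e-3)
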
diff --git a/python/examples/cutlass_gemm.py b/python/examples/cutlass_gemm.py
deleted file mode 100644
index cd62f39..0000000
--- a/python/examples/cutlass_gemm.py
+++ /dev/null
@@ -1,113 +0,0 @@
-# Copyright 2025 NVIDIA Corporation
-#
-# Licensed under the Apache License, Version 2.0 with the LLVM exception
-# (the "License"); you may not use this file except in compliance with
-# the License.
-#
-# You may obtain a copy of the License at
-#
-# http://llvm.org/foundation/relicensing/LICENSE.txt
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import sys
-
-import cuda.bench as bench
-import cuda.bindings.driver as driver
-import cuda.core.experimental as core
-import cupy as cp
-import cutlass
-import numpy as np
-
-
-def as_bindings_Stream(cs: bench.CudaStream) -> driver.CUstream:
-    return driver.CUstream(cs.addressof())
-
-
-def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
-    return core.Stream.from_handle(cs.addressof())
-
-
-def make_cp_array(
-    arr_h: np.ndarray, dev_buf: core.Buffer, dev_id: int | None
-) -> cp.ndarray:
-    cp_memview = cp.cuda.UnownedMemory(
-        int(dev_buf.handle), dev_buf.size, dev_buf, -1 if dev_id is None else dev_id
-    )
-    zero_offset = 0
-    return cp.ndarray(
-        arr_h.shape,
-        dtype=arr_h.dtype,
-        memptr=cp.cuda.MemoryPointer(cp_memview, zero_offset),
-    )
-
-
-def cutlass_gemm(state: bench.State) -> None:
-    n = state.get_int64("N")
-    r = state.get_int64("R")
-
-    alpha = state.get_float64("alpha")
-
-    dt = np.float64
-    A_h = np.random.randn(n, r).astype(dt)
-    B_h = np.copy(A_h.mT)
-    C_h = np.eye(n, dtype=dt)
-    D_h = np.zeros_like(C_h)
-
-    if n >= 1024:
-        # allow more time for large inputs
-        state.set_timeout(360)
-
-    dev_id = state.get_device()
-    cs = state.get_stream()
-    s = as_bindings_Stream(cs)
-    core_s = as_core_Stream(cs)
-
-    A_d = core.DeviceMemoryResource(dev_id).allocate(A_h.nbytes, core_s)
-    B_d = core.DeviceMemoryResource(dev_id).allocate(B_h.nbytes, core_s)
-    C_d = core.DeviceMemoryResource(dev_id).allocate(C_h.nbytes, core_s)
-    D_d = core.DeviceMemoryResource(dev_id).allocate(D_h.nbytes, core_s)
-
-    driver.cuMemcpyAsync(A_d.handle, A_h.ctypes.data, A_h.nbytes, s)
-    driver.cuMemcpyAsync(B_d.handle, B_h.ctypes.data, B_h.nbytes, s)
-    driver.cuMemcpyAsync(C_d.handle, C_h.ctypes.data, C_h.nbytes, s)
-    driver.cuMemcpyAsync(D_d.handle, D_h.ctypes.data, D_h.nbytes, s)
-
-    A_cp = make_cp_array(A_h, A_d, dev_id)
-    B_cp = make_cp_array(B_h, B_d, dev_id)
-    C_cp = make_cp_array(C_h, C_d, dev_id)
-    D_cp = make_cp_array(D_h, D_d, dev_id)
-
-    plan = cutlass.op.Gemm(
-        A=A_cp,
-        B=B_cp,
-        C=C_cp,
-        D=D_cp,
-        element=dt,
-        alpha=alpha,
-        beta=1,
-        layout=cutlass.LayoutType.RowMajor,
-    )
-    # warm-up to ensure compilation is not timed
-    plan.run(stream=s)
-
-    def launcher(launch: bench.Launch) -> None:
-        s = as_bindings_Stream(launch.get_stream())
-        plan.run(stream=s, sync=False)
-
-    state.exec(launcher)
-
-
-if __name__ == "__main__":
-    gemm_b = bench.register(cutlass_gemm)
-    gemm_b.add_int64_axis("R", [16, 64, 256])
-    gemm_b.add_int64_axis("N", [256, 512, 1024, 2048])
-
-    gemm_b.add_float64_axis("alpha", [1e-2])
-
-    bench.run_all_benchmarks(sys.argv)
diff --git a/python/examples/exec_tag_sync.py b/python/examples/exec_tag_sync.py
index b9ab5ef..8d57311 100644
--- a/python/examples/exec_tag_sync.py
+++ b/python/examples/exec_tag_sync.py
@@ -20,7 +20,7 @@ from typing import Optional
 
 import cuda.bench as bench
 import cuda.cccl.headers as headers
-import cuda.core.experimental as core
+import cuda.core as core
 
 
 def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
diff --git a/python/examples/requirements.txt b/python/examples/requirements.txt
index 868fcc6..bc17464 100644
--- a/python/examples/requirements.txt
+++ b/python/examples/requirements.txt
@@ -1,8 +1,9 @@
 numpy
 numba
-cupy
-nvidia-cutlass
-cuda-cccl
-cuda-core
 cuda-bindings
+cuda-core
 numba-cuda
+cuda-cccl
+cupy
+nvidia-cute-dsl[cu13]
+torch[cu13]
diff --git a/python/examples/skip.py b/python/examples/skip.py
index cf7ec90..f720003 100644
--- a/python/examples/skip.py
+++ b/python/examples/skip.py
@@ -18,7 +18,7 @@ import sys
 
 import cuda.bench as bench
 import cuda.cccl.headers as headers
-import cuda.core.experimental as core
+import cuda.core as core
 
 
 def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
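
Reviewer note: the same three stream adapters recur across these examples
(axes.py, exec_tag_sync.py, skip.py, and the GEMM example above). A
consolidation sketch, should they ever move into a shared helper module
(hypothetical; the function bodies are copied verbatim from this patch):

import cuda.bench as bench
import cuda.bindings.driver as driver
import cuda.core as core
import cupy as cp


def as_bindings_Stream(cs: bench.CudaStream) -> driver.CUstream:
    # reinterpret the raw address as a driver-level CUstream handle
    return driver.CUstream(cs.addressof())


def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
    # cuda.core borrows, but does not own, the underlying stream
    return core.Stream.from_handle(cs.addressof())


def as_cp_ExternalStream(cs: bench.CudaStream):
    # CuPy consumes the __cuda_stream__ protocol directly
    return cp.cuda.Stream.from_external(cs)
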