mirror of https://github.com/NVIDIA/nvbench.git

Merge pull request #331 from oleksandr-pavlyk/update-python-examples

Update python examples
@@ -20,7 +20,7 @@ from typing import Dict, Optional, Tuple
 
 import cuda.bench as bench
 import cuda.cccl.headers as headers
-import cuda.core.experimental as core
+import cuda.core as core
 
 
 def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
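The same one-line change, retiring the pre-1.0 cuda.core.experimental namespace in favor of the stable cuda.core one, repeats across the examples below. A hedged compatibility sketch (not part of this commit) for running the examples against either cuda-core release:

# Hypothetical shim, not in this PR: support both cuda-core namespaces.
try:
    from cuda.core import Stream  # only importable once the namespace is stable
    import cuda.core as core
except ImportError:
    import cuda.core.experimental as core  # pre-1.0 cuda-core releases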
@@ -19,7 +19,7 @@ import time
 
 import cuda.bench as bench
 import cuda.cccl.headers as headers
-import cuda.core.experimental as core
+import cuda.core as core
 
 host_sleep_duration = 0.1
 
@@ -17,36 +17,20 @@
 import sys
 
 import cuda.bench as bench
-import cuda.cccl.parallel.experimental.algorithms as algorithms
-import cuda.cccl.parallel.experimental.iterators as iterators
-import cuda.core.experimental as core
+import cuda.compute.algorithms as algorithms
+import cuda.compute.iterators as iterators
+import cuda.core as core
 import cupy as cp
 import numpy as np
-
-
-class CCCLStream:
-    "Class to work around https://github.com/NVIDIA/cccl/issues/5144"
-
-    def __init__(self, ptr):
-        self._ptr = ptr
-
-    def __cuda_stream__(self):
-        return (0, self._ptr)
+from cuda.compute import OpKind
 
 
 def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
     return core.Stream.from_handle(cs.addressof())
 
 
-def as_cccl_Stream(cs: bench.CudaStream) -> CCCLStream:
-    return CCCLStream(cs.addressof())
-
-
-def as_cp_ExternalStream(
-    cs: bench.CudaStream, dev_id: int | None = -1
-) -> cp.cuda.ExternalStream:
-    h = cs.addressof()
-    return cp.cuda.ExternalStream(h, dev_id)
+def as_cp_ExternalStream(cs: bench.CudaStream) -> cp.cuda.ExternalStream:
+    return cp.cuda.Stream.from_external(cs)
 
 
 def segmented_reduce(state: bench.State):
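The deleted CCCLStream shim existed only to wrap a raw handle in the __cuda_stream__ protocol while https://github.com/NVIDIA/cccl/issues/5144 was open; with that resolved, bench.CudaStream objects can be passed directly. For reference, a minimal sketch of the protocol those consumers rely on (illustrative class name, mirroring the removed shim):

# Any object exposing __cuda_stream__ can be handed to stream consumers
# such as cuda.compute or cupy's Stream.from_external.
class RawStream:
    def __init__(self, ptr: int):
        self._ptr = ptr

    def __cuda_stream__(self) -> tuple[int, int]:
        # (protocol version, CUstream handle as an integer)
        return (0, self._ptr)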
@@ -56,13 +40,8 @@ def segmented_reduce(state: bench.State):
     n_rows = n_elems // n_cols
 
     state.add_summary("numRows", n_rows)
     state.collect_cupti_metrics()
-
-    dev_id = state.get_device()
-    cp_stream = as_cp_ExternalStream(state.get_stream(), dev_id)
-
-    def add_op(a, b):
-        return a + b
+    cp_stream = as_cp_ExternalStream(state.get_stream())
 
     def make_scaler(step):
         def scale(row_id):
@@ -85,15 +64,24 @@ def segmented_reduce(state: bench.State):
     d_input = mat
     d_output = cp.empty(n_rows, dtype=d_input.dtype)
 
-    alg = algorithms.segmented_reduce(
+    add_op = OpKind.PLUS
+
+    alg = algorithms.make_segmented_reduce(
         d_input, d_output, start_offsets, end_offsets, add_op, h_init
     )
 
-    cccl_stream = as_cccl_Stream(state.get_stream())
-
+    cccl_stream = state.get_stream()
     # query size of temporary storage and allocate
     temp_nbytes = alg(
-        None, d_input, d_output, n_rows, start_offsets, end_offsets, h_init, cccl_stream
+        None,
+        d_input,
+        d_output,
+        add_op,
+        n_rows,
+        start_offsets,
+        end_offsets,
+        h_init,
+        cccl_stream,
     )
     h_init = np.zeros(tuple(), dtype=np.int32)
@@ -101,11 +89,12 @@ def segmented_reduce(state: bench.State):
     temp_storage = cp.empty(temp_nbytes, dtype=cp.uint8)
 
     def launcher(launch: bench.Launch):
-        s = as_cccl_Stream(launch.get_stream())
+        s = launch.get_stream()
         alg(
             temp_storage,
             d_input,
             d_output,
+            add_op,
             n_rows,
             start_offsets,
             end_offsets,
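To make the updated two-phase call pattern concrete, here is a hedged standalone sketch of the cuda.compute API as used above. The argument order mirrors this diff; using plain device arrays for the offsets and passing None to run on the default stream are assumptions on my part, not something this commit shows:

import cupy as cp
import numpy as np
import cuda.compute.algorithms as algorithms
from cuda.compute import OpKind

n_rows, n_cols = 4, 8
d_in = cp.arange(n_rows * n_cols, dtype=cp.int32)
d_out = cp.empty(n_rows, dtype=cp.int32)
starts = cp.arange(n_rows, dtype=cp.int64) * n_cols  # segment begins
ends = starts + n_cols                                # segment ends
h_init = np.zeros(tuple(), dtype=np.int32)

alg = algorithms.make_segmented_reduce(
    d_in, d_out, starts, ends, OpKind.PLUS, h_init
)
# First call with temp_storage=None only queries the scratch size ...
nbytes = alg(None, d_in, d_out, OpKind.PLUS, n_rows, starts, ends, h_init, None)
temp = cp.empty(nbytes, dtype=cp.uint8)
# ... the second call performs the row-sum reduction.
alg(temp, d_in, d_out, OpKind.PLUS, n_rows, starts, ends, h_init, None)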
@@ -17,7 +17,7 @@
 import sys
 
 import cuda.bench as bench
-import cuda.cccl.cooperative.experimental as coop
+import cuda.coop as coop
 import numba
 import numpy as np
 from numba import cuda
@@ -20,21 +20,16 @@ import cuda.bench as bench
 import cupy as cp
 
 
-def as_cp_ExternalStream(
-    cs: bench.CudaStream, dev_id: int | None = -1
-) -> cp.cuda.ExternalStream:
-    h = cs.addressof()
-    return cp.cuda.ExternalStream(h, dev_id)
+def as_cp_ExternalStream(cs: bench.CudaStream):
+    return cp.cuda.Stream.from_external(cs)
 
 
 def cupy_extract_by_mask(state: bench.State):
     n_cols = state.get_int64("numCols")
     n_rows = state.get_int64("numRows")
-
-    dev_id = state.get_device()
-    cp_s = as_cp_ExternalStream(state.get_stream(), dev_id)
+    cp_s = as_cp_ExternalStream(state.get_stream())
 
     state.collect_cupti_metrics()
     state.add_element_count(n_rows * n_cols, "# Elements")
     int32_dt = cp.dtype(cp.int32)
     bool_dt = cp.dtype(cp.bool_)
@@ -49,7 +44,7 @@ def cupy_extract_by_mask(state: bench.State):
     _ = X[mask]
 
     def launcher(launch: bench.Launch):
-        with as_cp_ExternalStream(launch.get_stream(), dev_id):
+        with as_cp_ExternalStream(launch.get_stream()):
             _ = X[mask]
 
     state.exec(launcher, sync=True)
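The dropped helper built a cp.cuda.ExternalStream from a raw pointer plus an explicit device id; the replacement hands the bench.CudaStream object straight to CuPy, which reads the handle through the __cuda_stream__ protocol. A hedged side-by-side sketch (illustrative only, both calls taken from this diff):

import cupy as cp

# `cs` is a bench.CudaStream, `dev_id` an int, as in the examples above.
old_style = cp.cuda.ExternalStream(cs.addressof(), dev_id)  # manual plumbing
new_style = cp.cuda.Stream.from_external(cs)                # protocol-based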
python/examples/cute_dsl_sgemm.py (new file, 667 lines)
@@ -0,0 +1,667 @@
# Copyright 2025 NVIDIA Corporation
#
# Licensed under the Apache License, Version 2.0 with the LLVM exception
# (the "License"); you may not use this file except in compliance with
# the License.
#
# You may obtain a copy of the License at
#
# http://llvm.org/foundation/relicensing/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
from typing import Tuple

import cuda.bench as bench
import cuda.bindings.driver as driver
import cuda.core as core
import cupy as cp
import cutlass
import cutlass.cute as cute
import cutlass.pipeline as pipeline
import cutlass.utils as utils
import numpy as np
from cutlass.cute.runtime import from_dlpack


def as_bindings_Stream(cs: bench.CudaStream) -> driver.CUstream:
    return driver.CUstream(cs.addressof())


def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
    return core.Stream.from_handle(cs.addressof())


def make_tensor(arr_h: np.ndarray, dev_buf: core.Buffer, dev_id: int | None):
    cp_memview = cp.cuda.UnownedMemory(
        int(dev_buf.handle), dev_buf.size, dev_buf, -1 if dev_id is None else dev_id
    )
    zero_offset = 0
    return from_dlpack(
        cp.ndarray(
            arr_h.shape,
            dtype=arr_h.dtype,
            memptr=cp.cuda.MemoryPointer(cp_memview, zero_offset),
        ),
        assumed_align=16,
    )

class SGemm:
    """
    Adapted from https://github.com/NVIDIA/cutlass/blob/main/examples/python/CuTeDSL/ampere/sgemm.py
    """

    def __init__(
        self,
        cta_tiler: Tuple[int, int, int] = (128, 128, 8),
        num_stages: int = 3,
        num_threads: int = 256,
    ):
        self._cta_tiler = cta_tiler
        self._num_stages = num_stages
        self._num_threads = num_threads
        assert num_threads > 0, "needs at least one thread"
        assert num_threads % 16 == 0, "multiples of 16 required for MMA thread layout"

        self._bM, self._bN, self._bK = self._cta_tiler
        assert self._bM % 16 == 0, "multiple of 16 required for tile dimension M"
        assert self._bN % 16 == 0, "multiple of 16 required for tile dimension N"
        assert self._num_stages >= 3, "num_stages must be greater than or equal to 3"
        self.cta_sync_barrier = pipeline.NamedBarrier(
            barrier_id=1, num_threads=num_threads
        )
    @cute.jit
    def __call__(
        self,
        mA: cute.Tensor,
        mB: cute.Tensor,
        mC: cute.Tensor,
        epilogue_op: cutlass.Constexpr = lambda x: x,
        stream: driver.CUstream = driver.CUstream(
            driver.CUstream_flags.CU_STREAM_DEFAULT
        ),
    ):
        self.a_major_mode = utils.LayoutEnum.from_tensor(mA)
        self.b_major_mode = utils.LayoutEnum.from_tensor(mB)
        self.c_major_mode = utils.LayoutEnum.from_tensor(mC)

        # ///////////////////////////////////////////////////////////////////////////////
        # Create layouts for shared memory for A and B:
        # - sA/sB is m/n-major to allow vectorized copies from shared
        #   memory to registers. This is because the MMA layouts
        #   for sA/sB are also m/n-major
        # - When gA/gB is k-major, pad 4 elements to reduce bank conflicts
        # ///////////////////////////////////////////////////////////////////////////////

        padding_a = 4 if self.a_major_mode == utils.LayoutEnum.ROW_MAJOR else 0
        padding_b = 4 if self.b_major_mode == utils.LayoutEnum.ROW_MAJOR else 0
        sA_layout = cute.make_layout(
            (self._bM, self._bK, self._num_stages),
            stride=(1, (self._bM + padding_a), self._bK * (self._bM + padding_a)),
        )
        sB_layout = cute.make_layout(
            (self._bN, self._bK, self._num_stages),
            stride=(1, (self._bN + padding_b), self._bK * (self._bN + padding_b)),
        )
        # ///////////////////////////////////////////////////////////////////////////////
        # Create copy layouts that will be used for asynchronous
        # global memory -> shared memory copies:
        # - The majorness of tA/tB follows the majorness of gA/gB
        # - For k-major, these layouts will copy values one-by-one
        #   from global memory, without vectorizing
        # - For m/n-major, it will vectorize to a 128-bit copy for faster
        #   data transfer between global and shared memory, as long
        #   as the alignment of the tensor allows it. Otherwise, it
        #   defaults to a non-vectorized copy
        # ///////////////////////////////////////////////////////////////////////////////

        tA = cute.make_layout(
            (self._num_threads // self._bK, self._bK), stride=(self._bK, 1)
        )
        tB = cute.make_layout(
            (self._num_threads // self._bK, self._bK), stride=(self._bK, 1)
        )
        vA = cute.make_layout((1, 1))
        vB = cute.make_layout((1, 1))
        atom_async_copy_A = cute.make_copy_atom(
            cute.nvgpu.cpasync.CopyG2SOp(),
            mA.element_type,
            num_bits_per_copy=mA.element_type.width,
        )
        atom_async_copy_B = cute.make_copy_atom(
            cute.nvgpu.cpasync.CopyG2SOp(),
            mA.element_type,
            num_bits_per_copy=mB.element_type.width,
        )
        if cutlass.const_expr(self.a_major_mode == utils.LayoutEnum.COL_MAJOR):
            num_vectorized = 4 if (mA.layout[0].max_alignment % 16 == 0) else 1
            atom_async_copy_A = cute.make_copy_atom(
                cute.nvgpu.cpasync.CopyG2SOp(),
                mA.element_type,
                num_bits_per_copy=mA.element_type.width * num_vectorized,
            )
            major_mode_size = self._bM // num_vectorized
            tA = cute.make_layout(
                (major_mode_size, self._num_threads // major_mode_size),
                stride=(1, major_mode_size),
            )
            vA = cute.make_layout((num_vectorized, 1))

        if cutlass.const_expr(self.b_major_mode == utils.LayoutEnum.COL_MAJOR):
            num_vectorized = 4 if (mB.layout[0].max_alignment % 16 == 0) else 1
            atom_async_copy_B = cute.make_copy_atom(
                cute.nvgpu.cpasync.CopyG2SOp(),
                mA.element_type,
                num_bits_per_copy=mB.element_type.width * num_vectorized,
            )
            major_mode_size = self._bN // num_vectorized
            tB = cute.make_layout(
                (major_mode_size, self._num_threads // major_mode_size),
                stride=(1, major_mode_size),
            )
            vB = cute.make_layout((num_vectorized, 1))

        tiled_copy_A = cute.make_tiled_copy_tv(atom_async_copy_A, tA, vA)
        tiled_copy_B = cute.make_tiled_copy_tv(atom_async_copy_B, tB, vB)

        # ///////////////////////////////////////////////////////////////////////////////
        # Create layouts for GEMM:
        # We tile an MMA atom across a tensor. `atoms_layout` is the layout
        # of atoms in the tiled MMA. (Because we use an `MmaUniversalOp`,
        # which has a trivial 1x1x1 MMA trait, `atoms_layout` is also
        # simply the thread layout for C.) `permutation_tiler` reorders the
        # elements of the tensor that the tiled MMA is applied to.
        # Different combinations of `atoms_layout` and `permutation_tiler`
        # values can create different MMA thread-value patterns.
        #
        # Here, the MMA layout is set so that each thread copies four
        # consecutive elements from shared memory to registers.
        # `permutation_tiler_M/N` maps the elements handled by each thread
        # to the permuted element in the tensor.
        # For increasing indices in the tensor, the thread ID that reads it is:
        # - (without permutation) ==>
        #   0 1 2 ... 15 0 1 2 ... 15 0 1 2 ... 15 0 1 2 ... 15 ......
        # - (with permutation) ==>
        #   0 0 0 0 1 1 1 1 2 2 2 2 ... 15 15 15 15 0 0 0 0 1 1 1 1 ......
        # ///////////////////////////////////////////////////////////////////////////////
        atoms_layout = cute.make_layout(
            (self._num_threads // 16, 16, 1), stride=(16, 1, 0)
        )
        if cutlass.const_expr(self.c_major_mode == utils.LayoutEnum.COL_MAJOR):
            atoms_layout = cute.make_layout(
                (16, self._num_threads // 16, 1), stride=(1, 16, 0)
            )
        op = cute.nvgpu.MmaUniversalOp(cutlass.Float32)
        permutation_tiler_M = cute.make_layout(
            (atoms_layout.shape[0], 4), stride=(4, 1)
        )
        permutation_tiler_N = cute.make_layout(
            (atoms_layout.shape[1], 4), stride=(4, 1)
        )
        tiled_mma = cute.make_tiled_mma(
            op,
            atoms_layout,
            permutation_mnk=(permutation_tiler_M, permutation_tiler_N, None),
        )

        # grid_dim: ((m + BLK_M - 1) // BLK_M, (n + BLK_N - 1) // BLK_N, 1)
        grid_dim = *cute.ceil_div(mC.shape, (self._bM, self._bN)), 1

        self.kernel(
            mA,
            mB,
            mC,
            sA_layout,
            sB_layout,
            tiled_copy_A,
            tiled_copy_B,
            tiled_mma,
            epilogue_op,
        ).launch(
            grid=grid_dim,
            block=[cute.size(atoms_layout), 1, 1],
            stream=stream,
        )
    @cute.kernel
    def kernel(
        self,
        mA: cute.Tensor,
        mB: cute.Tensor,
        mC: cute.Tensor,
        sA_layout: cute.Layout,
        sB_layout: cute.Layout,
        tiled_copy_A: cute.TiledCopy,
        tiled_copy_B: cute.TiledCopy,
        tiled_mma: cute.TiledMma,
        epilogue_op: cutlass.Constexpr = lambda x: x,
    ):
        # Thread and block indices
        tidx, tidy, tidz = cute.arch.thread_idx()
        bidx, bidy, bidz = cute.arch.block_idx()
        tiler_coord = (bidx, bidy, None)
        thr_mma = tiled_mma.get_slice(tidx)

        # ///////////////////////////////////////////////////////////////////////////////
        # Get the appropriate tiles for this thread block.
        # gA: (BLK_M, BLK_K, k), gB: (BLK_N, BLK_K, k), gC: (BLK_M, BLK_N)
        # ///////////////////////////////////////////////////////////////////////////////
        gA = cute.local_tile(
            mA, tiler=self._cta_tiler, coord=tiler_coord, proj=(1, None, 1)
        )
        gB = cute.local_tile(
            mB, tiler=self._cta_tiler, coord=tiler_coord, proj=(None, 1, 1)
        )
        gC = cute.local_tile(
            mC, tiler=self._cta_tiler, coord=tiler_coord, proj=(1, 1, None)
        )

        # Move the pointer of gA/gB in the `-k` direction, making the first
        # tile (instead of the last one) irregular in shape when k is irregular.
        # We first handle the irregular tile to avoid checking for this
        # condition within the mainloop.
        residue_k = mA.shape[1] - self._bK * gA.shape[2]
        gA = cute.domain_offset((0, residue_k, 0), gA)
        gB = cute.domain_offset((0, residue_k, 0), gB)
        # ///////////////////////////////////////////////////////////////////////////////
        # Get the appropriate tiles for this thread.
        # sA: (BLK_M, BLK_K, PIPE) , sB: (BLK_N, BLK_K, PIPE)
        # tAgA: (CPY, CPY_M, CPY_K, k) , tBgB: (CPY, CPY_N, CPY_K, k)
        # tAsA: (CPY, CPY_M, CPY_K, PIPE) , tBsB: (CPY, CPY_N, CPY_K, PIPE)
        # ///////////////////////////////////////////////////////////////////////////////
        # Create shared memory buffer
        smem = cutlass.utils.SmemAllocator()
        sA = smem.allocate_tensor(mA.element_type, sA_layout, 16)
        sB = smem.allocate_tensor(mB.element_type, sB_layout, 16)
        thr_copy_A = tiled_copy_A.get_slice(tidx)
        thr_copy_B = tiled_copy_B.get_slice(tidx)
        tAgA = thr_copy_A.partition_S(gA)
        tAsA = thr_copy_A.partition_D(sA)
        tBgB = thr_copy_B.partition_S(gB)
        tBsB = thr_copy_B.partition_D(sB)
        # ///////////////////////////////////////////////////////////////////////////////
        # Predicate: Mark indices that need to copy when the problem shape
        # isn't a multiple of the tile shape. If tApA/B[i] is 0, then do not
        # do the copy atom associated with index i.
        # cA: (BLK_M, BLK_K) => (blk_m, blk_k)
        # cB: (BLK_N, BLK_K) => (blk_n, blk_k)
        # tAcA: (CPY, CPY_M, CPY_K) => (blk_m, blk_k)
        # tBcB: (CPY, CPY_N, CPY_K) => (blk_n, blk_k)
        # tApA: (rest_v, CPY_M, CPY_K), stride=(..., ..., 0)
        # tBpB: (rest_v, CPY_N, CPY_K), stride=(..., ..., 0)
        # CPY = (atom_v, rest_v)
        # ///////////////////////////////////////////////////////////////////////////////
        # Construct identity layout for sA and sB, used for predication
        mcA = cute.make_identity_tensor(mA.shape)
        mcB = cute.make_identity_tensor(mB.shape)
        cA = cute.local_tile(
            mcA, tiler=self._cta_tiler, coord=tiler_coord, proj=(1, None, 1)
        )
        cB = cute.local_tile(
            mcB, tiler=self._cta_tiler, coord=tiler_coord, proj=(None, 1, 1)
        )
        cA = cute.domain_offset((0, residue_k, 0), cA)
        cB = cute.domain_offset((0, residue_k, 0), cB)
        # Repeat the partitioning with identity layouts
        tAcA = thr_copy_A.partition_S(cA)
        tBcB = thr_copy_B.partition_S(cB)
        # Allocate predicate tensors for m and n
        tApA = cute.make_rmem_tensor(
            cute.make_layout(
                (
                    tAsA.shape[0][1],
                    cute.size(tAsA, mode=[1]),
                    cute.size(tAsA, mode=[2]),
                ),
                stride=(cute.size(tAsA, mode=[1]), 1, 0),
            ),
            cutlass.Boolean,
        )
        tBpB = cute.make_rmem_tensor(
            cute.make_layout(
                (
                    tBsB.shape[0][1],
                    cute.size(tBsB, mode=[1]),
                    cute.size(tBsB, mode=[2]),
                ),
                stride=(cute.size(tBsB, mode=[1]), 1, 0),
            ),
            cutlass.Boolean,
        )
        # Allocate predicate tensors for m, n and k for residue k-tile
        tApA_residue_k = cute.make_rmem_tensor(
            cute.make_layout(
                (
                    tAsA.shape[0][1],
                    cute.size(tAsA, mode=[1]),
                    cute.size(tAsA, mode=[2]),
                ),
                stride=(
                    cute.size(tAsA, mode=[1]) * cute.size(tAsA, mode=[2]),
                    cute.size(tAsA, mode=[2]),
                    1,
                ),
            ),
            cutlass.Boolean,
        )
        tBpB_residue_k = cute.make_rmem_tensor(
            cute.make_layout(
                (
                    tBsB.shape[0][1],
                    cute.size(tBsB, mode=[1]),
                    cute.size(tBsB, mode=[2]),
                ),
                stride=(
                    cute.size(tBsB, mode=[1]) * cute.size(tBsB, mode=[2]),
                    cute.size(tBsB, mode=[2]),
                    1,
                ),
            ),
            cutlass.Boolean,
        )
        # Set predicates for m/n bounds for mainloop
        for rest_v in range(tApA.shape[0]):
            for m in range(tApA.shape[1]):
                tApA[rest_v, m, 0] = cute.elem_less(
                    tAcA[(0, rest_v), m, 0, 0][0], mA.shape[0]
                )
        for rest_v in range(tBpB.shape[0]):
            for n in range(tBpB.shape[1]):
                tBpB[rest_v, n, 0] = cute.elem_less(
                    tBcB[(0, rest_v), n, 0, 0][0], mB.shape[0]
                )

        # Set predicates for m/n/k bounds for residue k tile
        for rest_v in range(tApA_residue_k.shape[0]):
            for m in range(tApA_residue_k.shape[1]):
                for k in range(tApA_residue_k.shape[2]):
                    coord_A = tAcA[(0, rest_v), m, k, 0]
                    tApA_residue_k[rest_v, m, k] = cute.elem_less(
                        (coord_A[0], cutlass.Int32(-1)), (mA.shape[0], coord_A[1])
                    )
        for rest_v in range(tBpB_residue_k.shape[0]):
            for n in range(tBpB_residue_k.shape[1]):
                for k in range(tBpB_residue_k.shape[2]):
                    coord_B = tBcB[(0, rest_v), n, k, 0]
                    tBpB_residue_k[rest_v, n, k] = cute.elem_less(
                        (coord_B[0], cutlass.Int32(-1)), (mB.shape[0], coord_B[1])
                    )
        # ///////////////////////////////////////////////////////////////////////////////
        # Prefetch Prologue
        # ///////////////////////////////////////////////////////////////////////////////
        # Start async loads for 0th k-tile, where we take care of the k-residue
        k_pipe_max = cute.size(tAsA, mode=[3])
        k_tile_count = cute.size(tAgA, mode=[3])
        gmem_pipe_read = cutlass.Int32(0)
        cute.copy(
            tiled_copy_A,
            tAgA[None, None, None, gmem_pipe_read],
            tAsA[None, None, None, 0],
            pred=tApA_residue_k,
        )
        cute.copy(
            tiled_copy_B,
            tBgB[None, None, None, gmem_pipe_read],
            tBsB[None, None, None, 0],
            pred=tBpB_residue_k,
        )
        cute.arch.cp_async_commit_group()
        gmem_pipe_read = (
            gmem_pipe_read + 1
            if gmem_pipe_read + 1 < k_tile_count
            else cutlass.Int32(0)
        )
        # Start async loads for 1st k-tile onwards, no k-residue handling needed
        for k_tile in range(1, k_pipe_max - 1):
            if k_tile < k_tile_count:
                cute.copy(
                    tiled_copy_A,
                    tAgA[None, None, None, gmem_pipe_read],
                    tAsA[None, None, None, k_tile],
                    pred=tApA,
                )
                cute.copy(
                    tiled_copy_B,
                    tBgB[None, None, None, gmem_pipe_read],
                    tBsB[None, None, None, k_tile],
                    pred=tBpB,
                )

            gmem_pipe_read = (
                gmem_pipe_read + 1
                if gmem_pipe_read + 1 < k_tile_count
                else cutlass.Int32(0)
            )
            cute.arch.cp_async_commit_group()

        # all tiles have been copied from global memory, so clear the
        # predicate tensor
        if k_tile_count < k_pipe_max:
            for rest_v in range(tApA.shape[0]):
                for m in range(tApA.shape[1]):
                    tApA[rest_v, m, 0] = cutlass.Boolean(0)
            for rest_v in range(tBpB.shape[0]):
                for n in range(tBpB.shape[1]):
                    tBpB[rest_v, n, 0] = cutlass.Boolean(0)
        # ///////////////////////////////////////////////////////////////////////////////
        # Define A/B partitioning and C accumulators.
        # ///////////////////////////////////////////////////////////////////////////////
        tCsA = thr_mma.partition_A(sA)
        tCsB = thr_mma.partition_B(sB)
        tCgC = thr_mma.partition_C(gC)
        tCrA = tiled_mma.make_fragment_A(tCsA[None, None, None, 0])
        tCrB = tiled_mma.make_fragment_B(tCsB[None, None, None, 0])
        tCrC = tiled_mma.make_fragment_C(tCgC)
        # Clear the accumulator
        tCrC.fill(0.0)

        # Current pipe index in smem to read from / write to
        smem_pipe_read = cutlass.Int32(0)
        smem_pipe_write = cutlass.Int32(k_pipe_max - 1)

        tCsA_p = tCsA[None, None, None, smem_pipe_read]
        tCsB_p = tCsB[None, None, None, smem_pipe_read]

        # ///////////////////////////////////////////////////////////////////////////////
        # PREFETCH register pipeline
        # ///////////////////////////////////////////////////////////////////////////////
        k_block_max = cute.size(tCrA, mode=[2])

        if k_block_max > 1:
            # Wait until our first prefetched tile is loaded in
            cute.arch.cp_async_wait_group(k_pipe_max - 2)
            self.cta_sync_barrier.arrive_and_wait()
            # Prefetch the first rmem from the first k-tile
            cute.autovec_copy(tCsA_p[None, None, 0], tCrA[None, None, 0])
            cute.autovec_copy(tCsB_p[None, None, 0], tCrB[None, None, 0])
        # ///////////////////////////////////////////////////////////////////////////////
        # Mainloop
        # 1. Shared memory pipeline (gmem -> smem):
        #    The default smem pipeline depth is 3, meaning that for shared
        #    memory buffers, we allocate three times the size described by the
        #    CTA tiler. We prefetch 2 of these buffers before entering the main
        #    loop. Considering only the transfer from global memory to shared
        #    memory, the general structure of the mainloop is:
        #    (1) copy k-tile from gmem to smem;
        #    (2) perform gemm computation on k-tile;
        #    (3) wait for the next copy to finish.
        #    The `cute.arch.cp_async_wait_group(num_smem_stages - 2)` command
        #    waits for the number of unfinished 'copy' to be <= 1. The advantage
        #    of this approach is that it allows for simultaneous production
        #    (i.e., step (1)) and consumption (i.e., step (2)) of smem.
        #    A common misconception is to prefetch N buffers and rewrite
        #    the pipeline logic to wait on N-1 pending copies. The disadvantage
        #    of this approach is that it requires fully consuming a buffer in
        #    order to open an empty buffer for the next copy.
        # 2. Register pipeline (smem -> register):
        #    Similarly, the register pipeline produces i+1, consumes i, and
        #    produces i+2... Notably, i and i+1 do not use the same register,
        #    eliminating dependencies on the same register for better parallelism.
        # 3. Combining the smem and register pipelines results in the mainloop.
        # ///////////////////////////////////////////////////////////////////////////////

        for _ in range(k_tile_count):
            for k_block in range(k_block_max, unroll_full=True):
                if k_block == k_block_max - 1:
                    tCsA_p = tCsA[None, None, None, smem_pipe_read]
                    tCsB_p = tCsB[None, None, None, smem_pipe_read]
                    cute.arch.cp_async_wait_group(k_pipe_max - 2)
                    self.cta_sync_barrier.arrive_and_wait()

                # Load A, B from shared memory to registers for k_block + 1
                k_block_next = (k_block + 1) % k_block_max  # static
                cute.autovec_copy(
                    tCsA_p[None, None, k_block_next],
                    tCrA[None, None, k_block_next],
                )
                cute.autovec_copy(
                    tCsB_p[None, None, k_block_next],
                    tCrB[None, None, k_block_next],
                )

                # Fetch next A: To better interleave global memory access and
                # compute instructions, we intentionally use the sequence:
                # copy A, perform GEMM, then copy B.
                if k_block == 0:
                    cute.copy(
                        tiled_copy_A,
                        tAgA[None, None, None, gmem_pipe_read],
                        tAsA[None, None, None, smem_pipe_write],
                        # Use predicates because the m-mode may be irregular
                        pred=tApA,
                    )

                # Thread-level register gemm for k_block
                cute.gemm(
                    tiled_mma,
                    tCrC,
                    tCrA[None, None, k_block],
                    tCrB[None, None, k_block],
                    tCrC,
                )

                # Fetch next B and update smem pipeline read/write
                if k_block == 0:
                    cute.copy(
                        tiled_copy_B,
                        tBgB[None, None, None, gmem_pipe_read],
                        tBsB[None, None, None, smem_pipe_write],
                        # Use predicates because the n-mode may be irregular
                        pred=tBpB,
                    )
                    cute.arch.cp_async_commit_group()
                    smem_pipe_write = smem_pipe_read
                    smem_pipe_read = smem_pipe_read + 1
                    if smem_pipe_read == k_pipe_max:
                        smem_pipe_read = cutlass.Int32(0)
                    # After copying all tiles, we avoid clearing the predicate
                    # tensor in the `mainloop` to prevent increasing its
                    # instruction count. Instead, we continue copying the
                    # first tile, though it won't be used. The 0-th tile is not
                    # copied due to its irregular shape, which could lead to
                    # illegal memory accesses.
                    gmem_pipe_read = (
                        gmem_pipe_read + 1
                        if gmem_pipe_read + 1 < k_tile_count
                        else cutlass.Int32(1)
                    )
        # ///////////////////////////////////////////////////////////////////////////////
        # Epilogue
        # Applies the epilogue operation to the accumulated results and copies
        # them without vectorization.
        # ///////////////////////////////////////////////////////////////////////////////
        cute.arch.cp_async_wait_group(0)
        self.cta_sync_barrier.arrive_and_wait()
        tCrC.store(epilogue_op(tCrC.load()))

        # predicate
        cC = cute.make_identity_tensor(gC.shape)
        tCpC = thr_mma.partition_C(cC)
        predC = cute.make_rmem_tensor(tCrC.layout, cutlass.Boolean)
        residue_m = mC.shape[0] - cutlass.Int32(self._bM) * bidx
        residue_n = mC.shape[1] - cutlass.Int32(self._bN) * bidy
        for i in range(cute.size(tCrC.shape)):
            predC[i] = cute.elem_less(tCpC[i], (residue_m, residue_n))
        atom = cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), mC.element_type)
        cute.copy(atom, tCrC, tCgC, pred=predC)
        return

def cutlass_gemm(state: bench.State) -> None:
    n = state.get_int64("N")
    r = state.get_int64("R")

    dt = np.float32
    A_h = np.random.randn(n, r).astype(dt)
    B_h = np.copy(A_h.mT)
    C_h = np.eye(n, dtype=dt)

    if n >= 1024:
        # allow more time for large inputs
        state.set_timeout(360)

    dev_id = state.get_device()
    cs = state.get_stream()
    s = as_bindings_Stream(cs)
    core_s = as_core_Stream(cs)

    A_d = core.DeviceMemoryResource(dev_id).allocate(A_h.nbytes, core_s)
    B_d = core.DeviceMemoryResource(dev_id).allocate(B_h.nbytes, core_s)
    C_d = core.DeviceMemoryResource(dev_id).allocate(C_h.nbytes, core_s)

    driver.cuMemcpyAsync(A_d.handle, A_h.ctypes.data, A_h.nbytes, s)
    driver.cuMemcpyAsync(B_d.handle, B_h.ctypes.data, B_h.nbytes, s)
    driver.cuMemcpyAsync(C_d.handle, C_h.ctypes.data, C_h.nbytes, s)

    A_cp = make_tensor(A_h, A_d, dev_id)
    B_cp = make_tensor(B_h, B_d, dev_id)
    C_cp = make_tensor(C_h, C_d, dev_id)

    sgemm = SGemm()
    _ = sgemm(A_cp, B_cp, C_cp, stream=s)

    def launcher(launch: bench.Launch) -> None:
        s = as_bindings_Stream(launch.get_stream())
        sgemm(A_cp, B_cp, C_cp, stream=s)

    state.exec(launcher)


def patch_cute_dsl():
    def _no_op_diagnostic(self):
        return

    try:
        import cutlass.base_dsl.dsl as dsl_m

        base_dsl_k = dsl_m.BaseDSL
        if hasattr(base_dsl_k, "diagnostic"):
            base_dsl_k.diagnostic = _no_op_diagnostic
    except (ModuleNotFoundError, AttributeError):
        pass


if __name__ == "__main__":
    # see https://github.com/NVIDIA/cutlass/issues/3142
    patch_cute_dsl()

    gemm_b = bench.register(cutlass_gemm)
    gemm_b.add_int64_axis("R", [16, 64, 256])
    gemm_b.add_int64_axis("N", [256, 512, 1024, 2048])

    bench.run_all_benchmarks(sys.argv)
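A worked example of the launch configuration implied by the defaults above (my arithmetic, not part of the file): with cta_tiler=(128, 128, 8), num_stages=3, num_threads=256, and a 2048x2048 C tensor:

import math

bM, bN, bK, num_stages, num_threads = 128, 128, 8, 3, 256
m = n = 2048
grid = (math.ceil(m / bM), math.ceil(n / bN), 1)  # (16, 16, 1) thread blocks
smem_elems = num_stages * (bM * bK + bN * bK)     # 6144 float32s, before padding
print(grid, num_threads, smem_elems)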
(deleted file, 113 lines)
@@ -1,113 +0,0 @@
# Copyright 2025 NVIDIA Corporation
#
# Licensed under the Apache License, Version 2.0 with the LLVM exception
# (the "License"); you may not use this file except in compliance with
# the License.
#
# You may obtain a copy of the License at
#
# http://llvm.org/foundation/relicensing/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import sys

import cuda.bench as bench
import cuda.bindings.driver as driver
import cuda.core.experimental as core
import cupy as cp
import cutlass
import numpy as np


def as_bindings_Stream(cs: bench.CudaStream) -> driver.CUstream:
    return driver.CUstream(cs.addressof())


def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
    return core.Stream.from_handle(cs.addressof())


def make_cp_array(
    arr_h: np.ndarray, dev_buf: core.Buffer, dev_id: int | None
) -> cp.ndarray:
    cp_memview = cp.cuda.UnownedMemory(
        int(dev_buf.handle), dev_buf.size, dev_buf, -1 if dev_id is None else dev_id
    )
    zero_offset = 0
    return cp.ndarray(
        arr_h.shape,
        dtype=arr_h.dtype,
        memptr=cp.cuda.MemoryPointer(cp_memview, zero_offset),
    )


def cutlass_gemm(state: bench.State) -> None:
    n = state.get_int64("N")
    r = state.get_int64("R")

    alpha = state.get_float64("alpha")

    dt = np.float64
    A_h = np.random.randn(n, r).astype(dt)
    B_h = np.copy(A_h.mT)
    C_h = np.eye(n, dtype=dt)
    D_h = np.zeros_like(C_h)

    if n >= 1024:
        # allow more time for large inputs
        state.set_timeout(360)

    dev_id = state.get_device()
    cs = state.get_stream()
    s = as_bindings_Stream(cs)
    core_s = as_core_Stream(cs)

    A_d = core.DeviceMemoryResource(dev_id).allocate(A_h.nbytes, core_s)
    B_d = core.DeviceMemoryResource(dev_id).allocate(B_h.nbytes, core_s)
    C_d = core.DeviceMemoryResource(dev_id).allocate(C_h.nbytes, core_s)
    D_d = core.DeviceMemoryResource(dev_id).allocate(D_h.nbytes, core_s)

    driver.cuMemcpyAsync(A_d.handle, A_h.ctypes.data, A_h.nbytes, s)
    driver.cuMemcpyAsync(B_d.handle, B_h.ctypes.data, B_h.nbytes, s)
    driver.cuMemcpyAsync(C_d.handle, C_h.ctypes.data, C_h.nbytes, s)
    driver.cuMemcpyAsync(D_d.handle, D_h.ctypes.data, D_h.nbytes, s)

    A_cp = make_cp_array(A_h, A_d, dev_id)
    B_cp = make_cp_array(B_h, B_d, dev_id)
    C_cp = make_cp_array(C_h, C_d, dev_id)
    D_cp = make_cp_array(D_h, D_d, dev_id)

    plan = cutlass.op.Gemm(
        A=A_cp,
        B=B_cp,
        C=C_cp,
        D=D_cp,
        element=dt,
        alpha=alpha,
        beta=1,
        layout=cutlass.LayoutType.RowMajor,
    )
    # warm-up to ensure compilation is not timed
    plan.run(stream=s)

    def launcher(launch: bench.Launch) -> None:
        s = as_bindings_Stream(launch.get_stream())
        plan.run(stream=s, sync=False)

    state.exec(launcher)


if __name__ == "__main__":
    gemm_b = bench.register(cutlass_gemm)
    gemm_b.add_int64_axis("R", [16, 64, 256])
    gemm_b.add_int64_axis("N", [256, 512, 1024, 2048])

    gemm_b.add_float64_axis("alpha", [1e-2])

    bench.run_all_benchmarks(sys.argv)
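Both the deleted nvidia-cutlass example and its CuTe DSL replacement follow the same nvbench discipline: run the plan once outside the timed region so JIT compilation is excluded, then replay only the launch inside the launcher. A simplified, self-contained sketch of that pattern (illustrative benchmark body; the real examples additionally route launch.get_stream() into the library as shown earlier):

import sys

import cuda.bench as bench
import cupy as cp


def my_bench(state: bench.State):
    x = cp.random.rand(1 << 20)
    _ = cp.sum(x)  # warm-up outside the timed region

    def launcher(launch: bench.Launch):
        _ = cp.sum(x)  # only this replayed work is measured

    state.exec(launcher, sync=True)


if __name__ == "__main__":
    bench.register(my_bench)
    bench.run_all_benchmarks(sys.argv)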
@@ -20,7 +20,7 @@ from typing import Optional
 
 import cuda.bench as bench
 import cuda.cccl.headers as headers
-import cuda.core.experimental as core
+import cuda.core as core
 
 
 def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
@@ -1,8 +1,9 @@
 numpy
 numba
-cupy
-nvidia-cutlass
-cuda-cccl
-cuda-core
 cuda-bindings
+cuda-core
+numba-cuda
+cuda-cccl
+cupy
+nvidia-cute-dsl[cu13]
+torch[cu13]
@@ -18,7 +18,7 @@ import sys
 
 import cuda.bench as bench
 import cuda.cccl.headers as headers
-import cuda.core.experimental as core
+import cuda.core as core
 
 
 def as_core_Stream(cs: bench.CudaStream) -> core.Stream: