Add examples/exec_tag_sync.py

This commit is contained in:
Oleksandr Pavlyk
2025-07-02 11:49:32 -05:00
parent 4f15840832
commit 964ec2e1bc

View File

@@ -0,0 +1,70 @@
import ctypes
import sys
from typing import Optional
import cuda.cccl.headers as headers
import cuda.core.experimental as core
import cuda.nvbench as nvbench
def make_fill_kernel(data_type: Optional[str] = None):
    """Compile the templated ``fill_kernel`` and return the kernel object.

    Parameters
    ----------
    data_type:
        C++ type name used to instantiate the kernel template. When ``None``,
        ``::cuda::std::int32_t`` is used.

    Returns
    -------
    The compiled kernel, retrieved from the cubin by its instantiated name.
    """
    cuda_src = r"""
#include <cuda/std/cstdint>
#include <cuda/std/cstddef>
/*!
 * Naive setting of values in buffer
 */
template <typename T>
__global__ void fill_kernel(T *buf, T v, ::cuda::std::size_t n)
{
  const auto init = blockIdx.x * blockDim.x + threadIdx.x;
  const auto step = blockDim.x * gridDim.x;
  for (auto i = init; i < n; i += step)
  {
    buf[i] = v;
  }
}
"""
    # Default to a 32-bit signed integer instantiation.
    if data_type is None:
        data_type = "::cuda::std::int32_t"
    instance_name = f"fill_kernel<{data_type}>"

    # libcu++ headers are needed for ::cuda::std types in the kernel source.
    include_paths = headers.get_include_paths()
    program = core.Program(
        cuda_src,
        code_type="c++",
        options=core.ProgramOptions(include_path=str(include_paths.libcudacxx)),
    )
    module = program.compile("cubin", name_expressions=(instance_name,))
    return module.get_kernel(instance_name)
def synchronizing_bench(state: nvbench.State):
    """nvbench benchmark filling a device buffer with a synchronizing launcher.

    Allocates a 64M-element int32 buffer on the benchmark's device/stream,
    records the element count and bytes written, then times a kernel launch
    whose launcher synchronizes its stream after each invocation.
    """
    num_items = 64 * 1024 * 1024
    num_bytes = num_items * ctypes.sizeof(ctypes.c_int32(0))

    # Bind to the device/stream nvbench selected for this measurement.
    device = core.Device(state.getDevice())
    device.set_current()
    alloc_stream = device.create_stream(state.getStream())
    buf = core.DeviceMemoryResource(device).allocate(num_bytes, alloc_stream)

    # Report throughput metrics to nvbench.
    state.addElementCount(num_items, "Items")
    state.addGlobalMemoryWrites(num_bytes, "Size")

    kernel = make_fill_kernel()
    config = core.LaunchConfig(grid=256, block=256, shmem_size=0)

    def launcher(launch: nvbench.Launch):
        dev = core.Device()
        dev.set_current()
        exec_stream = dev.create_stream(launch.getStream())
        core.launch(exec_stream, config, kernel, buf, 0, num_items)
        # Block until the kernel has finished on this stream.
        exec_stream.sync()

    # since launcher contains synchronization point,
    # setting sync=True is required to avoid a deadlock
    state.exec(launcher, sync=True)
if __name__ == "__main__":
    # Register the benchmark with nvbench, then run all registered
    # benchmarks, letting nvbench parse the command-line arguments.
    nvbench.register(synchronizing_bench)
    nvbench.run_all_benchmarks(sys.argv)