CUTLASS example added, license headers added, fixes

- Added a license header to each example file.
- Fixed broken runs caused by type declarations.
- Fixed a hang in throughput.py when --run-once is used by doing a
  manual warm-up step, as in auto_throughput.py.
This commit is contained in:
Oleksandr Pavlyk
2025-07-24 09:33:13 -05:00
parent c136efab65
commit a69a3647b2
10 changed files with 226 additions and 3 deletions

View File

@@ -25,7 +25,7 @@ def as_cuda_Stream(cs: nvbench.CudaStream) -> cuda.cudadrv.driver.Stream:
return cuda.external_stream(cs.addressof())
def make_kernel(items_per_thread: int) -> cuda.compiler.AutoJitCUDAKernel:
def make_kernel(items_per_thread: int) -> cuda.dispatcher.CUDADispatcher:
@cuda.jit
def kernel(stride: np.uintp, elements: np.uintp, in_arr, out_arr):
tid = cuda.grid(1)
@@ -59,6 +59,13 @@ def throughput_bench(state: nvbench.State) -> None:
krn = make_kernel(ipt)
# warm-up call ensures that kernel is loaded into context
# before blocking kernel is launched. Kernel loading may
# cause synchronization to occur.
krn[blocks_in_grid, threads_per_block, alloc_stream, 0](
stride, elements, inp_arr, out_arr
)
def launcher(launch: nvbench.Launch):
exec_stream = as_cuda_Stream(launch.get_stream())
krn[blocks_in_grid, threads_per_block, exec_stream, 0](