CUTLASS example added, license headers added, fixes

- Added a license header to each example file. - Fixed broken runs caused by type declarations. - Fixed a hang in throughput.py when run with --run-once by performing a manual warm-up step, as in auto_throughput.py
2026-04-19 22:38:52 +00:00 · 2025-07-24 09:33:13 -05:00
parent c136efab65
commit a69a3647b2
10 changed files with 226 additions and 3 deletions
--- a/python/examples/throughput.py
+++ b/python/examples/throughput.py
@@ -25,7 +25,7 @@ def as_cuda_Stream(cs: nvbench.CudaStream) -> cuda.cudadrv.driver.Stream:
    return cuda.external_stream(cs.addressof())


-def make_kernel(items_per_thread: int) -> cuda.compiler.AutoJitCUDAKernel:
+def make_kernel(items_per_thread: int) -> cuda.dispatcher.CUDADispatcher:
    @cuda.jit
    def kernel(stride: np.uintp, elements: np.uintp, in_arr, out_arr):
        tid = cuda.grid(1)
@@ -59,6 +59,13 @@ def throughput_bench(state: nvbench.State) -> None:

    krn = make_kernel(ipt)

+    # warm-up call ensures that kernel is loaded into context
+    # before blocking kernel is launched. Kernel loading may
+    # cause synchronization to occur.
+    krn[blocks_in_grid, threads_per_block, alloc_stream, 0](
+        stride, elements, inp_arr, out_arr
+    )
+
    def launcher(launch: nvbench.Launch):
        exec_stream = as_cuda_Stream(launch.get_stream())
        krn[blocks_in_grid, threads_per_block, exec_stream, 0](