mirror of
https://github.com/NVIDIA/nvbench.git
synced 2026-04-19 22:38:52 +00:00
CUTLASS example added, license headers added, fixes
- Added a license header to each example file. - Fixed broken runs caused by type declarations. - Fixed a hang in throughput.py when using --run-once by performing a manual warm-up step, as in auto_throughput.py
This commit is contained in:
@@ -25,7 +25,7 @@ def as_cuda_Stream(cs: nvbench.CudaStream) -> cuda.cudadrv.driver.Stream:
|
||||
return cuda.external_stream(cs.addressof())
|
||||
|
||||
|
||||
def make_kernel(items_per_thread: int) -> cuda.compiler.AutoJitCUDAKernel:
|
||||
def make_kernel(items_per_thread: int) -> cuda.dispatcher.CUDADispatcher:
|
||||
@cuda.jit
|
||||
def kernel(stride: np.uintp, elements: np.uintp, in_arr, out_arr):
|
||||
tid = cuda.grid(1)
|
||||
@@ -59,6 +59,13 @@ def throughput_bench(state: nvbench.State) -> None:
|
||||
|
||||
krn = make_kernel(ipt)
|
||||
|
||||
# warm-up call ensures that kernel is loaded into context
|
||||
# before blocking kernel is launched. Kernel loading may
|
||||
# cause synchronization to occur.
|
||||
krn[blocks_in_grid, threads_per_block, alloc_stream, 0](
|
||||
stride, elements, inp_arr, out_arr
|
||||
)
|
||||
|
||||
def launcher(launch: nvbench.Launch):
|
||||
exec_stream = as_cuda_Stream(launch.get_stream())
|
||||
krn[blocks_in_grid, threads_per_block, exec_stream, 0](
|
||||
|
||||
Reference in New Issue
Block a user