From 453a1648aa768137b9026183419cdcfcbb2acef2 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Thu, 31 Jul 2025 16:20:52 -0500 Subject: [PATCH] Improvements to readability of examples per PR review --- python/examples/auto_throughput.py | 10 ++++----- python/examples/axes.py | 2 +- .../cccl_parallel_segmented_reduce.py | 6 ++--- python/examples/cupy_extract.py | 10 +++++---- python/examples/throughput.py | 10 ++++----- python/test/run_1.py | 22 +++++++++---------- 6 files changed, 30 insertions(+), 30 deletions(-) diff --git a/python/examples/auto_throughput.py b/python/examples/auto_throughput.py index 80a94e9..1b6e663 100644 --- a/python/examples/auto_throughput.py +++ b/python/examples/auto_throughput.py @@ -21,11 +21,11 @@ import numpy as np from numba import cuda -def as_cuda_Stream(cs: nvbench.CudaStream) -> cuda.cudadrv.driver.Stream: +def as_cuda_stream(cs: nvbench.CudaStream) -> cuda.cudadrv.driver.Stream: return cuda.external_stream(cs.addressof()) -def make_kernel(items_per_thread: int) -> cuda.dispatcher.CUDADispatcher: +def make_throughput_kernel(items_per_thread: int) -> cuda.dispatcher.CUDADispatcher: @cuda.jit def kernel(stride: np.uintp, elements: np.uintp, in_arr, out_arr): tid = cuda.grid(1) @@ -46,7 +46,7 @@ def throughput_bench(state: nvbench.State) -> None: nbytes = 128 * 1024 * 1024 elements = nbytes // np.dtype(np.int32).itemsize - alloc_stream = as_cuda_Stream(state.get_stream()) + alloc_stream = as_cuda_stream(state.get_stream()) inp_arr = cuda.device_array(elements, dtype=np.int32, stream=alloc_stream) out_arr = cuda.device_array(elements * ipt, dtype=np.int32, stream=alloc_stream) @@ -56,7 +56,7 @@ def throughput_bench(state: nvbench.State) -> None: threads_per_block = 256 blocks_in_grid = (elements + threads_per_block - 1) // threads_per_block - krn = make_kernel(ipt) + krn = make_throughput_kernel(ipt) # warm-up call ensures that kernel is loaded into context # before blocking kernel is launched. Kernel loading may cause @@ -66,7 +66,7 @@ def throughput_bench(state: nvbench.State) -> None: ) def launcher(launch: nvbench.Launch): - exec_stream = as_cuda_Stream(launch.get_stream()) + exec_stream = as_cuda_stream(launch.get_stream()) krn[blocks_in_grid, threads_per_block, exec_stream, 0]( stride, elements, inp_arr, out_arr ) diff --git a/python/examples/axes.py b/python/examples/axes.py index f01607b..e07606f 100644 --- a/python/examples/axes.py +++ b/python/examples/axes.py @@ -193,7 +193,7 @@ if __name__ == "__main__": nvbench.register(default_value) # specify axis nvbench.register(single_float64_axis).add_float64_axis( - "Duration", [7e-5, 1e-4, 5e-4] + "Duration (s)", [7e-5, 1e-4, 5e-4] ) copy1_bench = nvbench.register(copy_sweep_grid_shape) diff --git a/python/examples/cccl_parallel_segmented_reduce.py b/python/examples/cccl_parallel_segmented_reduce.py index 58586be..0f440e3 100644 --- a/python/examples/cccl_parallel_segmented_reduce.py +++ b/python/examples/cccl_parallel_segmented_reduce.py @@ -61,10 +61,6 @@ def segmented_reduce(state: nvbench.State): dev_id = state.get_device() cp_stream = as_cp_ExternalStream(state.get_stream(), dev_id) - with cp_stream: - rng = cp.random.default_rng() - mat = rng.integers(low=-31, high=32, dtype=np.int32, size=(n_rows, n_cols)) - def add_op(a, b): return a + b @@ -84,6 +80,8 @@ def segmented_reduce(state: nvbench.State): h_init = np.zeros(tuple(), dtype=np.int32) with cp_stream: + rng = cp.random.default_rng() + mat = rng.integers(low=-31, high=32, dtype=np.int32, size=(n_rows, n_cols)) d_input = mat d_output = cp.empty(n_rows, dtype=d_input.dtype) diff --git a/python/examples/cupy_extract.py b/python/examples/cupy_extract.py index 59177bc..16e5d9f 100644 --- a/python/examples/cupy_extract.py +++ b/python/examples/cupy_extract.py @@ -36,14 +36,16 @@ def cupy_extract_by_mask(state: nvbench.State): state.collect_cupti_metrics() state.add_element_count(n_rows * n_cols, "# Elements") + int32_dt = cp.dtype(cp.int32) + bool_dt = cp.dtype(cp.bool_) state.add_global_memory_reads( - n_rows * n_cols * (cp.dtype(cp.int32).itemsize + cp.dtype("?").itemsize) + n_rows * n_cols * (int32_dt.itemsize + bool_dt.itemsize) ) - state.add_global_memory_writes(n_rows * n_cols * (cp.dtype(cp.int32).itemsize)) + state.add_global_memory_writes(n_rows * n_cols * (int32_dt.itemsize)) with cp_s: - X = cp.full((n_cols, n_rows), fill_value=3, dtype=cp.int32) - mask = cp.ones((n_cols, n_rows), dtype="?") + X = cp.full((n_cols, n_rows), fill_value=3, dtype=int32_dt) + mask = cp.ones((n_cols, n_rows), dtype=bool_dt) _ = X[mask] def launcher(launch: nvbench.Launch): diff --git a/python/examples/throughput.py b/python/examples/throughput.py index 3ae5c1d..5984126 100644 --- a/python/examples/throughput.py +++ b/python/examples/throughput.py @@ -21,11 +21,11 @@ import numpy as np from numba import cuda -def as_cuda_Stream(cs: nvbench.CudaStream) -> cuda.cudadrv.driver.Stream: +def as_cuda_stream(cs: nvbench.CudaStream) -> cuda.cudadrv.driver.Stream: return cuda.external_stream(cs.addressof()) -def make_kernel(items_per_thread: int) -> cuda.dispatcher.CUDADispatcher: +def make_throughput_kernel(items_per_thread: int) -> cuda.dispatcher.CUDADispatcher: @cuda.jit def kernel(stride: np.uintp, elements: np.uintp, in_arr, out_arr): tid = cuda.grid(1) @@ -46,7 +46,7 @@ def throughput_bench(state: nvbench.State) -> None: nbytes = 128 * 1024 * 1024 elements = nbytes // np.dtype(np.int32).itemsize - alloc_stream = as_cuda_Stream(state.get_stream()) + alloc_stream = as_cuda_stream(state.get_stream()) inp_arr = cuda.device_array(elements, dtype=np.int32, stream=alloc_stream) out_arr = cuda.device_array(elements * ipt, dtype=np.int32, stream=alloc_stream) @@ -57,7 +57,7 @@ def throughput_bench(state: nvbench.State) -> None: threads_per_block = 256 blocks_in_grid = (elements + threads_per_block - 1) // threads_per_block - krn = make_kernel(ipt) + krn = make_throughput_kernel(ipt) # warm-up call ensures that kernel is loaded into context # before blocking kernel is launched. Kernel loading may @@ -67,7 +67,7 @@ def throughput_bench(state: nvbench.State) -> None: ) def launcher(launch: nvbench.Launch): - exec_stream = as_cuda_Stream(launch.get_stream()) + exec_stream = as_cuda_stream(launch.get_stream()) krn[blocks_in_grid, threads_per_block, exec_stream, 0]( stride, elements, inp_arr, out_arr ) diff --git a/python/test/run_1.py b/python/test/run_1.py index 0099dc9..dfa38f4 100755 --- a/python/test/run_1.py +++ b/python/test/run_1.py @@ -14,12 +14,15 @@ def kernel(a, b, c): c[tid] = a[tid] + b[tid] -def get_numba_stream(launch): +def get_numba_stream(launch: nvbench.Launch): return cuda.external_stream(launch.get_stream().addressof()) -def add_two(state): - # state.skip("Skipping this benchmark for no reason") +def skipit(state: nvbench.State) -> None: + state.skip("Skipping this benchmark for no reason") + + +def add_two(state: nvbench.State): N = state.get_int64("elements") a = cuda.to_device(np.random.random(N)) c = cuda.device_array_like(a) @@ -44,7 +47,7 @@ def add_two(state): state.exec(kernel_launcher, batched=True, sync=True) -def add_float(state): +def add_float(state: nvbench.State): N = state.get_int64("elements") v = state.get_float64("v") name = state.get_string("name") @@ -75,7 +78,7 @@ def add_float(state): state.exec(kernel_launcher, batched=True, sync=True) -def add_three(state): +def add_three(state: nvbench.State): N = state.get_int64("elements") a = cuda.to_device(np.random.random(N).astype(np.float32)) b = cuda.to_device(np.random.random(N).astype(np.float32)) @@ -105,13 +108,10 @@ def register_benchmarks(): nvbench.register(add_float) .add_float64_axis("v", [0.1, 0.3]) .add_string_axis("name", ["Anne", "Lynda"]) - .add_int64_axis("elements", [2**pow2 for pow2 in range(20, 23)]) - ) - ( - nvbench.register(add_three).add_int64_axis( - "elements", [2**pow2 for pow2 in range(20, 22)] - ) + .add_int64_power_of_two_axis("elements", range(20, 23)) ) + (nvbench.register(add_three).add_int64_power_of_two_axis("elements", range(20, 22))) + nvbench.register(skipit) if __name__ == "__main__":