From 2507bc226352cd2e628ed705eed8f47074fa357b Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Tue, 1 Jul 2025 14:59:33 -0500 Subject: [PATCH] Add Python example based on C++ example/auto_throughput.cpp --- python/examples/auto_throughput.py | 75 ++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 python/examples/auto_throughput.py diff --git a/python/examples/auto_throughput.py b/python/examples/auto_throughput.py new file mode 100644 index 0000000..4f79217 --- /dev/null +++ b/python/examples/auto_throughput.py @@ -0,0 +1,75 @@ +# Copyright 2025 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 with the LLVM exception +# (the "License"); you may not use this file except in compliance with +# the License. +# +# You may obtain a copy of the License at +# +# http://llvm.org/foundation/relicensing/LICENSE.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import sys

import cuda.nvbench as nvbench
import numpy as np
from numba import cuda


def make_kernel(items_per_thread: int):
    """Build a Numba CUDA copy kernel specialised for ``items_per_thread``.

    ``items_per_thread`` is captured by the closure rather than passed at
    launch time, so every distinct value produces its own ``@cuda.jit``
    function.

    Args:
        items_per_thread: Number of elements each loop iteration of a thread
            copies from the input array to the output array.

    Returns:
        The jitted kernel; it is launched with
        ``(stride, elements, in_arr, out_arr)``.
    """

    @cuda.jit
    def kernel(stride: np.uintp, elements: np.uintp, in_arr, out_arr):
        # Global 1-D thread index and the total number of launched threads.
        tid = cuda.grid(1)
        step = cuda.gridDim.x * cuda.blockDim.x

        # Loop bounds are all scaled by ``stride``: start at stride * tid,
        # stop before stride * elements, advance by stride * step.
        first = stride * tid
        limit = stride * elements
        advance = stride * step

        for i in range(first, limit, advance):
            base = items_per_thread * i
            for j in range(items_per_thread):
                # Reads wrap around the input via the modulo; the j-th write
                # of a thread lands at offset ``tid`` inside the j-th
                # ``elements``-sized slab of the output.
                out_arr[tid + j * elements] = in_arr[(base + j) % elements]

    return kernel


def throughput_bench(state: nvbench.State):
    """Benchmark body: time the copy kernel for one axis configuration.

    Reads the ``"Stride"`` and ``"ItemsPerThread"`` axis values from
    ``state``, allocates the device buffers, reports the element count and
    hands a launcher callable to ``state.exec``.

    Args:
        state: The nvbench state object for the current configuration.
    """
    stride = state.getInt64("Stride")
    items_per_thread = state.getInt64("ItemsPerThread")

    # The input buffer is 128 MiB worth of int32 values.
    buffer_bytes = 128 * 1024 * 1024
    elements = buffer_bytes // np.dtype(np.int32).itemsize

    # Allocate on the stream exposed by ``state`` (wrapped for Numba).  The
    # output holds ``items_per_thread`` slabs of ``elements`` values each.
    setup_stream = cuda.external_stream(state.getStream().addressof())
    src = cuda.device_array(elements, dtype=np.int32, stream=setup_stream)
    dst = cuda.device_array(
        elements * items_per_thread, dtype=np.int32, stream=setup_stream
    )

    state.addElementCount(elements, "Elements")
    # NOTE(review): presumably enables CUPTI-based metric collection for
    # this benchmark -- confirm against the nvbench Python API.
    state.collectCUPTIMetrics()

    # One thread per input element, rounded up to a whole number of blocks.
    block_size = 256
    full_blocks, leftover = divmod(elements, block_size)
    grid_size = full_blocks + (1 if leftover else 0)

    copy_kernel = make_kernel(items_per_thread)

    def launcher(launch: nvbench.Launch):
        # Run on the stream supplied by ``launch``.  The trailing 0 in the
        # launch configuration is the dynamic shared memory size.
        run_stream = cuda.external_stream(launch.getStream().addressof())
        configured = copy_kernel[grid_size, block_size, run_stream, 0]
        configured(stride, elements, src, dst)

    state.exec(launcher)


# Register the benchmark together with the axes it reads from ``state``.
(
    nvbench.register(throughput_bench)
    .addInt64Axis("Stride", [1, 4])
    .addInt64Axis("ItemsPerThread", [1, 2, 3, 4])
)


if __name__ == "__main__":
    print(nvbench.__version__)
    nvbench.run_all_benchmarks(sys.argv)