diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 139b80b..b18f7ef 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -33,9 +33,9 @@ set_target_properties(_nvbench PROPERTIES INSTALL_RPATH "$ORIGIN") set_target_properties(_nvbench PROPERTIES INTERPROCEDURAL_OPTIMIZATION ON) set_target_properties(_nvbench PROPERTIES POSITION_INDEPENDENT_CODE ON) -install(TARGETS _nvbench DESTINATION cuda/nvbench) +install(TARGETS _nvbench DESTINATION cuda/bench) # Determine target that nvbench::nvbench is an alias of, # necessary because ALIAS targets cannot be installed get_target_property(_aliased_target_name nvbench::nvbench ALIASED_TARGET) -install(IMPORTED_RUNTIME_ARTIFACTS ${_aliased_target_name} DESTINATION cuda/nvbench) +install(IMPORTED_RUNTIME_ARTIFACTS ${_aliased_target_name} DESTINATION cuda/bench) diff --git a/python/cuda/nvbench/__init__.py b/python/cuda/bench/__init__.py similarity index 80% rename from python/cuda/nvbench/__init__.py rename to python/cuda/bench/__init__.py index 4617e52..e1d2282 100644 --- a/python/cuda/nvbench/__init__.py +++ b/python/cuda/bench/__init__.py @@ -34,25 +34,25 @@ except Exception as e: for libname in ("cupti", "nvperf_target", "nvperf_host"): load_nvidia_dynamic_lib(libname) -from cuda.nvbench._nvbench import ( # noqa: E402 +from cuda.bench._nvbench import ( # noqa: E402 Benchmark as Benchmark, ) -from cuda.nvbench._nvbench import ( # noqa: E402 +from cuda.bench._nvbench import ( # noqa: E402 CudaStream as CudaStream, ) -from cuda.nvbench._nvbench import ( # noqa: E402 +from cuda.bench._nvbench import ( # noqa: E402 Launch as Launch, ) -from cuda.nvbench._nvbench import ( # noqa: E402 +from cuda.bench._nvbench import ( # noqa: E402 NVBenchRuntimeError as NVBenchRuntimeError, ) -from cuda.nvbench._nvbench import ( # noqa: E402 +from cuda.bench._nvbench import ( # noqa: E402 State as State, ) -from cuda.nvbench._nvbench import ( # noqa: E402 +from cuda.bench._nvbench import ( # noqa: E402 register as 
register, ) -from cuda.nvbench._nvbench import ( # noqa: E402 run_all_benchmarks as run_all_benchmarks, ) diff --git a/python/cuda/nvbench/__init__.pyi b/python/cuda/bench/__init__.pyi similarity index 99% rename from python/cuda/nvbench/__init__.pyi rename to python/cuda/bench/__init__.pyi index a0bca3d..86681fc 100644 --- a/python/cuda/nvbench/__init__.pyi +++ b/python/cuda/bench/__init__.pyi @@ -44,9 +44,9 @@ class CudaStream: Example ------- import cuda.core.experimental as core - import cuda.nvbench as nvbench + import cuda.bench as bench - def bench(state: nvbench.State): + def my_bench(state: bench.State): dev = core.Device(state.get_device()) dev.set_current() # converts CudaString to core.Stream diff --git a/python/cuda/nvbench/py.typed b/python/cuda/bench/py.typed similarity index 100% rename from python/cuda/nvbench/py.typed rename to python/cuda/bench/py.typed diff --git a/python/examples/auto_throughput.py b/python/examples/auto_throughput.py index 88691ec..db4fa19 100644 --- a/python/examples/auto_throughput.py +++ b/python/examples/auto_throughput.py @@ -16,12 +16,12 @@ import sys -import cuda.nvbench as nvbench +import cuda.bench as bench import numpy as np from numba import cuda -def as_cuda_stream(cs: nvbench.CudaStream) -> cuda.cudadrv.driver.Stream: +def as_cuda_stream(cs: bench.CudaStream) -> cuda.cudadrv.driver.Stream: return cuda.external_stream(cs.addressof()) @@ -39,7 +39,7 @@ def make_throughput_kernel(items_per_thread: int) -> cuda.dispatcher.CUDADispatc return kernel -def throughput_bench(state: nvbench.State) -> None: +def throughput_bench(state: bench.State) -> None: stride = state.get_int64("Stride") ipt = state.get_int64("ItemsPerThread") @@ -58,7 +58,7 @@ def throughput_bench(state: nvbench.State) -> None: krn = make_throughput_kernel(ipt) - def launcher(launch: nvbench.Launch): + def launcher(launch: bench.Launch): exec_stream = as_cuda_stream(launch.get_stream()) krn[blocks_in_grid, 
threads_per_block, exec_stream, 0]( stride, elements, inp_arr, out_arr @@ -68,8 +68,8 @@ def throughput_bench(state: nvbench.State) -> None: if __name__ == "__main__": - b = nvbench.register(throughput_bench) + b = bench.register(throughput_bench) b.add_int64_axis("Stride", [1, 2, 4]) b.add_int64_axis("ItemsPerThread", [1, 2, 3, 4]) - nvbench.run_all_benchmarks(sys.argv) + bench.run_all_benchmarks(sys.argv) diff --git a/python/examples/axes.py b/python/examples/axes.py index e07606f..ce67238 100644 --- a/python/examples/axes.py +++ b/python/examples/axes.py @@ -18,12 +18,12 @@ import ctypes import sys from typing import Dict, Optional, Tuple +import cuda.bench as bench import cuda.cccl.headers as headers import cuda.core.experimental as core -import cuda.nvbench as nvbench -def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream: +def as_core_Stream(cs: bench.CudaStream) -> core.Stream: return core.Stream.from_handle(cs.addressof()) @@ -58,34 +58,34 @@ __global__ void sleep_kernel(double seconds) { return mod.get_kernel("sleep_kernel") -def simple(state: nvbench.State): +def simple(state: bench.State): state.set_min_samples(1000) sleep_dur = 1e-3 krn = make_sleep_kernel() launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0) - def launcher(launch: nvbench.Launch): + def launcher(launch: bench.Launch): s = as_core_Stream(launch.get_stream()) core.launch(s, launch_config, krn, sleep_dur) state.exec(launcher) -def single_float64_axis(state: nvbench.State): +def single_float64_axis(state: bench.State): # get axis value, or default default_sleep_dur = 3.14e-4 sleep_dur = state.get_float64_or_default("Duration", default_sleep_dur) krn = make_sleep_kernel() launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0) - def launcher(launch: nvbench.Launch): + def launcher(launch: bench.Launch): s = as_core_Stream(launch.get_stream()) core.launch(s, launch_config, krn, sleep_dur) state.exec(launcher) -def default_value(state: nvbench.State): +def 
default_value(state: bench.State): single_float64_axis(state) @@ -120,7 +120,7 @@ __global__ void copy_kernel(const T *in, U *out, ::cuda::std::size_t n) return mod.get_kernel(instance_name) -def copy_sweep_grid_shape(state: nvbench.State): +def copy_sweep_grid_shape(state: bench.State): block_size = state.get_int64("BlockSize") num_blocks = state.get_int64("NumBlocks") @@ -140,14 +140,14 @@ def copy_sweep_grid_shape(state: nvbench.State): krn = make_copy_kernel() launch_config = core.LaunchConfig(grid=num_blocks, block=block_size, shmem_size=0) - def launcher(launch: nvbench.Launch): + def launcher(launch: bench.Launch): s = as_core_Stream(launch.get_stream()) core.launch(s, launch_config, krn, input_buf, output_buf, num_values) state.exec(launcher) -def copy_type_sweep(state: nvbench.State): +def copy_type_sweep(state: bench.State): type_id = state.get_int64("TypeID") types_map: Dict[int, Tuple[type, str]] = { @@ -178,7 +178,7 @@ def copy_type_sweep(state: nvbench.State): krn = make_copy_kernel(value_cuda_t, value_cuda_t) launch_config = core.LaunchConfig(grid=256, block=256, shmem_size=0) - def launcher(launch: nvbench.Launch): + def launcher(launch: bench.Launch): s = as_core_Stream(launch.get_stream()) core.launch(s, launch_config, krn, input_buf, output_buf, num_values) @@ -187,20 +187,20 @@ def copy_type_sweep(state: nvbench.State): if __name__ == "__main__": # Benchmark without axes - nvbench.register(simple) + bench.register(simple) # benchmark with no axes, that uses default value - nvbench.register(default_value) + bench.register(default_value) # specify axis - nvbench.register(single_float64_axis).add_float64_axis( + bench.register(single_float64_axis).add_float64_axis( "Duration (s)", [7e-5, 1e-4, 5e-4] ) - copy1_bench = nvbench.register(copy_sweep_grid_shape) + copy1_bench = bench.register(copy_sweep_grid_shape) copy1_bench.add_int64_axis("BlockSize", [2**x for x in range(6, 10, 2)]) copy1_bench.add_int64_axis("NumBlocks", [2**x for x in range(6, 10, 
2)]) - copy2_bench = nvbench.register(copy_type_sweep) + copy2_bench = bench.register(copy_type_sweep) copy2_bench.add_int64_axis("TypeID", range(0, 6)) - nvbench.run_all_benchmarks(sys.argv) + bench.run_all_benchmarks(sys.argv) diff --git a/python/examples/cccl_cooperative_block_reduce.py b/python/examples/cccl_cooperative_block_reduce.py index dc9a6eb..ea5bcf0 100644 --- a/python/examples/cccl_cooperative_block_reduce.py +++ b/python/examples/cccl_cooperative_block_reduce.py @@ -16,8 +16,8 @@ import sys +import cuda.bench as bench import cuda.cccl.cooperative.experimental as coop -import cuda.nvbench as nvbench import numba import numpy as np from numba import cuda @@ -45,11 +45,11 @@ class BitsetRing: return op1 & op2 -def as_cuda_Stream(cs: nvbench.CudaStream) -> cuda.cudadrv.driver.Stream: +def as_cuda_Stream(cs: bench.CudaStream) -> cuda.cudadrv.driver.Stream: return cuda.external_stream(cs.addressof()) -def multi_block_bench(state: nvbench.State): +def multi_block_bench(state: bench.State): threads_per_block = state.get_int64("ThreadsPerBlock") num_blocks = state.get_int64("NumBlocks") total_elements = threads_per_block * num_blocks @@ -78,15 +78,11 @@ def multi_block_bench(state: nvbench.State): d_inp = cuda.to_device(h_inp) d_out = cuda.device_array(num_blocks, dtype=ring.dt) - cuda_s = as_cuda_Stream(state.get_stream()) - # warmup - kernel[num_blocks, threads_per_block, cuda_s, 0](d_inp, d_out) - state.add_element_count(total_elements) state.add_global_memory_reads(total_elements * h_inp.itemsize) state.add_global_memory_writes(num_blocks * h_inp.itemsize) - def launcher(launch: nvbench.Launch): + def launcher(launch: bench.Launch): cuda_s = as_cuda_Stream(launch.get_stream()) kernel[num_blocks, threads_per_block, cuda_s, 0](d_inp, d_out) @@ -96,8 +92,8 @@ def multi_block_bench(state: nvbench.State): if __name__ == "__main__": patch.patch_numba_linker(lto=True) - b = nvbench.register(multi_block_bench) + b = bench.register(multi_block_bench) 
b.add_int64_axis("ThreadsPerBlock", [64, 128, 192, 256]) b.add_int64_power_of_two_axis("NumBlocks", [10, 11, 12, 14, 16]) - nvbench.run_all_benchmarks(sys.argv) + bench.run_all_benchmarks(sys.argv) diff --git a/python/examples/cccl_parallel_segmented_reduce.py b/python/examples/cccl_parallel_segmented_reduce.py index 0f440e3..e54a77b 100644 --- a/python/examples/cccl_parallel_segmented_reduce.py +++ b/python/examples/cccl_parallel_segmented_reduce.py @@ -16,10 +16,10 @@ import sys +import cuda.bench as bench import cuda.cccl.parallel.experimental.algorithms as algorithms import cuda.cccl.parallel.experimental.iterators as iterators import cuda.core.experimental as core -import cuda.nvbench as nvbench import cupy as cp import numpy as np @@ -34,22 +34,22 @@ class CCCLStream: return (0, self._ptr) -def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream: +def as_core_Stream(cs: bench.CudaStream) -> core.Stream: return core.Stream.from_handle(cs.addressof()) -def as_cccl_Stream(cs: nvbench.CudaStream) -> CCCLStream: +def as_cccl_Stream(cs: bench.CudaStream) -> CCCLStream: return CCCLStream(cs.addressof()) def as_cp_ExternalStream( - cs: nvbench.CudaStream, dev_id: int | None = -1 + cs: bench.CudaStream, dev_id: int | None = -1 ) -> cp.cuda.ExternalStream: h = cs.addressof() return cp.cuda.ExternalStream(h, dev_id) -def segmented_reduce(state: nvbench.State): +def segmented_reduce(state: bench.State): "Benchmark segmented_reduce example" n_elems = state.get_int64("numElems") n_cols = state.get_int64("numCols") @@ -100,7 +100,7 @@ def segmented_reduce(state: nvbench.State): with cp_stream: temp_storage = cp.empty(temp_nbytes, dtype=cp.uint8) - def launcher(launch: nvbench.Launch): + def launcher(launch: bench.Launch): s = as_cccl_Stream(launch.get_stream()) alg( temp_storage, @@ -117,8 +117,8 @@ def segmented_reduce(state: nvbench.State): if __name__ == "__main__": - b = nvbench.register(segmented_reduce) + b = bench.register(segmented_reduce) 
b.add_int64_axis("numElems", [2**20, 2**22, 2**24]) b.add_int64_axis("numCols", [1024, 2048, 4096, 8192]) - nvbench.run_all_benchmarks(sys.argv) + bench.run_all_benchmarks(sys.argv) diff --git a/python/examples/cpu_activity.py b/python/examples/cpu_activity.py index 16f70cc..a492ff7 100644 --- a/python/examples/cpu_activity.py +++ b/python/examples/cpu_activity.py @@ -17,21 +17,21 @@ import sys import time +import cuda.bench as bench import cuda.cccl.headers as headers import cuda.core.experimental as core -import cuda.nvbench as nvbench host_sleep_duration = 0.1 -def cpu_only_sleep_bench(state: nvbench.State) -> None: - def launcher(launch: nvbench.Launch): +def cpu_only_sleep_bench(state: bench.State) -> None: + def launcher(launch: bench.Launch): time.sleep(host_sleep_duration) state.exec(launcher) -def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream: +def as_core_Stream(cs: bench.CudaStream) -> core.Stream: return core.Stream.from_handle(cs.addressof()) @@ -66,7 +66,7 @@ __global__ void sleep_kernel(double seconds) { return mod.get_kernel("sleep_kernel") -def mixed_sleep_bench(state: nvbench.State) -> None: +def mixed_sleep_bench(state: bench.State) -> None: sync = state.get_string("Sync") sync_flag = sync == "Do sync" @@ -74,7 +74,7 @@ def mixed_sleep_bench(state: nvbench.State) -> None: krn = make_sleep_kernel() launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0) - def launcher(launch: nvbench.Launch): + def launcher(launch: bench.Launch): # host overhead time.sleep(host_sleep_duration) # GPU computation @@ -87,11 +87,11 @@ def mixed_sleep_bench(state: nvbench.State) -> None: if __name__ == "__main__": # time function only doing work (sleeping) on the host # using CPU timer only - b = nvbench.register(cpu_only_sleep_bench) + b = bench.register(cpu_only_sleep_bench) b.set_is_cpu_only(True) # time the function that does work on both GPU and CPU - b2 = nvbench.register(mixed_sleep_bench) + b2 = bench.register(mixed_sleep_bench) 
b2.add_string_axis("Sync", ["Do not sync", "Do sync"]) - nvbench.run_all_benchmarks(sys.argv) + bench.run_all_benchmarks(sys.argv) diff --git a/python/examples/cupy_extract.py b/python/examples/cupy_extract.py index 16e5d9f..091141c 100644 --- a/python/examples/cupy_extract.py +++ b/python/examples/cupy_extract.py @@ -16,18 +16,18 @@ import sys -import cuda.nvbench as nvbench +import cuda.bench as bench import cupy as cp def as_cp_ExternalStream( - cs: nvbench.CudaStream, dev_id: int | None = -1 + cs: bench.CudaStream, dev_id: int | None = -1 ) -> cp.cuda.ExternalStream: h = cs.addressof() return cp.cuda.ExternalStream(h, dev_id) -def cupy_extract_by_mask(state: nvbench.State): +def cupy_extract_by_mask(state: bench.State): n_cols = state.get_int64("numCols") n_rows = state.get_int64("numRows") @@ -48,7 +48,7 @@ def cupy_extract_by_mask(state: nvbench.State): mask = cp.ones((n_cols, n_rows), dtype=bool_dt) _ = X[mask] - def launcher(launch: nvbench.Launch): + def launcher(launch: bench.Launch): with as_cp_ExternalStream(launch.get_stream(), dev_id): _ = X[mask] @@ -56,8 +56,8 @@ def cupy_extract_by_mask(state: nvbench.State): if __name__ == "__main__": - b = nvbench.register(cupy_extract_by_mask) + b = bench.register(cupy_extract_by_mask) b.add_int64_axis("numCols", [1024, 2048, 4096, 2 * 4096]) b.add_int64_axis("numRows", [1024, 2048, 4096, 2 * 4096]) - nvbench.run_all_benchmarks(sys.argv) + bench.run_all_benchmarks(sys.argv) diff --git a/python/examples/cutlass_gemm.py b/python/examples/cutlass_gemm.py index 154bc16..cd62f39 100644 --- a/python/examples/cutlass_gemm.py +++ b/python/examples/cutlass_gemm.py @@ -17,19 +17,19 @@ import sys +import cuda.bench as bench import cuda.bindings.driver as driver import cuda.core.experimental as core -import cuda.nvbench as nvbench import cupy as cp import cutlass import numpy as np -def as_bindings_Stream(cs: nvbench.CudaStream) -> driver.CUstream: +def as_bindings_Stream(cs: bench.CudaStream) -> driver.CUstream: return 
driver.CUstream(cs.addressof()) -def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream: +def as_core_Stream(cs: bench.CudaStream) -> core.Stream: return core.Stream.from_handle(cs.addressof()) @@ -47,7 +47,7 @@ def make_cp_array( ) -def cutlass_gemm(state: nvbench.State) -> None: +def cutlass_gemm(state: bench.State) -> None: n = state.get_int64("N") r = state.get_int64("R") @@ -96,7 +96,7 @@ def cutlass_gemm(state: nvbench.State) -> None: # warm-up to ensure compilation is not timed plan.run(stream=s) - def launcher(launch: nvbench.Launch) -> None: + def launcher(launch: bench.Launch) -> None: s = as_bindings_Stream(launch.get_stream()) plan.run(stream=s, sync=False) @@ -104,10 +104,10 @@ def cutlass_gemm(state: nvbench.State) -> None: if __name__ == "__main__": - gemm_b = nvbench.register(cutlass_gemm) + gemm_b = bench.register(cutlass_gemm) gemm_b.add_int64_axis("R", [16, 64, 256]) gemm_b.add_int64_axis("N", [256, 512, 1024, 2048]) gemm_b.add_float64_axis("alpha", [1e-2]) - nvbench.run_all_benchmarks(sys.argv) + bench.run_all_benchmarks(sys.argv) diff --git a/python/examples/exec_tag_sync.py b/python/examples/exec_tag_sync.py index 8d0789a..b9ab5ef 100644 --- a/python/examples/exec_tag_sync.py +++ b/python/examples/exec_tag_sync.py @@ -18,12 +18,12 @@ import ctypes import sys from typing import Optional +import cuda.bench as bench import cuda.cccl.headers as headers import cuda.core.experimental as core -import cuda.nvbench as nvbench -def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream: +def as_core_Stream(cs: bench.CudaStream) -> core.Stream: "Create view of native stream used by NVBench" return core.Stream.from_handle(cs.addressof()) @@ -57,7 +57,7 @@ __global__ void fill_kernel(T *buf, T v, ::cuda::std::size_t n) return mod.get_kernel(instance_name) -def synchronizing_bench(state: nvbench.State): +def synchronizing_bench(state: bench.State): n_values = 64 * 1024 * 1024 n_bytes = n_values * ctypes.sizeof(ctypes.c_int32(0)) @@ -70,7 +70,7 @@ def 
synchronizing_bench(state: nvbench.State): krn = make_fill_kernel() launch_config = core.LaunchConfig(grid=256, block=256, shmem_size=0) - def launcher(launch: nvbench.Launch): + def launcher(launch: bench.Launch): s = as_core_Stream(launch.get_stream()) core.launch(s, launch_config, krn, buffer, 0, n_values) s.sync() @@ -81,5 +81,5 @@ def synchronizing_bench(state: nvbench.State): if __name__ == "__main__": - nvbench.register(synchronizing_bench) - nvbench.run_all_benchmarks(sys.argv) + bench.register(synchronizing_bench) + bench.run_all_benchmarks(sys.argv) diff --git a/python/examples/pytorch_bench.py b/python/examples/pytorch_bench.py index f62a7a5..f49a543 100644 --- a/python/examples/pytorch_bench.py +++ b/python/examples/pytorch_bench.py @@ -1,18 +1,18 @@ import sys -import cuda.nvbench as nvbench +import cuda.bench as bench import torch def as_torch_cuda_Stream( - cs: nvbench.CudaStream, dev: int | None + cs: bench.CudaStream, dev: int | None ) -> torch.cuda.ExternalStream: return torch.cuda.ExternalStream( stream_ptr=cs.addressof(), device=torch.cuda.device(dev) ) -def torch_bench(state: nvbench.State) -> None: +def torch_bench(state: bench.State) -> None: state.set_throttle_threshold(0.25) dev_id = state.get_device() @@ -31,7 +31,7 @@ def torch_bench(state: nvbench.State) -> None: learning_rate = 1e-4 - def launcher(launch: nvbench.Launch) -> None: + def launcher(launch: bench.Launch) -> None: tc_s = as_torch_cuda_Stream(launch.get_stream(), dev_id) with torch.cuda.stream(tc_s): x2 = torch.square(x) @@ -53,6 +53,6 @@ def torch_bench(state: nvbench.State) -> None: if __name__ == "__main__": - nvbench.register(torch_bench) + bench.register(torch_bench) - nvbench.run_all_benchmarks(sys.argv) + bench.run_all_benchmarks(sys.argv) diff --git a/python/examples/skip.py b/python/examples/skip.py index a5555d0..cf7ec90 100644 --- a/python/examples/skip.py +++ b/python/examples/skip.py @@ -16,12 +16,12 @@ import sys +import cuda.bench as bench import 
cuda.cccl.headers as headers import cuda.core.experimental as core -import cuda.nvbench as nvbench -def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream: +def as_core_Stream(cs: bench.CudaStream) -> core.Stream: "Create view into native stream provided by NVBench" return core.Stream.from_handle(cs.addressof()) @@ -57,7 +57,7 @@ __global__ void sleep_kernel(double seconds) { return mod.get_kernel("sleep_kernel") -def runtime_skip(state: nvbench.State): +def runtime_skip(state: bench.State): duration = state.get_float64("Duration") kramble = state.get_string("Kramble") @@ -74,7 +74,7 @@ def runtime_skip(state: nvbench.State): krn = make_sleep_kernel() launch_cfg = core.LaunchConfig(grid=1, block=1, shmem_size=0) - def launcher(launch: nvbench.Launch): + def launcher(launch: bench.Launch): s = as_core_Stream(launch.get_stream()) core.launch(s, launch_cfg, krn, duration) @@ -82,8 +82,8 @@ def runtime_skip(state: nvbench.State): if __name__ == "__main__": - b = nvbench.register(runtime_skip) + b = bench.register(runtime_skip) b.add_float64_axis("Duration", [1e-4 + k * 0.25e-3 for k in range(5)]) b.add_string_axis("Kramble", ["Foo", "Bar", "Baz"]) - nvbench.run_all_benchmarks(sys.argv) + bench.run_all_benchmarks(sys.argv) diff --git a/python/examples/throughput.py b/python/examples/throughput.py index 890c372..ff02bd3 100644 --- a/python/examples/throughput.py +++ b/python/examples/throughput.py @@ -16,12 +16,12 @@ import sys -import cuda.nvbench as nvbench +import cuda.bench as bench import numpy as np from numba import cuda -def as_cuda_stream(cs: nvbench.CudaStream) -> cuda.cudadrv.driver.Stream: +def as_cuda_stream(cs: bench.CudaStream) -> cuda.cudadrv.driver.Stream: return cuda.external_stream(cs.addressof()) @@ -39,7 +39,7 @@ def make_throughput_kernel(items_per_thread: int) -> cuda.dispatcher.CUDADispatc return kernel -def throughput_bench(state: nvbench.State) -> None: +def throughput_bench(state: bench.State) -> None: stride = state.get_int64("Stride") ipt 
= state.get_int64("ItemsPerThread") @@ -59,7 +59,7 @@ def throughput_bench(state: nvbench.State) -> None: krn = make_throughput_kernel(ipt) - def launcher(launch: nvbench.Launch): + def launcher(launch: bench.Launch): exec_stream = as_cuda_stream(launch.get_stream()) krn[blocks_in_grid, threads_per_block, exec_stream, 0]( stride, elements, inp_arr, out_arr @@ -69,8 +69,8 @@ def throughput_bench(state: nvbench.State) -> None: if __name__ == "__main__": - b = nvbench.register(throughput_bench) + b = bench.register(throughput_bench) b.add_int64_axis("Stride", [1, 2, 4]) b.add_int64_axis("ItemsPerThread", [1, 2, 3, 4]) - nvbench.run_all_benchmarks(sys.argv) + bench.run_all_benchmarks(sys.argv) diff --git a/python/pyproject.toml b/python/pyproject.toml index 22adc77..8466f64 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -56,4 +56,4 @@ root = ".." [tool.scikit-build.wheel.packages] "cuda" = "cuda" -"cuda/nvbench" = "cuda/nvbench" +"cuda/bench" = "cuda/bench" diff --git a/python/test/run_1.py b/python/test/run_1.py index dfa38f4..fbc6de0 100644 --- a/python/test/run_1.py +++ b/python/test/run_1.py @@ -1,6 +1,6 @@ import sys -import cuda.nvbench as nvbench +import cuda.bench as bench import numpy as np from numba import cuda @@ -14,15 +14,15 @@ def kernel(a, b, c): c[tid] = a[tid] + b[tid] -def get_numba_stream(launch: nvbench.Launch): +def get_numba_stream(launch: bench.Launch): return cuda.external_stream(launch.get_stream().addressof()) -def skipit(state: nvbench.State) -> None: +def skipit(state: bench.State) -> None: state.skip("Skipping this benchmark for no reason") -def add_two(state: nvbench.State): +def add_two(state: bench.State): N = state.get_int64("elements") a = cuda.to_device(np.random.random(N)) c = cuda.device_array_like(a) @@ -47,7 +47,7 @@ def add_two(state: nvbench.State): state.exec(kernel_launcher, batched=True, sync=True) -def add_float(state: nvbench.State): +def add_float(state: bench.State): N = state.get_int64("elements") v = 
state.get_float64("v") name = state.get_string("name") @@ -78,7 +78,7 @@ def add_float(state: nvbench.State): state.exec(kernel_launcher, batched=True, sync=True) -def add_three(state: nvbench.State): +def add_three(state: bench.State): N = state.get_int64("elements") a = cuda.to_device(np.random.random(N).astype(np.float32)) b = cuda.to_device(np.random.random(N).astype(np.float32)) @@ -100,20 +100,20 @@ def add_three(state: nvbench.State): def register_benchmarks(): ( - nvbench.register(add_two).add_int64_axis( - "elements", [2**pow2 for pow2 in range(20, 23)] + bench.register(add_two).add_int64_axis( + "elements", [2**pow2 - 1 for pow2 in range(20, 23)] ) ) ( - nvbench.register(add_float) + bench.register(add_float) .add_float64_axis("v", [0.1, 0.3]) .add_string_axis("name", ["Anne", "Lynda"]) .add_int64_power_of_two_axis("elements", range(20, 23)) ) - (nvbench.register(add_three).add_int64_power_of_two_axis("elements", range(20, 22))) - nvbench.register(skipit) + bench.register(add_three).add_int64_power_of_two_axis("elements", range(20, 22)) + bench.register(skipit) if __name__ == "__main__": register_benchmarks() - nvbench.run_all_benchmarks(sys.argv) + bench.run_all_benchmarks(sys.argv) diff --git a/python/test/stubs.py b/python/test/stubs.py index f3f4ee2..0d09a58 100644 --- a/python/test/stubs.py +++ b/python/test/stubs.py @@ -18,12 +18,12 @@ import ctypes import sys from typing import Dict, Optional, Tuple +import cuda.bench as bench import cuda.cccl.headers as headers import cuda.core.experimental as core -import cuda.nvbench as nvbench -def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream: +def as_core_Stream(cs: bench.CudaStream) -> core.Stream: return core.Stream.from_handle(cs.addressof()) @@ -58,7 +58,7 @@ __global__ void sleep_kernel(double seconds) { return mod.get_kernel("sleep_kernel") -def no_axes(state: nvbench.State): +def no_axes(state: bench.State): state.set_min_samples(1000) sleep_dur = 1e-3 krn = make_sleep_kernel() @@ -66,14 +66,14 
@@ def no_axes(state: nvbench.State): print(f"Stopping criterion used: {state.get_stopping_criterion()}") - def launcher(launch: nvbench.Launch): + def launcher(launch: bench.Launch): s = as_core_Stream(launch.get_stream()) core.launch(s, launch_config, krn, sleep_dur) state.exec(launcher) -def tags(state: nvbench.State): +def tags(state: bench.State): state.set_min_samples(1000) sleep_dur = 1e-3 krn = make_sleep_kernel() @@ -82,28 +82,28 @@ def tags(state: nvbench.State): sync_flag = bool(state.get_int64("Sync")) batched_flag = bool(state.get_int64("Batched")) - def launcher(launch: nvbench.Launch): + def launcher(launch: bench.Launch): s = as_core_Stream(launch.get_stream()) core.launch(s, launch_config, krn, sleep_dur) state.exec(launcher, sync=sync_flag, batched=batched_flag) -def single_float64_axis(state: nvbench.State): +def single_float64_axis(state: bench.State): # get axis value, or default default_sleep_dur = 3.14e-4 sleep_dur = state.get_float64_or_default("Duration", default_sleep_dur) krn = make_sleep_kernel() launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0) - def launcher(launch: nvbench.Launch): + def launcher(launch: bench.Launch): s = as_core_Stream(launch.get_stream()) core.launch(s, launch_config, krn, sleep_dur) state.exec(launcher) -def default_value(state: nvbench.State): +def default_value(state: bench.State): single_float64_axis(state) @@ -138,7 +138,7 @@ __global__ void copy_kernel(const T *in, U *out, ::cuda::std::size_t n) return mod.get_kernel(instance_name) -def copy_sweep_grid_shape(state: nvbench.State): +def copy_sweep_grid_shape(state: bench.State): block_size = state.get_int64("BlockSize") num_blocks = state.get_int64("NumBlocks") @@ -158,14 +158,14 @@ def copy_sweep_grid_shape(state: nvbench.State): krn = make_copy_kernel() launch_config = core.LaunchConfig(grid=num_blocks, block=block_size, shmem_size=0) - def launcher(launch: nvbench.Launch): + def launcher(launch: bench.Launch): s = 
as_core_Stream(launch.get_stream()) core.launch(s, launch_config, krn, input_buf, output_buf, num_values) state.exec(launcher) -def copy_type_sweep(state: nvbench.State): +def copy_type_sweep(state: bench.State): type_id = state.get_int64("TypeID") types_map: Dict[int, Tuple[type, str]] = { @@ -196,7 +196,7 @@ def copy_type_sweep(state: nvbench.State): krn = make_copy_kernel(value_cuda_t, value_cuda_t) launch_config = core.LaunchConfig(grid=256, block=256, shmem_size=0) - def launcher(launch: nvbench.Launch): + def launcher(launch: bench.Launch): s = as_core_Stream(launch.get_stream()) core.launch(s, launch_config, krn, input_buf, output_buf, num_values) @@ -205,20 +205,20 @@ def copy_type_sweep(state: nvbench.State): if __name__ == "__main__": # Benchmark without axes - simple_b = nvbench.register(no_axes) + simple_b = bench.register(no_axes) simple_b.set_stopping_criterion("entropy") simple_b.set_criterion_param_int64("unused_int", 100) - tags_b = nvbench.register(tags) + tags_b = bench.register(tags) tags_b.add_int64_axis("Sync", [0, 1]) tags_b.add_int64_axis("Batched", [0, 1]) # benchmark with no axes, that uses default value - default_b = nvbench.register(default_value) + default_b = bench.register(default_value) default_b.set_min_samples(7) # specify axis - axes_b = nvbench.register(single_float64_axis).add_float64_axis( + axes_b = bench.register(single_float64_axis).add_float64_axis( "Duration", [7e-5, 1e-4, 5e-4] ) axes_b.set_timeout(20) @@ -226,11 +226,11 @@ if __name__ == "__main__": axes_b.set_throttle_threshold(0.2) axes_b.set_throttle_recovery_delay(0.1) - copy1_bench = nvbench.register(copy_sweep_grid_shape) + copy1_bench = bench.register(copy_sweep_grid_shape) copy1_bench.add_int64_power_of_two_axis("BlockSize", range(6, 10, 2)) copy1_bench.add_int64_axis("NumBlocks", [2**x for x in range(6, 10, 2)]) - copy2_bench = nvbench.register(copy_type_sweep) + copy2_bench = bench.register(copy_type_sweep) copy2_bench.add_int64_axis("TypeID", range(0, 6)) - 
nvbench.run_all_benchmarks(sys.argv) + bench.run_all_benchmarks(sys.argv) diff --git a/python/test/test_nvbench.py b/python/test/test_nvbench.py index d03e263..5604a3f 100644 --- a/python/test/test_nvbench.py +++ b/python/test/test_nvbench.py @@ -1,31 +1,31 @@ import json -import cuda.nvbench as nvbench +import cuda.bench as bench import pytest def test_cpp_exception(): with pytest.raises(RuntimeError, match="Test"): - nvbench._nvbench.test_cpp_exception() + bench._nvbench.test_cpp_exception() def test_py_exception(): - with pytest.raises(nvbench.NVBenchRuntimeError, match="Test"): - nvbench._nvbench.test_py_exception() + with pytest.raises(bench.NVBenchRuntimeError, match="Test"): + bench._nvbench.test_py_exception() @pytest.mark.parametrize( - "cls", [nvbench.CudaStream, nvbench.State, nvbench.Launch, nvbench.Benchmark] + "cls", [bench.CudaStream, bench.State, bench.Launch, bench.Benchmark] ) def test_api_ctor(cls): with pytest.raises(TypeError, match="No constructor defined!"): cls() -def t_bench(state: nvbench.State): +def t_bench(state: bench.State): s = {"a": 1, "b": 0.5, "c": "test", "d": {"a": 1}} - def launcher(launch: nvbench.Launch): + def launcher(launch: bench.Launch): for _ in range(10000): _ = json.dumps(s) @@ -33,7 +33,7 @@ def t_bench(state: nvbench.State): def test_cpu_only(): - b = nvbench.register(t_bench) + b = bench.register(t_bench) b.set_is_cpu_only(True) - nvbench.run_all_benchmarks(["-q", "--profile"]) + bench.run_all_benchmarks(["-q", "--profile"])