cuda.nvbench -> cuda.bench

Per PR review suggestion:
   - `cuda.parallel`    - device-wide algorithms/Thrust
   - `cuda.cooperative` - cooperative algorithms/CUB
   - `cuda.bench`       - benchmarking/NVBench
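For users, the rename is a one-line import swap. A minimal sketch of a benchmark under the new name (it uses only the `State`/`Launch`/`register`/`run_all_benchmarks` API exercised by the examples in this commit; `my_bench` is a placeholder name):

    import sys

    import cuda.bench as bench  # previously: import cuda.nvbench as nvbench

    def my_bench(state: bench.State):
        # launcher receives the stream NVBench times against
        def launcher(launch: bench.Launch):
            pass  # enqueue work on launch.get_stream() here

        state.exec(launcher)

    if __name__ == "__main__":
        bench.register(my_bench)
        bench.run_all_benchmarks(sys.argv)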
Author: Oleksandr Pavlyk
Date:   2025-08-04 13:42:43 -05:00
parent c2a2acc9b6
commit b5e4b4ba31
19 changed files with 136 additions and 140 deletions

View File

@@ -33,9 +33,9 @@ set_target_properties(_nvbench PROPERTIES INSTALL_RPATH "$ORIGIN")
 set_target_properties(_nvbench PROPERTIES INTERPROCEDURAL_OPTIMIZATION ON)
 set_target_properties(_nvbench PROPERTIES POSITION_INDEPENDENT_CODE ON)
-install(TARGETS _nvbench DESTINATION cuda/nvbench)
+install(TARGETS _nvbench DESTINATION cuda/bench)
 # Determine target that nvbench::nvbench is an alias of,
 # necessary because ALIAS targets cannot be installed
 get_target_property(_aliased_target_name nvbench::nvbench ALIASED_TARGET)
-install(IMPORTED_RUNTIME_ARTIFACTS ${_aliased_target_name} DESTINATION cuda/nvbench)
+install(IMPORTED_RUNTIME_ARTIFACTS ${_aliased_target_name} DESTINATION cuda/bench)

View File

@@ -34,25 +34,25 @@ except Exception as e:
 for libname in ("cupti", "nvperf_target", "nvperf_host"):
     load_nvidia_dynamic_lib(libname)
-from cuda.nvbench._nvbench import ( # noqa: E402
+from cuda.bench._nvbench import ( # noqa: E402
     Benchmark as Benchmark,
 )
-from cuda.nvbench._nvbench import ( # noqa: E402
+from cuda.bench._nvbench import ( # noqa: E402
     CudaStream as CudaStream,
 )
-from cuda.nvbench._nvbench import ( # noqa: E402
+from cuda.bench._nvbench import ( # noqa: E402
     Launch as Launch,
 )
-from cuda.nvbench._nvbench import ( # noqa: E402
+from cuda.bench._nvbench import ( # noqa: E402
     NVBenchRuntimeError as NVBenchRuntimeError,
 )
-from cuda.nvbench._nvbench import ( # noqa: E402
+from cuda.bench._nvbench import ( # noqa: E402
     State as State,
 )
-from cuda.nvbench._nvbench import ( # noqa: E402
+from cuda.bench._nvbench import ( # noqa: E402
     register as register,
 )
-from cuda.nvbench._nvbench import ( # noqa: E402
+from cuda.bench._nvbench import ( # noqa: E402
     run_all_benchmarks as run_all_benchmarks,
 )

View File

@@ -44,9 +44,9 @@ class CudaStream:
 Example
 -------
 import cuda.core.experimental as core
-import cuda.nvbench as nvbench
+import cuda.bench as bench
-def bench(state: nvbench.State):
+def bench(state: bench.State):
     dev = core.Device(state.get_device())
     dev.set_current()
     # converts CudaStream to core.Stream
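The docstring example trails off at the conversion comment; a one-line completion consistent with the helpers defined in the files below (`core.Stream.from_handle` over `addressof()`) would be:

    s = core.Stream.from_handle(state.get_stream().addressof())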

View File

@@ -16,12 +16,12 @@
 import sys
-import cuda.nvbench as nvbench
+import cuda.bench as bench
 import numpy as np
 from numba import cuda
-def as_cuda_stream(cs: nvbench.CudaStream) -> cuda.cudadrv.driver.Stream:
+def as_cuda_stream(cs: bench.CudaStream) -> cuda.cudadrv.driver.Stream:
     return cuda.external_stream(cs.addressof())
@@ -39,7 +39,7 @@ def make_throughput_kernel(items_per_thread: int) -> cuda.dispatcher.CUDADispatc
     return kernel
-def throughput_bench(state: nvbench.State) -> None:
+def throughput_bench(state: bench.State) -> None:
     stride = state.get_int64("Stride")
     ipt = state.get_int64("ItemsPerThread")
@@ -58,7 +58,7 @@ def throughput_bench(state: nvbench.State) -> None:
     krn = make_throughput_kernel(ipt)
-    def launcher(launch: nvbench.Launch):
+    def launcher(launch: bench.Launch):
         exec_stream = as_cuda_stream(launch.get_stream())
         krn[blocks_in_grid, threads_per_block, exec_stream, 0](
             stride, elements, inp_arr, out_arr
@@ -68,8 +68,8 @@ def throughput_bench(state: nvbench.State) -> None:
 if __name__ == "__main__":
-    b = nvbench.register(throughput_bench)
+    b = bench.register(throughput_bench)
     b.add_int64_axis("Stride", [1, 2, 4])
     b.add_int64_axis("ItemsPerThread", [1, 2, 3, 4])
-    nvbench.run_all_benchmarks(sys.argv)
+    bench.run_all_benchmarks(sys.argv)

View File

@@ -18,12 +18,12 @@ import ctypes
 import sys
 from typing import Dict, Optional, Tuple
+import cuda.bench as bench
 import cuda.cccl.headers as headers
 import cuda.core.experimental as core
-import cuda.nvbench as nvbench
-def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream:
+def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
     return core.Stream.from_handle(cs.addressof())
@@ -58,34 +58,34 @@ __global__ void sleep_kernel(double seconds) {
     return mod.get_kernel("sleep_kernel")
-def simple(state: nvbench.State):
+def simple(state: bench.State):
     state.set_min_samples(1000)
     sleep_dur = 1e-3
     krn = make_sleep_kernel()
     launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0)
-    def launcher(launch: nvbench.Launch):
+    def launcher(launch: bench.Launch):
         s = as_core_Stream(launch.get_stream())
         core.launch(s, launch_config, krn, sleep_dur)
     state.exec(launcher)
-def single_float64_axis(state: nvbench.State):
+def single_float64_axis(state: bench.State):
     # get axis value, or default
     default_sleep_dur = 3.14e-4
     sleep_dur = state.get_float64_or_default("Duration", default_sleep_dur)
     krn = make_sleep_kernel()
     launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0)
-    def launcher(launch: nvbench.Launch):
+    def launcher(launch: bench.Launch):
         s = as_core_Stream(launch.get_stream())
         core.launch(s, launch_config, krn, sleep_dur)
     state.exec(launcher)
-def default_value(state: nvbench.State):
+def default_value(state: bench.State):
     single_float64_axis(state)
@@ -120,7 +120,7 @@ __global__ void copy_kernel(const T *in, U *out, ::cuda::std::size_t n)
     return mod.get_kernel(instance_name)
-def copy_sweep_grid_shape(state: nvbench.State):
+def copy_sweep_grid_shape(state: bench.State):
     block_size = state.get_int64("BlockSize")
     num_blocks = state.get_int64("NumBlocks")
@@ -140,14 +140,14 @@ def copy_sweep_grid_shape(state: nvbench.State):
     krn = make_copy_kernel()
     launch_config = core.LaunchConfig(grid=num_blocks, block=block_size, shmem_size=0)
-    def launcher(launch: nvbench.Launch):
+    def launcher(launch: bench.Launch):
         s = as_core_Stream(launch.get_stream())
         core.launch(s, launch_config, krn, input_buf, output_buf, num_values)
     state.exec(launcher)
-def copy_type_sweep(state: nvbench.State):
+def copy_type_sweep(state: bench.State):
     type_id = state.get_int64("TypeID")
     types_map: Dict[int, Tuple[type, str]] = {
@@ -178,7 +178,7 @@ def copy_type_sweep(state: nvbench.State):
     krn = make_copy_kernel(value_cuda_t, value_cuda_t)
     launch_config = core.LaunchConfig(grid=256, block=256, shmem_size=0)
-    def launcher(launch: nvbench.Launch):
+    def launcher(launch: bench.Launch):
         s = as_core_Stream(launch.get_stream())
         core.launch(s, launch_config, krn, input_buf, output_buf, num_values)
@@ -187,20 +187,20 @@ def copy_type_sweep(state: nvbench.State):
 if __name__ == "__main__":
     # Benchmark without axes
-    nvbench.register(simple)
+    bench.register(simple)
     # benchmark with no axes, that uses default value
-    nvbench.register(default_value)
+    bench.register(default_value)
     # specify axis
-    nvbench.register(single_float64_axis).add_float64_axis(
+    bench.register(single_float64_axis).add_float64_axis(
         "Duration (s)", [7e-5, 1e-4, 5e-4]
     )
-    copy1_bench = nvbench.register(copy_sweep_grid_shape)
+    copy1_bench = bench.register(copy_sweep_grid_shape)
     copy1_bench.add_int64_axis("BlockSize", [2**x for x in range(6, 10, 2)])
     copy1_bench.add_int64_axis("NumBlocks", [2**x for x in range(6, 10, 2)])
-    copy2_bench = nvbench.register(copy_type_sweep)
+    copy2_bench = bench.register(copy_type_sweep)
     copy2_bench.add_int64_axis("TypeID", range(0, 6))
-    nvbench.run_all_benchmarks(sys.argv)
+    bench.run_all_benchmarks(sys.argv)

View File

@@ -16,8 +16,8 @@
 import sys
+import cuda.bench as bench
 import cuda.cccl.cooperative.experimental as coop
-import cuda.nvbench as nvbench
 import numba
 import numpy as np
 from numba import cuda
@@ -45,11 +45,11 @@ class BitsetRing:
         return op1 & op2
-def as_cuda_Stream(cs: nvbench.CudaStream) -> cuda.cudadrv.driver.Stream:
+def as_cuda_Stream(cs: bench.CudaStream) -> cuda.cudadrv.driver.Stream:
     return cuda.external_stream(cs.addressof())
-def multi_block_bench(state: nvbench.State):
+def multi_block_bench(state: bench.State):
     threads_per_block = state.get_int64("ThreadsPerBlock")
     num_blocks = state.get_int64("NumBlocks")
     total_elements = threads_per_block * num_blocks
@@ -78,15 +78,11 @@ def multi_block_bench(state: nvbench.State):
     d_inp = cuda.to_device(h_inp)
     d_out = cuda.device_array(num_blocks, dtype=ring.dt)
-    cuda_s = as_cuda_Stream(state.get_stream())
-    # warmup
-    kernel[num_blocks, threads_per_block, cuda_s, 0](d_inp, d_out)
     state.add_element_count(total_elements)
     state.add_global_memory_reads(total_elements * h_inp.itemsize)
     state.add_global_memory_writes(num_blocks * h_inp.itemsize)
-    def launcher(launch: nvbench.Launch):
+    def launcher(launch: bench.Launch):
         cuda_s = as_cuda_Stream(launch.get_stream())
         kernel[num_blocks, threads_per_block, cuda_s, 0](d_inp, d_out)
@@ -96,8 +92,8 @@ def multi_block_bench(state: nvbench.State):
 if __name__ == "__main__":
     patch.patch_numba_linker(lto=True)
-    b = nvbench.register(multi_block_bench)
+    b = bench.register(multi_block_bench)
     b.add_int64_axis("ThreadsPerBlock", [64, 128, 192, 256])
     b.add_int64_power_of_two_axis("NumBlocks", [10, 11, 12, 14, 16])
-    nvbench.run_all_benchmarks(sys.argv)
+    bench.run_all_benchmarks(sys.argv)

View File

@@ -16,10 +16,10 @@
 import sys
+import cuda.bench as bench
 import cuda.cccl.parallel.experimental.algorithms as algorithms
 import cuda.cccl.parallel.experimental.iterators as iterators
 import cuda.core.experimental as core
-import cuda.nvbench as nvbench
 import cupy as cp
 import numpy as np
@@ -34,22 +34,22 @@ class CCCLStream:
         return (0, self._ptr)
-def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream:
+def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
     return core.Stream.from_handle(cs.addressof())
-def as_cccl_Stream(cs: nvbench.CudaStream) -> CCCLStream:
+def as_cccl_Stream(cs: bench.CudaStream) -> CCCLStream:
     return CCCLStream(cs.addressof())
 def as_cp_ExternalStream(
-    cs: nvbench.CudaStream, dev_id: int | None = -1
+    cs: bench.CudaStream, dev_id: int | None = -1
 ) -> cp.cuda.ExternalStream:
     h = cs.addressof()
     return cp.cuda.ExternalStream(h, dev_id)
-def segmented_reduce(state: nvbench.State):
+def segmented_reduce(state: bench.State):
     "Benchmark segmented_reduce example"
     n_elems = state.get_int64("numElems")
     n_cols = state.get_int64("numCols")
@@ -100,7 +100,7 @@ def segmented_reduce(state: nvbench.State):
     with cp_stream:
         temp_storage = cp.empty(temp_nbytes, dtype=cp.uint8)
-    def launcher(launch: nvbench.Launch):
+    def launcher(launch: bench.Launch):
         s = as_cccl_Stream(launch.get_stream())
         alg(
             temp_storage,
@@ -117,8 +117,8 @@ def segmented_reduce(state: nvbench.State):
 if __name__ == "__main__":
-    b = nvbench.register(segmented_reduce)
+    b = bench.register(segmented_reduce)
     b.add_int64_axis("numElems", [2**20, 2**22, 2**24])
     b.add_int64_axis("numCols", [1024, 2048, 4096, 8192])
-    nvbench.run_all_benchmarks(sys.argv)
+    bench.run_all_benchmarks(sys.argv)

View File

@@ -17,21 +17,21 @@
 import sys
 import time
+import cuda.bench as bench
 import cuda.cccl.headers as headers
 import cuda.core.experimental as core
-import cuda.nvbench as nvbench
 host_sleep_duration = 0.1
-def cpu_only_sleep_bench(state: nvbench.State) -> None:
+def cpu_only_sleep_bench(state: bench.State) -> None:
-    def launcher(launch: nvbench.Launch):
+    def launcher(launch: bench.Launch):
         time.sleep(host_sleep_duration)
     state.exec(launcher)
-def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream:
+def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
     return core.Stream.from_handle(cs.addressof())
@@ -66,7 +66,7 @@ __global__ void sleep_kernel(double seconds) {
     return mod.get_kernel("sleep_kernel")
-def mixed_sleep_bench(state: nvbench.State) -> None:
+def mixed_sleep_bench(state: bench.State) -> None:
     sync = state.get_string("Sync")
     sync_flag = sync == "Do sync"
@@ -74,7 +74,7 @@ def mixed_sleep_bench(state: nvbench.State) -> None:
     krn = make_sleep_kernel()
     launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0)
-    def launcher(launch: nvbench.Launch):
+    def launcher(launch: bench.Launch):
         # host overhead
         time.sleep(host_sleep_duration)
         # GPU computation
@@ -87,11 +87,11 @@ def mixed_sleep_bench(state: nvbench.State) -> None:
 if __name__ == "__main__":
     # time function only doing work (sleeping) on the host
     # using CPU timer only
-    b = nvbench.register(cpu_only_sleep_bench)
+    b = bench.register(cpu_only_sleep_bench)
     b.set_is_cpu_only(True)
     # time the function that does work on both GPU and CPU
-    b2 = nvbench.register(mixed_sleep_bench)
+    b2 = bench.register(mixed_sleep_bench)
     b2.add_string_axis("Sync", ["Do not sync", "Do sync"])
-    nvbench.run_all_benchmarks(sys.argv)
+    bench.run_all_benchmarks(sys.argv)

View File

@@ -16,18 +16,18 @@
 import sys
-import cuda.nvbench as nvbench
+import cuda.bench as bench
 import cupy as cp
 def as_cp_ExternalStream(
-    cs: nvbench.CudaStream, dev_id: int | None = -1
+    cs: bench.CudaStream, dev_id: int | None = -1
 ) -> cp.cuda.ExternalStream:
     h = cs.addressof()
     return cp.cuda.ExternalStream(h, dev_id)
-def cupy_extract_by_mask(state: nvbench.State):
+def cupy_extract_by_mask(state: bench.State):
     n_cols = state.get_int64("numCols")
     n_rows = state.get_int64("numRows")
@@ -48,7 +48,7 @@ def cupy_extract_by_mask(state: nvbench.State):
     mask = cp.ones((n_cols, n_rows), dtype=bool_dt)
     _ = X[mask]
-    def launcher(launch: nvbench.Launch):
+    def launcher(launch: bench.Launch):
         with as_cp_ExternalStream(launch.get_stream(), dev_id):
             _ = X[mask]
@@ -56,8 +56,8 @@ def cupy_extract_by_mask(state: nvbench.State):
 if __name__ == "__main__":
-    b = nvbench.register(cupy_extract_by_mask)
+    b = bench.register(cupy_extract_by_mask)
     b.add_int64_axis("numCols", [1024, 2048, 4096, 2 * 4096])
     b.add_int64_axis("numRows", [1024, 2048, 4096, 2 * 4096])
-    nvbench.run_all_benchmarks(sys.argv)
+    bench.run_all_benchmarks(sys.argv)

View File

@@ -17,19 +17,19 @@
 import sys
+import cuda.bench as bench
 import cuda.bindings.driver as driver
 import cuda.core.experimental as core
-import cuda.nvbench as nvbench
 import cupy as cp
 import cutlass
 import numpy as np
-def as_bindings_Stream(cs: nvbench.CudaStream) -> driver.CUstream:
+def as_bindings_Stream(cs: bench.CudaStream) -> driver.CUstream:
     return driver.CUstream(cs.addressof())
-def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream:
+def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
     return core.Stream.from_handle(cs.addressof())
@@ -47,7 +47,7 @@ def make_cp_array(
     )
-def cutlass_gemm(state: nvbench.State) -> None:
+def cutlass_gemm(state: bench.State) -> None:
     n = state.get_int64("N")
     r = state.get_int64("R")
@@ -96,7 +96,7 @@ def cutlass_gemm(state: nvbench.State) -> None:
     # warm-up to ensure compilation is not timed
     plan.run(stream=s)
-    def launcher(launch: nvbench.Launch) -> None:
+    def launcher(launch: bench.Launch) -> None:
         s = as_bindings_Stream(launch.get_stream())
         plan.run(stream=s, sync=False)
@@ -104,10 +104,10 @@ def cutlass_gemm(state: nvbench.State) -> None:
 if __name__ == "__main__":
-    gemm_b = nvbench.register(cutlass_gemm)
+    gemm_b = bench.register(cutlass_gemm)
     gemm_b.add_int64_axis("R", [16, 64, 256])
     gemm_b.add_int64_axis("N", [256, 512, 1024, 2048])
     gemm_b.add_float64_axis("alpha", [1e-2])
-    nvbench.run_all_benchmarks(sys.argv)
+    bench.run_all_benchmarks(sys.argv)

View File

@@ -18,12 +18,12 @@ import ctypes
 import sys
 from typing import Optional
+import cuda.bench as bench
 import cuda.cccl.headers as headers
 import cuda.core.experimental as core
-import cuda.nvbench as nvbench
-def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream:
+def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
     "Create view of native stream used by NVBench"
     return core.Stream.from_handle(cs.addressof())
@@ -57,7 +57,7 @@ __global__ void fill_kernel(T *buf, T v, ::cuda::std::size_t n)
     return mod.get_kernel(instance_name)
-def synchronizing_bench(state: nvbench.State):
+def synchronizing_bench(state: bench.State):
     n_values = 64 * 1024 * 1024
     n_bytes = n_values * ctypes.sizeof(ctypes.c_int32(0))
@@ -70,7 +70,7 @@ def synchronizing_bench(state: nvbench.State):
     krn = make_fill_kernel()
     launch_config = core.LaunchConfig(grid=256, block=256, shmem_size=0)
-    def launcher(launch: nvbench.Launch):
+    def launcher(launch: bench.Launch):
         s = as_core_Stream(launch.get_stream())
         core.launch(s, launch_config, krn, buffer, 0, n_values)
         s.sync()
@@ -81,5 +81,5 @@ def synchronizing_bench(state: nvbench.State):
 if __name__ == "__main__":
-    nvbench.register(synchronizing_bench)
+    bench.register(synchronizing_bench)
-    nvbench.run_all_benchmarks(sys.argv)
+    bench.run_all_benchmarks(sys.argv)

View File

@@ -1,18 +1,18 @@
 import sys
-import cuda.nvbench as nvbench
+import cuda.bench as bench
 import torch
 def as_torch_cuda_Stream(
-    cs: nvbench.CudaStream, dev: int | None
+    cs: bench.CudaStream, dev: int | None
 ) -> torch.cuda.ExternalStream:
     return torch.cuda.ExternalStream(
         stream_ptr=cs.addressof(), device=torch.cuda.device(dev)
     )
-def torch_bench(state: nvbench.State) -> None:
+def torch_bench(state: bench.State) -> None:
     state.set_throttle_threshold(0.25)
     dev_id = state.get_device()
@@ -31,7 +31,7 @@ def torch_bench(state: nvbench.State) -> None:
     learning_rate = 1e-4
-    def launcher(launch: nvbench.Launch) -> None:
+    def launcher(launch: bench.Launch) -> None:
         tc_s = as_torch_cuda_Stream(launch.get_stream(), dev_id)
         with torch.cuda.stream(tc_s):
             x2 = torch.square(x)
@@ -53,6 +53,6 @@ def torch_bench(state: nvbench.State) -> None:
 if __name__ == "__main__":
-    nvbench.register(torch_bench)
+    bench.register(torch_bench)
-    nvbench.run_all_benchmarks(sys.argv)
+    bench.run_all_benchmarks(sys.argv)

View File

@@ -16,12 +16,12 @@
 import sys
+import cuda.bench as bench
 import cuda.cccl.headers as headers
 import cuda.core.experimental as core
-import cuda.nvbench as nvbench
-def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream:
+def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
     "Create view into native stream provided by NVBench"
     return core.Stream.from_handle(cs.addressof())
@@ -57,7 +57,7 @@ __global__ void sleep_kernel(double seconds) {
     return mod.get_kernel("sleep_kernel")
-def runtime_skip(state: nvbench.State):
+def runtime_skip(state: bench.State):
     duration = state.get_float64("Duration")
     kramble = state.get_string("Kramble")
@@ -74,7 +74,7 @@ def runtime_skip(state: nvbench.State):
     krn = make_sleep_kernel()
     launch_cfg = core.LaunchConfig(grid=1, block=1, shmem_size=0)
-    def launcher(launch: nvbench.Launch):
+    def launcher(launch: bench.Launch):
         s = as_core_Stream(launch.get_stream())
         core.launch(s, launch_cfg, krn, duration)
@@ -82,8 +82,8 @@ def runtime_skip(state: nvbench.State):
 if __name__ == "__main__":
-    b = nvbench.register(runtime_skip)
+    b = bench.register(runtime_skip)
     b.add_float64_axis("Duration", [1e-4 + k * 0.25e-3 for k in range(5)])
     b.add_string_axis("Kramble", ["Foo", "Bar", "Baz"])
-    nvbench.run_all_benchmarks(sys.argv)
+    bench.run_all_benchmarks(sys.argv)

View File

@@ -16,12 +16,12 @@
 import sys
-import cuda.nvbench as nvbench
+import cuda.bench as bench
 import numpy as np
 from numba import cuda
-def as_cuda_stream(cs: nvbench.CudaStream) -> cuda.cudadrv.driver.Stream:
+def as_cuda_stream(cs: bench.CudaStream) -> cuda.cudadrv.driver.Stream:
     return cuda.external_stream(cs.addressof())
@@ -39,7 +39,7 @@ def make_throughput_kernel(items_per_thread: int) -> cuda.dispatcher.CUDADispatc
     return kernel
-def throughput_bench(state: nvbench.State) -> None:
+def throughput_bench(state: bench.State) -> None:
     stride = state.get_int64("Stride")
     ipt = state.get_int64("ItemsPerThread")
@@ -59,7 +59,7 @@ def throughput_bench(state: nvbench.State) -> None:
     krn = make_throughput_kernel(ipt)
-    def launcher(launch: nvbench.Launch):
+    def launcher(launch: bench.Launch):
         exec_stream = as_cuda_stream(launch.get_stream())
         krn[blocks_in_grid, threads_per_block, exec_stream, 0](
             stride, elements, inp_arr, out_arr
@@ -69,8 +69,8 @@ def throughput_bench(state: nvbench.State) -> None:
 if __name__ == "__main__":
-    b = nvbench.register(throughput_bench)
+    b = bench.register(throughput_bench)
     b.add_int64_axis("Stride", [1, 2, 4])
     b.add_int64_axis("ItemsPerThread", [1, 2, 3, 4])
-    nvbench.run_all_benchmarks(sys.argv)
+    bench.run_all_benchmarks(sys.argv)

View File

@@ -56,4 +56,4 @@ root = ".."
 [tool.scikit-build.wheel.packages]
 "cuda" = "cuda"
-"cuda/nvbench" = "cuda/nvbench"
+"cuda/bench" = "cuda/bench"

View File

@@ -1,6 +1,6 @@
 import sys
-import cuda.nvbench as nvbench
+import cuda.bench as bench
 import numpy as np
 from numba import cuda
@@ -14,15 +14,15 @@ def kernel(a, b, c):
     c[tid] = a[tid] + b[tid]
-def get_numba_stream(launch: nvbench.Launch):
+def get_numba_stream(launch: bench.Launch):
     return cuda.external_stream(launch.get_stream().addressof())
-def skipit(state: nvbench.State) -> None:
+def skipit(state: bench.State) -> None:
     state.skip("Skipping this benchmark for no reason")
-def add_two(state: nvbench.State):
+def add_two(state: bench.State):
     N = state.get_int64("elements")
     a = cuda.to_device(np.random.random(N))
     c = cuda.device_array_like(a)
@@ -47,7 +47,7 @@ def add_two(state: nvbench.State):
     state.exec(kernel_launcher, batched=True, sync=True)
-def add_float(state: nvbench.State):
+def add_float(state: bench.State):
     N = state.get_int64("elements")
     v = state.get_float64("v")
     name = state.get_string("name")
@@ -78,7 +78,7 @@ def add_float(state: nvbench.State):
     state.exec(kernel_launcher, batched=True, sync=True)
-def add_three(state: nvbench.State):
+def add_three(state: bench.State):
     N = state.get_int64("elements")
     a = cuda.to_device(np.random.random(N).astype(np.float32))
     b = cuda.to_device(np.random.random(N).astype(np.float32))
@@ -100,20 +100,20 @@ def add_three(state: nvbench.State):
 def register_benchmarks():
     (
-        nvbench.register(add_two).add_int64_axis(
-            "elements", [2**pow2 for pow2 in range(20, 23)]
+        bench.register(add_two).add_int64_axis(
+            "elements", [2**pow2 - 1 for pow2 in range(20, 23)]
         )
     )
     (
-        nvbench.register(add_float)
+        bench.register(add_float)
         .add_float64_axis("v", [0.1, 0.3])
        .add_string_axis("name", ["Anne", "Lynda"])
         .add_int64_power_of_two_axis("elements", range(20, 23))
     )
-    (nvbench.register(add_three).add_int64_power_of_two_axis("elements", range(20, 22)))
+    bench.register(add_three).add_int64_power_of_two_axis("elements", range(20, 22))
-    nvbench.register(skipit)
+    bench.register(skipit)
 if __name__ == "__main__":
     register_benchmarks()
-    nvbench.run_all_benchmarks(sys.argv)
+    bench.run_all_benchmarks(sys.argv)

View File

@@ -18,12 +18,12 @@ import ctypes
 import sys
 from typing import Dict, Optional, Tuple
+import cuda.bench as bench
 import cuda.cccl.headers as headers
 import cuda.core.experimental as core
-import cuda.nvbench as nvbench
-def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream:
+def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
     return core.Stream.from_handle(cs.addressof())
@@ -58,7 +58,7 @@ __global__ void sleep_kernel(double seconds) {
     return mod.get_kernel("sleep_kernel")
-def no_axes(state: nvbench.State):
+def no_axes(state: bench.State):
     state.set_min_samples(1000)
     sleep_dur = 1e-3
     krn = make_sleep_kernel()
@@ -66,14 +66,14 @@ def no_axes(state: nvbench.State):
     print(f"Stopping criterion used: {state.get_stopping_criterion()}")
-    def launcher(launch: nvbench.Launch):
+    def launcher(launch: bench.Launch):
         s = as_core_Stream(launch.get_stream())
         core.launch(s, launch_config, krn, sleep_dur)
     state.exec(launcher)
-def tags(state: nvbench.State):
+def tags(state: bench.State):
     state.set_min_samples(1000)
     sleep_dur = 1e-3
     krn = make_sleep_kernel()
@@ -82,28 +82,28 @@ def tags(state: nvbench.State):
     sync_flag = bool(state.get_int64("Sync"))
     batched_flag = bool(state.get_int64("Batched"))
-    def launcher(launch: nvbench.Launch):
+    def launcher(launch: bench.Launch):
         s = as_core_Stream(launch.get_stream())
         core.launch(s, launch_config, krn, sleep_dur)
     state.exec(launcher, sync=sync_flag, batched=batched_flag)
-def single_float64_axis(state: nvbench.State):
+def single_float64_axis(state: bench.State):
     # get axis value, or default
     default_sleep_dur = 3.14e-4
     sleep_dur = state.get_float64_or_default("Duration", default_sleep_dur)
     krn = make_sleep_kernel()
     launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0)
-    def launcher(launch: nvbench.Launch):
+    def launcher(launch: bench.Launch):
         s = as_core_Stream(launch.get_stream())
         core.launch(s, launch_config, krn, sleep_dur)
     state.exec(launcher)
-def default_value(state: nvbench.State):
+def default_value(state: bench.State):
     single_float64_axis(state)
@@ -138,7 +138,7 @@ __global__ void copy_kernel(const T *in, U *out, ::cuda::std::size_t n)
     return mod.get_kernel(instance_name)
-def copy_sweep_grid_shape(state: nvbench.State):
+def copy_sweep_grid_shape(state: bench.State):
     block_size = state.get_int64("BlockSize")
     num_blocks = state.get_int64("NumBlocks")
@@ -158,14 +158,14 @@ def copy_sweep_grid_shape(state: nvbench.State):
     krn = make_copy_kernel()
     launch_config = core.LaunchConfig(grid=num_blocks, block=block_size, shmem_size=0)
-    def launcher(launch: nvbench.Launch):
+    def launcher(launch: bench.Launch):
         s = as_core_Stream(launch.get_stream())
         core.launch(s, launch_config, krn, input_buf, output_buf, num_values)
     state.exec(launcher)
-def copy_type_sweep(state: nvbench.State):
+def copy_type_sweep(state: bench.State):
     type_id = state.get_int64("TypeID")
     types_map: Dict[int, Tuple[type, str]] = {
@@ -196,7 +196,7 @@ def copy_type_sweep(state: nvbench.State):
     krn = make_copy_kernel(value_cuda_t, value_cuda_t)
     launch_config = core.LaunchConfig(grid=256, block=256, shmem_size=0)
-    def launcher(launch: nvbench.Launch):
+    def launcher(launch: bench.Launch):
         s = as_core_Stream(launch.get_stream())
         core.launch(s, launch_config, krn, input_buf, output_buf, num_values)
@@ -205,20 +205,20 @@ def copy_type_sweep(state: nvbench.State):
 if __name__ == "__main__":
     # Benchmark without axes
-    simple_b = nvbench.register(no_axes)
+    simple_b = bench.register(no_axes)
     simple_b.set_stopping_criterion("entropy")
     simple_b.set_criterion_param_int64("unused_int", 100)
-    tags_b = nvbench.register(tags)
+    tags_b = bench.register(tags)
     tags_b.add_int64_axis("Sync", [0, 1])
     tags_b.add_int64_axis("Batched", [0, 1])
     # benchmark with no axes, that uses default value
-    default_b = nvbench.register(default_value)
+    default_b = bench.register(default_value)
     default_b.set_min_samples(7)
     # specify axis
-    axes_b = nvbench.register(single_float64_axis).add_float64_axis(
+    axes_b = bench.register(single_float64_axis).add_float64_axis(
         "Duration", [7e-5, 1e-4, 5e-4]
     )
     axes_b.set_timeout(20)
@@ -226,11 +226,11 @@ if __name__ == "__main__":
     axes_b.set_throttle_threshold(0.2)
     axes_b.set_throttle_recovery_delay(0.1)
-    copy1_bench = nvbench.register(copy_sweep_grid_shape)
+    copy1_bench = bench.register(copy_sweep_grid_shape)
     copy1_bench.add_int64_power_of_two_axis("BlockSize", range(6, 10, 2))
     copy1_bench.add_int64_axis("NumBlocks", [2**x for x in range(6, 10, 2)])
-    copy2_bench = nvbench.register(copy_type_sweep)
+    copy2_bench = bench.register(copy_type_sweep)
     copy2_bench.add_int64_axis("TypeID", range(0, 6))
-    nvbench.run_all_benchmarks(sys.argv)
+    bench.run_all_benchmarks(sys.argv)

View File

@@ -1,31 +1,31 @@
 import json
-import cuda.nvbench as nvbench
+import cuda.bench as bench
 import pytest
 def test_cpp_exception():
     with pytest.raises(RuntimeError, match="Test"):
-        nvbench._nvbench.test_cpp_exception()
+        bench._nvbench.test_cpp_exception()
 def test_py_exception():
-    with pytest.raises(nvbench.NVBenchRuntimeError, match="Test"):
-        nvbench._nvbench.test_py_exception()
+    with pytest.raises(bench.NVBenchRuntimeError, match="Test"):
+        bench._nvbench.test_py_exception()
 @pytest.mark.parametrize(
-    "cls", [nvbench.CudaStream, nvbench.State, nvbench.Launch, nvbench.Benchmark]
+    "cls", [bench.CudaStream, bench.State, bench.Launch, bench.Benchmark]
 )
 def test_api_ctor(cls):
     with pytest.raises(TypeError, match="No constructor defined!"):
         cls()
-def t_bench(state: nvbench.State):
+def t_bench(state: bench.State):
     s = {"a": 1, "b": 0.5, "c": "test", "d": {"a": 1}}
-    def launcher(launch: nvbench.Launch):
+    def launcher(launch: bench.Launch):
         for _ in range(10000):
             _ = json.dumps(s)
@@ -33,7 +33,7 @@ def t_bench(state: nvbench.State):
 def test_cpu_only():
-    b = nvbench.register(t_bench)
+    b = bench.register(t_bench)
     b.set_is_cpu_only(True)
-    nvbench.run_all_benchmarks(["-q", "--profile"])
+    bench.run_all_benchmarks(["-q", "--profile"])