mirror of
https://github.com/NVIDIA/nvbench.git
synced 2026-03-14 20:27:24 +00:00
Merge pull request #237 from oleksandr-pavlyk/add-pynvbench
Python package pynvbench introduced that exposes `cuda.bench` namespace. Repository provides a set of examples.
This commit is contained in:
4
python/.gitignore
vendored
Normal file
4
python/.gitignore
vendored
Normal file
@@ -0,0 +1,4 @@
|
||||
build
|
||||
nvbench_build
|
||||
nvbench_install
|
||||
__pycache__
|
||||
41
python/CMakeLists.txt
Normal file
41
python/CMakeLists.txt
Normal file
@@ -0,0 +1,41 @@
|
||||
cmake_minimum_required(VERSION 3.30...4.0)
|
||||
|
||||
# CUDA is transitive dependency of nvbench
|
||||
project(${SKBUILD_PROJECT_NAME} LANGUAGES CXX CUDA)
|
||||
|
||||
find_package(Python REQUIRED COMPONENTS Development.Module)
|
||||
find_package(CUDAToolkit REQUIRED)
|
||||
|
||||
# Get CMake package manager
|
||||
set(_cpm_download_location ${CMAKE_CURRENT_BINARY_DIR}/cmake/CPM.cmake)
|
||||
file(
|
||||
DOWNLOAD
|
||||
https://github.com/cpm-cmake/CPM.cmake/releases/download/v0.42.0/CPM.cmake
|
||||
${_cpm_download_location}
|
||||
EXPECTED_HASH SHA256=2020b4fc42dba44817983e06342e682ecfc3d2f484a581f11cc5731fbe4dce8a
|
||||
)
|
||||
include(${_cpm_download_location})
|
||||
|
||||
CPMAddPackage(
|
||||
NAME nvbench
|
||||
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/..
|
||||
OPTIONS "NVBench_INSTALL_RULES ON"
|
||||
FIND_PACKAGE_ARGS CONFIG REQUIRED
|
||||
)
|
||||
|
||||
CPMAddPackage("gh:pybind/pybind11@3.0.0")
|
||||
|
||||
pybind11_add_module(_nvbench MODULE src/py_nvbench.cpp)
|
||||
target_link_libraries(_nvbench PUBLIC nvbench::nvbench)
|
||||
target_link_libraries(_nvbench PRIVATE CUDA::cudart_static)
|
||||
|
||||
set_target_properties(_nvbench PROPERTIES INSTALL_RPATH "$ORIGIN")
|
||||
set_target_properties(_nvbench PROPERTIES INTERPROCEDURAL_OPTIMIZATION ON)
|
||||
set_target_properties(_nvbench PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
|
||||
install(TARGETS _nvbench DESTINATION cuda/bench)
|
||||
|
||||
# Determine target that nvbench::nvbench is an alias of,
|
||||
# necessary because ALIAS targets cannot be installed
|
||||
get_target_property(_aliased_target_name nvbench::nvbench ALIASED_TARGET)
|
||||
install(IMPORTED_RUNTIME_ARTIFACTS ${_aliased_target_name} DESTINATION cuda/bench)
|
||||
61
python/README.md
Normal file
61
python/README.md
Normal file
@@ -0,0 +1,61 @@
|
||||
# CUDA Kernel Benchmarking Package
|
||||
|
||||
This package provides Python API to CUDA Kernel Benchmarking Library `NVBench`.
|
||||
|
||||
## Building
|
||||
|
||||
### Ensure recent version of CMake
|
||||
|
||||
Since `nvbench` requires a rather new version of CMake (>=3.30.4), either build CMake from sources, or create a conda environment with a recent version of CMake, using
|
||||
|
||||
```
|
||||
conda create -n build_env --yes cmake ninja
|
||||
conda activate build_env
|
||||
```
|
||||
|
||||
### Ensure CUDA compiler
|
||||
|
||||
Since building `NVBench` library requires CUDA compiler, ensure that appropriate environment variables
|
||||
are set. For example, assuming CUDA toolkit is installed system-wide, and assuming Ampere GPU architecture:
|
||||
|
||||
```bash
|
||||
export CUDACXX=/usr/local/cuda/bin/nvcc
|
||||
export CUDAARCHS=86
|
||||
```
|
||||
|
||||
### Build Python project
|
||||
|
||||
Now switch to python folder, configure and install NVBench library, and install the package in editable mode:
|
||||
|
||||
```bash
|
||||
cd nvbench/python
|
||||
pip install -e .
|
||||
```
|
||||
|
||||
### Verify that package works
|
||||
|
||||
```bash
|
||||
python test/run_1.py
|
||||
```
|
||||
|
||||
### Run examples
|
||||
|
||||
```bash
|
||||
# Example benchmarking numba.cuda kernel
|
||||
python examples/throughput.py
|
||||
```
|
||||
|
||||
```bash
|
||||
# Example benchmarking kernels authored using cuda.core
|
||||
python examples/axes.py
|
||||
```
|
||||
|
||||
```bash
|
||||
# Example benchmarking algorithms from cuda.cccl.parallel
|
||||
python examples/cccl_parallel_segmented_reduce.py
|
||||
```
|
||||
|
||||
```bash
|
||||
# Example benchmarking CuPy function
|
||||
python examples/cupy_extract.py
|
||||
```
|
||||
59
python/cuda/bench/__init__.py
Normal file
59
python/cuda/bench/__init__.py
Normal file
@@ -0,0 +1,59 @@
|
||||
# Copyright 2025 NVIDIA Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 with the LLVM exception
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License.
|
||||
#
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://llvm.org/foundation/relicensing/LICENSE.txt
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import importlib.metadata
|
||||
import warnings
|
||||
|
||||
from cuda.pathfinder import ( # type: ignore[import-not-found]
|
||||
load_nvidia_dynamic_lib,
|
||||
)
|
||||
|
||||
try:
|
||||
__version__ = importlib.metadata.version("pynvbench")
|
||||
except Exception as e:
|
||||
__version__ = "0.0.0dev"
|
||||
warnings.warn(
|
||||
"Could not retrieve version of pynvbench package dynamically from its metadata. "
|
||||
f"Exception {e} was raised. "
|
||||
f"Version is set to fall-back value '{__version__}' instead."
|
||||
)
|
||||
|
||||
for libname in ("cupti", "nvperf_target", "nvperf_host"):
|
||||
load_nvidia_dynamic_lib(libname)
|
||||
|
||||
from cuda.bench._nvbench import ( # noqa: E402
|
||||
Benchmark as Benchmark,
|
||||
)
|
||||
from cuda.bench._nvbench import ( # noqa: E402
|
||||
CudaStream as CudaStream,
|
||||
)
|
||||
from cuda.bench._nvbench import ( # noqa: E402
|
||||
Launch as Launch,
|
||||
)
|
||||
from cuda.bench._nvbench import ( # noqa: E402
|
||||
NVBenchRuntimeError as NVBenchRuntimeError,
|
||||
)
|
||||
from cuda.bench._nvbench import ( # noqa: E402
|
||||
State as State,
|
||||
)
|
||||
from cuda.bench._nvbench import ( # noqa: E402
|
||||
register as register,
|
||||
)
|
||||
from cuda.bench._nvbench import ( # noqa: E402
|
||||
run_all_benchmarks as run_all_benchmarks,
|
||||
)
|
||||
|
||||
del load_nvidia_dynamic_lib
|
||||
312
python/cuda/bench/__init__.pyi
Normal file
312
python/cuda/bench/__init__.pyi
Normal file
@@ -0,0 +1,312 @@
|
||||
# Copyright 2025 NVIDIA Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 with the LLVM exception
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License.
|
||||
#
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://llvm.org/foundation/relicensing/LICENSE.txt
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# ============================================
|
||||
# PLEASE KEEP IN SYNC WITH py_nvbench.cpp FILE
|
||||
# ============================================
|
||||
# Please be sure to keep these type hints and docstring in sync
|
||||
# with the pybind11 bindings in ``../../src/py_nvbench.cpp``
|
||||
|
||||
# Use mypy's stubgen to auto-generate stubs using
|
||||
# ``stubgen -m cuda.nvbench._nvbench`` and compare
|
||||
# stubs in generated out/cuda/nvbench/_nvbench.pyi
|
||||
# with definitions given here.
|
||||
|
||||
from collections.abc import Callable, Sequence
|
||||
from typing import Optional, Self, SupportsFloat, SupportsInt, Union
|
||||
|
||||
class CudaStream:
|
||||
"""Represents CUDA stream
|
||||
|
||||
Note
|
||||
----
|
||||
The class is not user-constructible.
|
||||
"""
|
||||
def __cuda_stream__(self) -> tuple[int, int]:
|
||||
"""
|
||||
Special method implementing the CUDA stream protocol
|
||||
from `cuda.core`. Returns a pair of integers:
|
||||
(protocol_version, integral_value_of_cudaStream_t pointer)
|
||||
|
||||
Example
|
||||
-------
|
||||
import cuda.core.experimental as core
|
||||
import cuda.bench as bench
|
||||
|
||||
def bench(state: bench.State):
|
||||
dev = core.Device(state.get_device())
|
||||
dev.set_current()
|
||||
# converts CudaStream to core.Stream
|
||||
# using __cuda_stream__ protocol
|
||||
dev.create_stream(state.get_stream())
|
||||
"""
|
||||
...
|
||||
|
||||
def addressof(self) -> int:
|
||||
"Integral value of address of driver's CUDA stream struct"
|
||||
...
|
||||
|
||||
class Benchmark:
|
||||
"""Represents NVBench benchmark.
|
||||
|
||||
Note
|
||||
----
|
||||
The class is not user-constructible.
|
||||
|
||||
Use `~register` function to create Benchmark and register
|
||||
it with NVBench.
|
||||
"""
|
||||
def get_name(self) -> str:
|
||||
"Get benchmark name"
|
||||
...
|
||||
def add_int64_axis(self, name: str, values: Sequence[SupportsInt]) -> Self:
|
||||
"Add integral type parameter axis with given name and values to sweep over"
|
||||
...
|
||||
def add_int64_power_of_two_axis(
|
||||
self, name: str, values: Sequence[SupportsInt]
|
||||
) -> Self:
|
||||
"Add integral type parameter axis with given name and values to sweep over"
|
||||
...
|
||||
def add_float64_axis(self, name: str, values: Sequence[SupportsFloat]) -> Self:
|
||||
"Add floating-point type parameter axis with given name and values to sweep over"
|
||||
...
|
||||
def add_string_axis(self, name: str, values: Sequence[str]) -> Self:
|
||||
"Add string type parameter axis with given name and values to sweep over"
|
||||
...
|
||||
def set_name(self, name: str) -> Self:
|
||||
"Set benchmark name"
|
||||
...
|
||||
def set_is_cpu_only(self, is_cpu_only: bool) -> Self:
|
||||
"Set whether this benchmark only executes on CPU"
|
||||
...
|
||||
def set_run_once(self, v: bool) -> Self:
|
||||
"Set whether all benchmark configurations are executed only once"
|
||||
...
|
||||
def set_skip_time(self, duration_seconds: SupportsFloat) -> Self:
|
||||
"Set run durations, in seconds, that should be skipped"
|
||||
...
|
||||
def set_throttle_recovery_delay(self, delay_seconds: SupportsFloat) -> Self:
|
||||
"Set throttle recovery delay, in seconds"
|
||||
...
|
||||
def set_throttle_threshold(self, threshold: SupportsFloat) -> Self:
|
||||
"Set throttle threshold, as a fraction of maximal GPU frequency"
|
||||
...
|
||||
def set_timeout(self, duration_seconds: SupportsFloat) -> Self:
|
||||
"Set benchmark run duration timeout value, in seconds"
|
||||
...
|
||||
def set_stopping_criterion(self, criterion: str) -> Self:
|
||||
"Set stopping criterion to be used"
|
||||
...
|
||||
def set_criterion_param_float64(self, name: str, value: SupportsFloat) -> Self:
|
||||
"Set stopping criterion floating point parameter value"
|
||||
...
|
||||
def set_criterion_param_int64(self, name: str, value: SupportsInt) -> Self:
|
||||
"Set stopping criterion integer parameter value"
|
||||
...
|
||||
def set_criterion_param_string(self, name: str, value: str) -> Self:
|
||||
"Set stopping criterion string parameter value"
|
||||
...
|
||||
def set_min_samples(self, count: SupportsInt) -> Self:
|
||||
"Set minimal samples count before stopping criterion applies"
|
||||
...
|
||||
|
||||
class Launch:
|
||||
"""Configuration object for function launch.
|
||||
|
||||
Note
|
||||
----
|
||||
The class is not user-constructible.
|
||||
"""
|
||||
def get_stream(self) -> CudaStream:
|
||||
"Get CUDA stream of this configuration"
|
||||
...
|
||||
|
||||
class State:
|
||||
"""Represents benchmark configuration state.
|
||||
|
||||
Note
|
||||
----
|
||||
The class is not user-constructible.
|
||||
"""
|
||||
def has_device(self) -> bool:
|
||||
"True if configuration has a device"
|
||||
...
|
||||
def has_printers(self) -> bool:
|
||||
"True if configuration has a printer"
|
||||
...
|
||||
def get_device(self) -> Union[int, None]:
|
||||
"Get device_id of the device from this configuration"
|
||||
...
|
||||
def get_stream(self) -> CudaStream:
|
||||
"CudaStream object from this configuration"
|
||||
...
|
||||
def get_int64(self, name: str) -> int:
|
||||
"Get value for given Int64 axis from this configuration"
|
||||
...
|
||||
def get_int64_or_default(self, name: str, default_value: SupportsInt) -> int:
|
||||
"Get value for given Int64 axis from this configuration"
|
||||
...
|
||||
def get_float64(self, name: str) -> float:
|
||||
"Get value for given Float64 axis from this configuration"
|
||||
...
|
||||
def get_float64_or_default(self, name: str, default_value: SupportsFloat) -> float:
|
||||
"Get value for given Float64 axis from this configuration"
|
||||
...
|
||||
def get_string(self, name: str) -> str:
|
||||
"Get value for given String axis from this configuration"
|
||||
...
|
||||
def get_string_or_default(self, name: str, default_value: str) -> str:
|
||||
"Get value for given String axis from this configuration"
|
||||
...
|
||||
def add_element_count(
|
||||
self, count: SupportsInt, column_name: Optional[str] = None
|
||||
) -> None:
|
||||
"Add element count"
|
||||
...
|
||||
def set_element_count(self, count: SupportsInt) -> None:
|
||||
"Set element count"
|
||||
...
|
||||
def get_element_count(self) -> int:
|
||||
"Get element count"
|
||||
...
|
||||
def skip(self, reason: str) -> None:
|
||||
"Skip this configuration"
|
||||
...
|
||||
def is_skipped(self) -> bool:
|
||||
"Has this configuration been skipped"
|
||||
...
|
||||
def get_skip_reason(self) -> str:
|
||||
"Get reason provided for skipping this configuration"
|
||||
...
|
||||
def add_global_memory_reads(
|
||||
self, nbytes: SupportsInt, /, column_name: str = ""
|
||||
) -> None:
|
||||
"Inform NVBench that given amount of bytes is being read by the benchmark from global memory"
|
||||
...
|
||||
def add_global_memory_writes(
|
||||
self, nbytes: SupportsInt, /, column_name: str = ""
|
||||
) -> None:
|
||||
"Inform NVBench that given amount of bytes is being written by the benchmark into global memory"
|
||||
...
|
||||
def get_benchmark(self) -> Benchmark:
|
||||
"Get Benchmark this configuration is a part of"
|
||||
...
|
||||
def get_throttle_threshold(self) -> float:
|
||||
"Get throttle threshold value, as fraction of maximal frequency"
|
||||
...
|
||||
def set_throttle_threshold(self, threshold_fraction: SupportsFloat) -> None:
|
||||
"Set throttle threshold fraction to specified value, expected to be between 0 and 1"
|
||||
...
|
||||
def get_min_samples(self) -> int:
|
||||
"Get the number of benchmark timings NVBench performs before stopping criterion begins being used"
|
||||
...
|
||||
def set_min_samples(self, min_samples_count: SupportsInt) -> None:
|
||||
"Set the number of benchmark timings for NVBench to perform before stopping criterion begins being used"
|
||||
...
|
||||
def get_disable_blocking_kernel(self) -> bool:
|
||||
"True if use of blocking kernel by NVBench is disabled, False otherwise"
|
||||
...
|
||||
def set_disable_blocking_kernel(self, flag: bool) -> None:
|
||||
"Use flag = True to disable use of blocking kernel by NVBench"
|
||||
...
|
||||
def get_run_once(self) -> bool:
|
||||
"Boolean flag whether configuration should only run once"
|
||||
...
|
||||
def set_run_once(self, run_once_flag: bool) -> None:
|
||||
"Set run-once flag for this configuration"
|
||||
...
|
||||
def get_timeout(self) -> float:
|
||||
"Get time-out value for benchmark execution of this configuration, in seconds"
|
||||
...
|
||||
def set_timeout(self, duration: SupportsFloat) -> None:
|
||||
"Set time-out value for benchmark execution of this configuration, in seconds"
|
||||
...
|
||||
def get_blocking_kernel_timeout(self) -> float:
|
||||
"Get time-out value for execution of blocking kernel, in seconds"
|
||||
...
|
||||
def set_blocking_kernel_timeout(self, duration: SupportsFloat) -> None:
|
||||
"Set time-out value for execution of blocking kernel, in seconds"
|
||||
...
|
||||
def collect_cupti_metrics(self) -> None:
|
||||
"Request NVBench to record CUPTI metrics while running benchmark for this configuration"
|
||||
...
|
||||
def is_cupti_required(self) -> bool:
|
||||
"True if (some) CUPTI metrics are being collected"
|
||||
...
|
||||
def exec(
|
||||
self,
|
||||
fn: Callable[[Launch], None],
|
||||
/,
|
||||
*,
|
||||
batched: Optional[bool] = True,
|
||||
sync: Optional[bool] = False,
|
||||
):
|
||||
"""Execute callable running the benchmark.
|
||||
|
||||
The callable may be executed multiple times.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
fn: Callable
|
||||
Python callable with signature fn(Launch) -> None that executes the benchmark.
|
||||
batched: bool, optional
|
||||
If `True`, no cache flushing is performed between callable invocations.
|
||||
Default: `True`.
|
||||
sync: bool, optional
|
||||
True value indicates that callable performs device synchronization.
|
||||
NVBench disables use of blocking kernel in this case.
|
||||
Default: `False`.
|
||||
"""
|
||||
...
|
||||
def get_short_description(self) -> str:
|
||||
"Get short description for this configuration"
|
||||
...
|
||||
def add_summary(
|
||||
self, column_name: str, value: Union[SupportsInt, SupportsFloat, str]
|
||||
) -> None:
|
||||
"Add summary column with a value"
|
||||
...
|
||||
def get_axis_values(self) -> dict[str, int | float | str]:
|
||||
"Get dictionary with axis values for this configuration"
|
||||
...
|
||||
def get_axis_values_as_string(self) -> str:
|
||||
"Get string of space-separated name=value pairs for this configuration"
|
||||
...
|
||||
def get_stopping_criterion(self) -> str:
|
||||
"Get string name of stopping criterion used"
|
||||
...
|
||||
|
||||
def register(fn: Callable[[State], None]) -> Benchmark:
|
||||
"""
|
||||
Register given benchmarking function with NVBench.
|
||||
"""
|
||||
...
|
||||
|
||||
def run_all_benchmarks(argv: Sequence[str]) -> None:
|
||||
"""
|
||||
Run all benchmarks registered with NVBench.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
argv: Sequence[str]
|
||||
Sequence of CLI arguments controlling NVBench. Usually, it is `sys.argv`.
|
||||
"""
|
||||
...
|
||||
|
||||
class NVBenchRuntimeError(RuntimeError):
|
||||
"""An exception raised if running benchmarks encounters an error"""
|
||||
|
||||
...
|
||||
0
python/cuda/bench/py.typed
Normal file
0
python/cuda/bench/py.typed
Normal file
75
python/examples/auto_throughput.py
Normal file
75
python/examples/auto_throughput.py
Normal file
@@ -0,0 +1,75 @@
|
||||
# Copyright 2025 NVIDIA Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 with the LLVM exception
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License.
|
||||
#
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://llvm.org/foundation/relicensing/LICENSE.txt
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import sys
|
||||
|
||||
import cuda.bench as bench
|
||||
import numpy as np
|
||||
from numba import cuda
|
||||
|
||||
|
||||
def as_cuda_stream(cs: bench.CudaStream) -> cuda.cudadrv.driver.Stream:
|
||||
return cuda.external_stream(cs.addressof())
|
||||
|
||||
|
||||
def make_throughput_kernel(items_per_thread: int) -> cuda.dispatcher.CUDADispatcher:
|
||||
@cuda.jit
|
||||
def kernel(stride: np.uintp, elements: np.uintp, in_arr, out_arr):
|
||||
tid = cuda.grid(1)
|
||||
step = cuda.gridDim.x * cuda.blockDim.x
|
||||
for i in range(stride * tid, stride * elements, stride * step):
|
||||
for j in range(items_per_thread):
|
||||
read_id = (items_per_thread * i + j) % elements
|
||||
write_id = tid + j * elements
|
||||
out_arr[write_id] = in_arr[read_id]
|
||||
|
||||
return kernel
|
||||
|
||||
|
||||
def throughput_bench(state: bench.State) -> None:
|
||||
stride = state.get_int64("Stride")
|
||||
ipt = state.get_int64("ItemsPerThread")
|
||||
|
||||
nbytes = 128 * 1024 * 1024
|
||||
elements = nbytes // np.dtype(np.int32).itemsize
|
||||
|
||||
alloc_stream = as_cuda_stream(state.get_stream())
|
||||
inp_arr = cuda.device_array(elements, dtype=np.int32, stream=alloc_stream)
|
||||
out_arr = cuda.device_array(elements * ipt, dtype=np.int32, stream=alloc_stream)
|
||||
|
||||
state.add_element_count(elements, column_name="Elements")
|
||||
state.collect_cupti_metrics()
|
||||
|
||||
threads_per_block = 256
|
||||
blocks_in_grid = (elements + threads_per_block - 1) // threads_per_block
|
||||
|
||||
krn = make_throughput_kernel(ipt)
|
||||
|
||||
def launcher(launch: bench.Launch):
|
||||
exec_stream = as_cuda_stream(launch.get_stream())
|
||||
krn[blocks_in_grid, threads_per_block, exec_stream, 0](
|
||||
stride, elements, inp_arr, out_arr
|
||||
)
|
||||
|
||||
state.exec(launcher)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
b = bench.register(throughput_bench)
|
||||
b.add_int64_axis("Stride", [1, 2, 4])
|
||||
b.add_int64_axis("ItemsPerThread", [1, 2, 3, 4])
|
||||
|
||||
bench.run_all_benchmarks(sys.argv)
|
||||
206
python/examples/axes.py
Normal file
206
python/examples/axes.py
Normal file
@@ -0,0 +1,206 @@
|
||||
# Copyright 2025 NVIDIA Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 with the LLVM exception
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License.
|
||||
#
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://llvm.org/foundation/relicensing/LICENSE.txt
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import ctypes
|
||||
import sys
|
||||
from typing import Dict, Optional, Tuple
|
||||
|
||||
import cuda.bench as bench
|
||||
import cuda.cccl.headers as headers
|
||||
import cuda.core.experimental as core
|
||||
|
||||
|
||||
def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
|
||||
return core.Stream.from_handle(cs.addressof())
|
||||
|
||||
|
||||
def make_sleep_kernel():
|
||||
"""JITs sleep_kernel(seconds)"""
|
||||
src = r"""
|
||||
#include <cuda/std/cstdint>
|
||||
#include <cuda/std/chrono>
|
||||
|
||||
// Each launched thread just sleeps for `seconds`.
|
||||
__global__ void sleep_kernel(double seconds) {
|
||||
namespace chrono = ::cuda::std::chrono;
|
||||
using hr_clock = chrono::high_resolution_clock;
|
||||
|
||||
auto duration = static_cast<cuda::std::int64_t>(seconds * 1e9);
|
||||
const auto ns = chrono::nanoseconds(duration);
|
||||
|
||||
const auto start = hr_clock::now();
|
||||
const auto finish = start + ns;
|
||||
|
||||
auto now = hr_clock::now();
|
||||
while (now < finish)
|
||||
{
|
||||
now = hr_clock::now();
|
||||
}
|
||||
}
|
||||
"""
|
||||
incl = headers.get_include_paths()
|
||||
opts = core.ProgramOptions(include_path=str(incl.libcudacxx))
|
||||
prog = core.Program(src, code_type="c++", options=opts)
|
||||
mod = prog.compile("cubin", name_expressions=("sleep_kernel",))
|
||||
return mod.get_kernel("sleep_kernel")
|
||||
|
||||
|
||||
def simple(state: bench.State):
|
||||
state.set_min_samples(1000)
|
||||
sleep_dur = 1e-3
|
||||
krn = make_sleep_kernel()
|
||||
launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0)
|
||||
|
||||
def launcher(launch: bench.Launch):
|
||||
s = as_core_Stream(launch.get_stream())
|
||||
core.launch(s, launch_config, krn, sleep_dur)
|
||||
|
||||
state.exec(launcher)
|
||||
|
||||
|
||||
def single_float64_axis(state: bench.State):
|
||||
# get axis value, or default
|
||||
default_sleep_dur = 3.14e-4
|
||||
sleep_dur = state.get_float64_or_default("Duration", default_sleep_dur)
|
||||
krn = make_sleep_kernel()
|
||||
launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0)
|
||||
|
||||
def launcher(launch: bench.Launch):
|
||||
s = as_core_Stream(launch.get_stream())
|
||||
core.launch(s, launch_config, krn, sleep_dur)
|
||||
|
||||
state.exec(launcher)
|
||||
|
||||
|
||||
def default_value(state: bench.State):
|
||||
single_float64_axis(state)
|
||||
|
||||
|
||||
def make_copy_kernel(in_type: Optional[str] = None, out_type: Optional[str] = None):
|
||||
src = r"""
|
||||
#include <cuda/std/cstdint>
|
||||
#include <cuda/std/cstddef>
|
||||
/*!
|
||||
* Naive copy of `n` values from `in` -> `out`.
|
||||
*/
|
||||
template <typename T, typename U>
|
||||
__global__ void copy_kernel(const T *in, U *out, ::cuda::std::size_t n)
|
||||
{
|
||||
const auto init = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const auto step = blockDim.x * gridDim.x;
|
||||
|
||||
for (auto i = init; i < n; i += step)
|
||||
{
|
||||
out[i] = static_cast<U>(in[i]);
|
||||
}
|
||||
}
|
||||
"""
|
||||
incl = headers.get_include_paths()
|
||||
opts = core.ProgramOptions(include_path=str(incl.libcudacxx))
|
||||
prog = core.Program(src, code_type="c++", options=opts)
|
||||
if in_type is None:
|
||||
in_type = "::cuda::std::int32_t"
|
||||
if out_type is None:
|
||||
out_type = "::cuda::std::int32_t"
|
||||
instance_name = f"copy_kernel<{in_type}, {out_type}>"
|
||||
mod = prog.compile("cubin", name_expressions=(instance_name,))
|
||||
return mod.get_kernel(instance_name)
|
||||
|
||||
|
||||
def copy_sweep_grid_shape(state: bench.State):
|
||||
block_size = state.get_int64("BlockSize")
|
||||
num_blocks = state.get_int64("NumBlocks")
|
||||
|
||||
# Number of int32 elements in 256MiB
|
||||
nbytes = 256 * 1024 * 1024
|
||||
num_values = nbytes // ctypes.sizeof(ctypes.c_int32(0))
|
||||
|
||||
state.add_element_count(num_values)
|
||||
state.add_global_memory_reads(nbytes)
|
||||
state.add_global_memory_writes(nbytes)
|
||||
|
||||
dev_id = state.get_device()
|
||||
alloc_s = as_core_Stream(state.get_stream())
|
||||
input_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_s)
|
||||
output_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_s)
|
||||
|
||||
krn = make_copy_kernel()
|
||||
launch_config = core.LaunchConfig(grid=num_blocks, block=block_size, shmem_size=0)
|
||||
|
||||
def launcher(launch: bench.Launch):
|
||||
s = as_core_Stream(launch.get_stream())
|
||||
core.launch(s, launch_config, krn, input_buf, output_buf, num_values)
|
||||
|
||||
state.exec(launcher)
|
||||
|
||||
|
||||
def copy_type_sweep(state: bench.State):
|
||||
type_id = state.get_int64("TypeID")
|
||||
|
||||
types_map: Dict[int, Tuple[type, str]] = {
|
||||
0: (ctypes.c_uint8, "cuda::std::uint8_t"),
|
||||
1: (ctypes.c_uint16, "cuda::std::uint16_t"),
|
||||
2: (ctypes.c_uint32, "cuda::std::uint32_t"),
|
||||
3: (ctypes.c_uint64, "cuda::std::uint64_t"),
|
||||
4: (ctypes.c_float, "float"),
|
||||
5: (ctypes.c_double, "double"),
|
||||
}
|
||||
|
||||
value_ctype, value_cuda_t = types_map[type_id]
|
||||
state.add_summary("Type", value_cuda_t)
|
||||
|
||||
# Number of elements in 256MiB
|
||||
nbytes = 256 * 1024 * 1024
|
||||
num_values = nbytes // ctypes.sizeof(value_ctype)
|
||||
|
||||
state.add_element_count(num_values)
|
||||
state.add_global_memory_reads(nbytes)
|
||||
state.add_global_memory_writes(nbytes)
|
||||
|
||||
dev_id = state.get_device()
|
||||
alloc_s = as_core_Stream(state.get_stream())
|
||||
input_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_s)
|
||||
output_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_s)
|
||||
|
||||
krn = make_copy_kernel(value_cuda_t, value_cuda_t)
|
||||
launch_config = core.LaunchConfig(grid=256, block=256, shmem_size=0)
|
||||
|
||||
def launcher(launch: bench.Launch):
|
||||
s = as_core_Stream(launch.get_stream())
|
||||
core.launch(s, launch_config, krn, input_buf, output_buf, num_values)
|
||||
|
||||
state.exec(launcher)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Benchmark without axes
|
||||
bench.register(simple)
|
||||
|
||||
# benchmark with no axes, that uses default value
|
||||
bench.register(default_value)
|
||||
# specify axis
|
||||
bench.register(single_float64_axis).add_float64_axis(
|
||||
"Duration (s)", [7e-5, 1e-4, 5e-4]
|
||||
)
|
||||
|
||||
copy1_bench = bench.register(copy_sweep_grid_shape)
|
||||
copy1_bench.add_int64_axis("BlockSize", [2**x for x in range(6, 10, 2)])
|
||||
copy1_bench.add_int64_axis("NumBlocks", [2**x for x in range(6, 10, 2)])
|
||||
|
||||
copy2_bench = bench.register(copy_type_sweep)
|
||||
copy2_bench.add_int64_axis("TypeID", range(0, 6))
|
||||
|
||||
bench.run_all_benchmarks(sys.argv)
|
||||
99
python/examples/cccl_cooperative_block_reduce.py
Normal file
99
python/examples/cccl_cooperative_block_reduce.py
Normal file
@@ -0,0 +1,99 @@
|
||||
# Copyright 2025 NVIDIA Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 with the LLVM exception
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License.
|
||||
#
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://llvm.org/foundation/relicensing/LICENSE.txt
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import sys
|
||||
|
||||
import cuda.bench as bench
|
||||
import cuda.cccl.cooperative.experimental as coop
|
||||
import numba
|
||||
import numpy as np
|
||||
from numba import cuda
|
||||
from pynvjitlink import patch
|
||||
|
||||
|
||||
class BitsetRing:
|
||||
"""
|
||||
Addition operation over the ring of fixed-width unsigned integers
|
||||
with ring_plus = bitwise_or and ring_mul = bitwise_and,
|
||||
ring_zero = 0, ring_one = -1
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self.dt = np.uint64
|
||||
self.zero = self.dt(0)
|
||||
self.one = np.bitwise_invert(self.zero)
|
||||
|
||||
@staticmethod
|
||||
def add(op1, op2):
|
||||
return op1 | op2
|
||||
|
||||
@staticmethod
|
||||
def mul(op1, op2):
|
||||
return op1 & op2
|
||||
|
||||
|
||||
def as_cuda_Stream(cs: bench.CudaStream) -> cuda.cudadrv.driver.Stream:
|
||||
return cuda.external_stream(cs.addressof())
|
||||
|
||||
|
||||
def multi_block_bench(state: bench.State):
    """Benchmark a per-block OR-reduction built with numba-cuda cooperative ops."""
    threads_per_block = state.get_int64("ThreadsPerBlock")
    num_blocks = state.get_int64("NumBlocks")
    total_elements = threads_per_block * num_blocks

    # Guard against configurations with an excessive memory footprint.
    if total_elements > 2**26:
        state.skip(reason="Memory footprint over threshold")
        return

    ring = BitsetRing()
    block_reduce = coop.block.reduce(numba.uint64, threads_per_block, BitsetRing.add)

    @cuda.jit(link=block_reduce.files)
    def kernel(inp_arr, out_arr):
        # One input element per thread.
        bid = cuda.blockIdx.x
        tid = cuda.threadIdx.x
        gid = bid * threads_per_block + tid

        reduced = block_reduce(inp_arr[gid])

        # Only thread 0 of each block writes the block's result.
        if tid == 0:
            out_arr[bid] = reduced

    h_inp = np.arange(1, total_elements + 1, dtype=ring.dt)
    d_inp = cuda.to_device(h_inp)
    d_out = cuda.device_array(num_blocks, dtype=ring.dt)

    # Report element count and memory traffic for throughput columns.
    state.add_element_count(total_elements)
    state.add_global_memory_reads(total_elements * h_inp.itemsize)
    state.add_global_memory_writes(num_blocks * h_inp.itemsize)

    def launcher(launch: bench.Launch):
        nb_stream = as_cuda_Stream(launch.get_stream())
        kernel[num_blocks, threads_per_block, nb_stream, 0](d_inp, d_out)

    state.exec(launcher)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Enable LTO-aware linking in numba-cuda before any JIT compilation.
    patch.patch_numba_linker(lto=True)

    bmark = bench.register(multi_block_bench)
    bmark.add_int64_axis("ThreadsPerBlock", [64, 128, 192, 256])
    bmark.add_int64_power_of_two_axis("NumBlocks", [10, 11, 12, 14, 16])

    bench.run_all_benchmarks(sys.argv)
|
||||
124
python/examples/cccl_parallel_segmented_reduce.py
Normal file
124
python/examples/cccl_parallel_segmented_reduce.py
Normal file
@@ -0,0 +1,124 @@
|
||||
# Copyright 2025 NVIDIA Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 with the LLVM exception
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License.
|
||||
#
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://llvm.org/foundation/relicensing/LICENSE.txt
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import sys
|
||||
|
||||
import cuda.bench as bench
|
||||
import cuda.cccl.parallel.experimental.algorithms as algorithms
|
||||
import cuda.cccl.parallel.experimental.iterators as iterators
|
||||
import cuda.core.experimental as core
|
||||
import cupy as cp
|
||||
import numpy as np
|
||||
|
||||
|
||||
class CCCLStream:
    """Minimal CUDA stream-protocol provider.

    Works around https://github.com/NVIDIA/cccl/issues/5144 by exposing a raw
    stream handle through ``__cuda_stream__``.
    """

    def __init__(self, ptr):
        # raw stream handle kept as a plain integer
        self._ptr = ptr

    def __cuda_stream__(self):
        # protocol tuple: (version, handle)
        return (0, self._ptr)
|
||||
|
||||
|
||||
def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
    """View the NVBench stream as a non-owning cuda.core Stream."""
    return core.Stream.from_handle(cs.addressof())


def as_cccl_Stream(cs: bench.CudaStream) -> CCCLStream:
    """Wrap the NVBench stream handle for cuda.cccl.parallel calls."""
    return CCCLStream(cs.addressof())


def as_cp_ExternalStream(
    cs: bench.CudaStream, dev_id: int | None = -1
) -> cp.cuda.ExternalStream:
    """View the NVBench stream as a CuPy ExternalStream on device ``dev_id``."""
    handle = cs.addressof()
    return cp.cuda.ExternalStream(handle, dev_id)
|
||||
|
||||
|
||||
def segmented_reduce(state: bench.State):
    """Benchmark cuda.cccl.parallel segmented_reduce summing rows of a matrix.

    Axes: numElems (total matrix size), numCols (row length);
    numRows is derived and reported as a summary.
    """
    n_elems = state.get_int64("numElems")
    n_cols = state.get_int64("numCols")
    n_rows = n_elems // n_cols

    state.add_summary("numRows", n_rows)
    state.collect_cupti_metrics()

    dev_id = state.get_device()
    cp_stream = as_cp_ExternalStream(state.get_stream(), dev_id)

    def add_op(a, b):
        return a + b

    def make_scaler(step):
        def scale(row_id):
            return row_id * step

        return scale

    zero = np.int32(0)
    row_offset = make_scaler(np.int32(n_cols))
    # start_offsets[i] = i * n_cols; advancing the iterator by one position
    # gives end_offsets[i] = (i + 1) * n_cols
    start_offsets = iterators.TransformIterator(
        iterators.CountingIterator(zero), row_offset
    )

    end_offsets = start_offsets + 1

    h_init = np.zeros(tuple(), dtype=np.int32)
    with cp_stream:
        rng = cp.random.default_rng()
        mat = rng.integers(low=-31, high=32, dtype=np.int32, size=(n_rows, n_cols))
        d_input = mat
        d_output = cp.empty(n_rows, dtype=d_input.dtype)

    alg = algorithms.segmented_reduce(
        d_input, d_output, start_offsets, end_offsets, add_op, h_init
    )

    cccl_stream = as_cccl_Stream(state.get_stream())

    # query size of temporary storage and allocate
    temp_nbytes = alg(
        None, d_input, d_output, n_rows, start_offsets, end_offsets, h_init, cccl_stream
    )
    # Fix: removed a redundant re-creation of h_init here; the scalar is
    # unchanged by the size query and is reused as-is below.

    with cp_stream:
        temp_storage = cp.empty(temp_nbytes, dtype=cp.uint8)

    def launcher(launch: bench.Launch):
        s = as_cccl_Stream(launch.get_stream())
        alg(
            temp_storage,
            d_input,
            d_output,
            n_rows,
            start_offsets,
            end_offsets,
            h_init,
            s,
        )

    state.exec(launcher)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    bmark = bench.register(segmented_reduce)
    bmark.add_int64_axis("numElems", [2**20, 2**22, 2**24])
    bmark.add_int64_axis("numCols", [1024, 2048, 4096, 8192])

    bench.run_all_benchmarks(sys.argv)
|
||||
97
python/examples/cpu_activity.py
Normal file
97
python/examples/cpu_activity.py
Normal file
@@ -0,0 +1,97 @@
|
||||
# Copyright 2025 NVIDIA Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 with the LLVM exception
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License.
|
||||
#
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://llvm.org/foundation/relicensing/LICENSE.txt
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import sys
|
||||
import time
|
||||
|
||||
import cuda.bench as bench
|
||||
import cuda.cccl.headers as headers
|
||||
import cuda.core.experimental as core
|
||||
|
||||
host_sleep_duration = 0.1
|
||||
|
||||
|
||||
def cpu_only_sleep_bench(state: bench.State) -> None:
    """CPU-only benchmark: the launcher merely sleeps on the host."""

    def launcher(launch: bench.Launch):
        time.sleep(host_sleep_duration)

    state.exec(launcher)
|
||||
|
||||
|
||||
def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
    """View the NVBench stream as a non-owning cuda.core Stream."""
    handle = cs.addressof()
    return core.Stream.from_handle(handle)
|
||||
|
||||
|
||||
def make_sleep_kernel():
    """JIT-compile and return a ``sleep_kernel(seconds)`` GPU kernel.

    Compiles the embedded CUDA C++ source to a cubin with cuda.core, using the
    libcu++ headers shipped by cuda-cccl.
    """
    src = r"""
    #include <cuda/std/cstdint>
    #include <cuda/std/chrono>

    // Each launched thread just sleeps for `seconds`.
    __global__ void sleep_kernel(double seconds) {
        namespace chrono = ::cuda::std::chrono;
        using hr_clock = chrono::high_resolution_clock;

        auto duration = static_cast<cuda::std::int64_t>(seconds * 1e9);
        const auto ns = chrono::nanoseconds(duration);

        const auto start = hr_clock::now();
        const auto finish = start + ns;

        auto now = hr_clock::now();
        while (now < finish)
        {
            now = hr_clock::now();
        }
    }
    """
    incl = headers.get_include_paths()
    opts = core.ProgramOptions(include_path=str(incl.libcudacxx))
    prog = core.Program(src, code_type="c++", options=opts)
    mod = prog.compile("cubin", name_expressions=("sleep_kernel",))
    return mod.get_kernel("sleep_kernel")
|
||||
|
||||
|
||||
def mixed_sleep_bench(state: bench.State) -> None:
    """Benchmark combined host-side sleep plus a GPU sleep kernel.

    The "Sync" string axis controls whether state.exec synchronizes.
    """
    sync = state.get_string("Sync")
    do_sync = sync == "Do sync"

    gpu_sleep_dur = 225e-3
    krn = make_sleep_kernel()
    cfg = core.LaunchConfig(grid=1, block=1, shmem_size=0)

    def launcher(launch: bench.Launch):
        # host overhead
        time.sleep(host_sleep_duration)
        # GPU computation on the measured stream
        stream = as_core_Stream(launch.get_stream())
        core.launch(stream, cfg, krn, gpu_sleep_dur)

    state.exec(launcher, sync=do_sync)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # time a function doing work (sleeping) only on the host,
    # using the CPU timer only
    cpu_bench = bench.register(cpu_only_sleep_bench)
    cpu_bench.set_is_cpu_only(True)

    # time the function that does work on both GPU and CPU
    mixed_bench = bench.register(mixed_sleep_bench)
    mixed_bench.add_string_axis("Sync", ["Do not sync", "Do sync"])

    bench.run_all_benchmarks(sys.argv)
|
||||
63
python/examples/cupy_extract.py
Normal file
63
python/examples/cupy_extract.py
Normal file
@@ -0,0 +1,63 @@
|
||||
# Copyright 2025 NVIDIA Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 with the LLVM exception
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License.
|
||||
#
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://llvm.org/foundation/relicensing/LICENSE.txt
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import sys
|
||||
|
||||
import cuda.bench as bench
|
||||
import cupy as cp
|
||||
|
||||
|
||||
def as_cp_ExternalStream(
    cs: bench.CudaStream, dev_id: int | None = -1
) -> cp.cuda.ExternalStream:
    """View the NVBench stream as a CuPy ExternalStream on device ``dev_id``."""
    handle = cs.addressof()
    return cp.cuda.ExternalStream(handle, dev_id)
|
||||
|
||||
|
||||
def cupy_extract_by_mask(state: bench.State):
    """Benchmark boolean-mask extraction (``X[mask]``) with CuPy."""
    n_cols = state.get_int64("numCols")
    n_rows = state.get_int64("numRows")

    dev_id = state.get_device()
    cp_s = as_cp_ExternalStream(state.get_stream(), dev_id)

    state.collect_cupti_metrics()
    state.add_element_count(n_rows * n_cols, "# Elements")
    int32_dt = cp.dtype(cp.int32)
    bool_dt = cp.dtype(cp.bool_)
    # reads: the data array plus the mask array
    state.add_global_memory_reads(
        n_rows * n_cols * (int32_dt.itemsize + bool_dt.itemsize)
    )
    # mask is all-True, so the extracted output equals the input size
    state.add_global_memory_writes(n_rows * n_cols * (int32_dt.itemsize))

    with cp_s:
        X = cp.full((n_cols, n_rows), fill_value=3, dtype=int32_dt)
        mask = cp.ones((n_cols, n_rows), dtype=bool_dt)
        # warm-up extraction so kernel compilation/caching is not timed
        _ = X[mask]

    def launcher(launch: bench.Launch):
        with as_cp_ExternalStream(launch.get_stream(), dev_id):
            _ = X[mask]

    state.exec(launcher, sync=True)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    bmark = bench.register(cupy_extract_by_mask)
    bmark.add_int64_axis("numCols", [1024, 2048, 4096, 2 * 4096])
    bmark.add_int64_axis("numRows", [1024, 2048, 4096, 2 * 4096])

    bench.run_all_benchmarks(sys.argv)
|
||||
113
python/examples/cutlass_gemm.py
Normal file
113
python/examples/cutlass_gemm.py
Normal file
@@ -0,0 +1,113 @@
|
||||
# Copyright 2025 NVIDIA Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 with the LLVM exception
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License.
|
||||
#
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://llvm.org/foundation/relicensing/LICENSE.txt
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import sys
|
||||
|
||||
import cuda.bench as bench
|
||||
import cuda.bindings.driver as driver
|
||||
import cuda.core.experimental as core
|
||||
import cupy as cp
|
||||
import cutlass
|
||||
import numpy as np
|
||||
|
||||
|
||||
def as_bindings_Stream(cs: bench.CudaStream) -> driver.CUstream:
    """Wrap the NVBench stream handle as a cuda.bindings CUstream."""
    return driver.CUstream(cs.addressof())


def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
    """View the NVBench stream as a non-owning cuda.core Stream."""
    return core.Stream.from_handle(cs.addressof())
|
||||
|
||||
|
||||
def make_cp_array(
    arr_h: np.ndarray, dev_buf: core.Buffer, dev_id: int | None
) -> cp.ndarray:
    """Create a CuPy view over ``dev_buf`` with ``arr_h``'s shape and dtype.

    No data is copied; passing ``dev_buf`` as the UnownedMemory owner keeps the
    underlying allocation alive for the lifetime of the returned array.
    """
    mem = cp.cuda.UnownedMemory(
        int(dev_buf.handle), dev_buf.size, dev_buf, -1 if dev_id is None else dev_id
    )
    return cp.ndarray(
        arr_h.shape,
        dtype=arr_h.dtype,
        memptr=cp.cuda.MemoryPointer(mem, 0),
    )
|
||||
|
||||
|
||||
def cutlass_gemm(state: bench.State) -> None:
    """Benchmark ``D = alpha * A @ B + C`` with CUTLASS (row-major float64)."""
    n = state.get_int64("N")
    r = state.get_int64("R")

    alpha = state.get_float64("alpha")

    dt = np.float64
    A_h = np.random.randn(n, r).astype(dt)
    B_h = np.copy(A_h.mT)
    C_h = np.eye(n, dtype=dt)
    D_h = np.zeros_like(C_h)

    if n >= 1024:
        # allow more time for large inputs
        state.set_timeout(360)

    dev_id = state.get_device()
    cs = state.get_stream()
    s = as_bindings_Stream(cs)
    core_s = as_core_Stream(cs)

    # asynchronous device allocations on NVBench's stream
    A_d = core.DeviceMemoryResource(dev_id).allocate(A_h.nbytes, core_s)
    B_d = core.DeviceMemoryResource(dev_id).allocate(B_h.nbytes, core_s)
    C_d = core.DeviceMemoryResource(dev_id).allocate(C_h.nbytes, core_s)
    D_d = core.DeviceMemoryResource(dev_id).allocate(D_h.nbytes, core_s)

    # upload host operands
    driver.cuMemcpyAsync(A_d.handle, A_h.ctypes.data, A_h.nbytes, s)
    driver.cuMemcpyAsync(B_d.handle, B_h.ctypes.data, B_h.nbytes, s)
    driver.cuMemcpyAsync(C_d.handle, C_h.ctypes.data, C_h.nbytes, s)
    driver.cuMemcpyAsync(D_d.handle, D_h.ctypes.data, D_h.nbytes, s)

    # CuPy views over the device buffers for CUTLASS
    A_cp = make_cp_array(A_h, A_d, dev_id)
    B_cp = make_cp_array(B_h, B_d, dev_id)
    C_cp = make_cp_array(C_h, C_d, dev_id)
    D_cp = make_cp_array(D_h, D_d, dev_id)

    plan = cutlass.op.Gemm(
        A=A_cp,
        B=B_cp,
        C=C_cp,
        D=D_cp,
        element=dt,
        alpha=alpha,
        beta=1,
        layout=cutlass.LayoutType.RowMajor,
    )
    # warm-up to ensure compilation is not timed
    plan.run(stream=s)

    def launcher(launch: bench.Launch) -> None:
        run_s = as_bindings_Stream(launch.get_stream())
        plan.run(stream=run_s, sync=False)

    state.exec(launcher)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    gemm_b = bench.register(cutlass_gemm)
    gemm_b.add_int64_axis("R", [16, 64, 256])
    gemm_b.add_int64_axis("N", [256, 512, 1024, 2048])

    gemm_b.add_float64_axis("alpha", [1e-2])

    bench.run_all_benchmarks(sys.argv)
|
||||
85
python/examples/exec_tag_sync.py
Normal file
85
python/examples/exec_tag_sync.py
Normal file
@@ -0,0 +1,85 @@
|
||||
# Copyright 2025 NVIDIA Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 with the LLVM exception
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License.
|
||||
#
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://llvm.org/foundation/relicensing/LICENSE.txt
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import ctypes
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
import cuda.bench as bench
|
||||
import cuda.cccl.headers as headers
|
||||
import cuda.core.experimental as core
|
||||
|
||||
|
||||
def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
    "Create view of native stream used by NVBench"
    handle = cs.addressof()
    return core.Stream.from_handle(handle)
|
||||
|
||||
|
||||
def make_fill_kernel(data_type: Optional[str] = None):
    """JIT-compile ``fill_kernel<data_type>``.

    ``data_type`` defaults to ``::cuda::std::int32_t`` when None.
    """
    src = r"""
    #include <cuda/std/cstdint>
    #include <cuda/std/cstddef>
    /*!
     * Naive setting of values in buffer
     */
    template <typename T>
    __global__ void fill_kernel(T *buf, T v, ::cuda::std::size_t n)
    {
        const auto init = blockIdx.x * blockDim.x + threadIdx.x;
        const auto step = blockDim.x * gridDim.x;

        for (auto i = init; i < n; i += step)
        {
            buf[i] = v;
        }
    }
    """
    if data_type is None:
        data_type = "::cuda::std::int32_t"
    instance_name = f"fill_kernel<{data_type}>"

    incl = headers.get_include_paths()
    opts = core.ProgramOptions(include_path=str(incl.libcudacxx))
    prog = core.Program(src, code_type="c++", options=opts)
    mod = prog.compile("cubin", name_expressions=(instance_name,))
    return mod.get_kernel(instance_name)
|
||||
|
||||
|
||||
def synchronizing_bench(state: bench.State):
    """Benchmark a fill kernel whose launcher synchronizes the stream itself."""
    n_values = 64 * 1024 * 1024
    n_bytes = n_values * ctypes.sizeof(ctypes.c_int32(0))

    alloc_s = as_core_Stream(state.get_stream())
    buffer = core.DeviceMemoryResource(state.get_device()).allocate(n_bytes, alloc_s)

    state.add_element_count(n_values, "Items")
    state.add_global_memory_writes(n_bytes, "Size")

    krn = make_fill_kernel()
    cfg = core.LaunchConfig(grid=256, block=256, shmem_size=0)

    def launcher(launch: bench.Launch):
        stream = as_core_Stream(launch.get_stream())
        core.launch(stream, cfg, krn, buffer, 0, n_values)
        stream.sync()

    # since launcher contains a synchronization point,
    # setting sync=True is required to avoid a deadlock
    state.exec(launcher, sync=True)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # no axes: a single configuration is run
    bench.register(synchronizing_bench)
    bench.run_all_benchmarks(sys.argv)
|
||||
58
python/examples/pytorch_bench.py
Normal file
58
python/examples/pytorch_bench.py
Normal file
@@ -0,0 +1,58 @@
|
||||
import sys
|
||||
|
||||
import cuda.bench as bench
|
||||
import torch
|
||||
|
||||
|
||||
def as_torch_cuda_Stream(
    cs: bench.CudaStream, dev: int | None
) -> torch.cuda.ExternalStream:
    """View the NVBench stream as a PyTorch ExternalStream on device ``dev``."""
    handle = cs.addressof()
    return torch.cuda.ExternalStream(
        stream_ptr=handle, device=torch.cuda.device(dev)
    )
|
||||
|
||||
|
||||
def torch_bench(state: bench.State) -> None:
    """Benchmark one hand-written gradient step of a degree-3 polynomial fit
    to sin(x), with all work issued on NVBench's stream."""
    state.set_throttle_threshold(0.25)

    dev_id = state.get_device()
    tc_s = as_torch_cuda_Stream(state.get_stream(), dev_id)

    dt = torch.float32
    scalar_shape: tuple = tuple()
    n = 2**28
    with torch.cuda.stream(tc_s):
        # scalar coefficients and the sampled target curve
        a3 = torch.randn(scalar_shape, dtype=dt)
        a2 = torch.randn(scalar_shape, dtype=dt)
        a1 = torch.randn(scalar_shape, dtype=dt)
        a0 = torch.randn(scalar_shape, dtype=dt)
        x = torch.linspace(-3, 3, n, dtype=dt)
        y = torch.sin(x)

    learning_rate = 1e-4

    def launcher(launch: bench.Launch) -> None:
        stream = as_torch_cuda_Stream(launch.get_stream(), dev_id)
        with torch.cuda.stream(stream):
            # forward pass (factored polynomial evaluation)
            x2 = torch.square(x)
            y_pred = (a3 + x2 * a1) + x * (a2 + a0 * x2)

            _ = torch.square(y_pred - y).sum()
            # manual gradients of the squared-error loss
            grad_y_pred = 2 * (y_pred - y)
            grad_a3 = grad_y_pred.sum()
            grad_a2 = (grad_y_pred * x).sum()
            grad_a1 = (grad_y_pred * x2).sum()
            grad_a0 = (grad_y_pred * x2 * x).sum()

            # gradient-descent updates; results discarded (compute-only bench)
            _ = a3 - grad_a3 * learning_rate
            _ = a2 - grad_a2 * learning_rate
            _ = a1 - grad_a1 * learning_rate
            _ = a0 - grad_a0 * learning_rate

    state.exec(launcher, sync=True)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # no axes: a single configuration is run
    bench.register(torch_bench)

    bench.run_all_benchmarks(sys.argv)
|
||||
7
python/examples/requirements.txt
Normal file
7
python/examples/requirements.txt
Normal file
@@ -0,0 +1,7 @@
|
||||
numpy
|
||||
numba
|
||||
cupy
|
||||
nvidia-cutlass
|
||||
cuda-cccl
|
||||
cuda-core
|
||||
cuda-bindings
|
||||
89
python/examples/skip.py
Normal file
89
python/examples/skip.py
Normal file
@@ -0,0 +1,89 @@
|
||||
# Copyright 2025 NVIDIA Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 with the LLVM exception
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License.
|
||||
#
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://llvm.org/foundation/relicensing/LICENSE.txt
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import sys
|
||||
|
||||
import cuda.bench as bench
|
||||
import cuda.cccl.headers as headers
|
||||
import cuda.core.experimental as core
|
||||
|
||||
|
||||
def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
    "Create view into native stream provided by NVBench"
    handle = cs.addressof()
    return core.Stream.from_handle(handle)
|
||||
|
||||
|
||||
def make_sleep_kernel():
    """JIT-compile and return a ``sleep_kernel(seconds)`` GPU kernel."""
    src = r"""
    #include <cuda/std/cstdint>
    #include <cuda/std/chrono>

    // Each launched thread just sleeps for `seconds`.
    __global__ void sleep_kernel(double seconds) {
        namespace chrono = ::cuda::std::chrono;
        using hr_clock = chrono::high_resolution_clock;

        auto duration = static_cast<cuda::std::int64_t>(seconds * 1e9);
        const auto ns = chrono::nanoseconds(duration);

        const auto start = hr_clock::now();
        const auto finish = start + ns;

        auto now = hr_clock::now();
        while (now < finish)
        {
            now = hr_clock::now();
        }
    }
    """
    incl = headers.get_include_paths()
    opts = core.ProgramOptions(include_path=str(incl.libcudacxx))
    prog = core.Program(src, code_type="c++", options=opts)
    mod = prog.compile("cubin", name_expressions=("sleep_kernel",))
    return mod.get_kernel("sleep_kernel")
|
||||
|
||||
|
||||
def runtime_skip(state: bench.State):
    """Demonstrate run-time skipping via ``state.skip`` based on axis values."""
    duration = state.get_float64("Duration")
    kramble = state.get_string("Kramble")

    # skip 'Baz' runs shorter than 0.8 ms
    if kramble == "Baz" and duration < 0.8e-3:
        state.skip("Short 'Baz' benchmarks are skipped")
        return

    # skip 'Foo' runs longer than 0.3 ms
    if kramble == "Foo" and duration > 0.3e-3:
        state.skip("Long 'Foo' benchmarks are skipped")
        return

    krn = make_sleep_kernel()
    cfg = core.LaunchConfig(grid=1, block=1, shmem_size=0)

    def launcher(launch: bench.Launch):
        stream = as_core_Stream(launch.get_stream())
        core.launch(stream, cfg, krn, duration)

    state.exec(launcher)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    bmark = bench.register(runtime_skip)
    bmark.add_float64_axis("Duration", [1e-4 + k * 0.25e-3 for k in range(5)])
    bmark.add_string_axis("Kramble", ["Foo", "Bar", "Baz"])

    bench.run_all_benchmarks(sys.argv)
|
||||
76
python/examples/throughput.py
Normal file
76
python/examples/throughput.py
Normal file
@@ -0,0 +1,76 @@
|
||||
# Copyright 2025 NVIDIA Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 with the LLVM exception
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License.
|
||||
#
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://llvm.org/foundation/relicensing/LICENSE.txt
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import sys
|
||||
|
||||
import cuda.bench as bench
|
||||
import numpy as np
|
||||
from numba import cuda
|
||||
|
||||
|
||||
def as_cuda_stream(cs: bench.CudaStream) -> cuda.cudadrv.driver.Stream:
    """Wrap the NVBench-owned stream as a Numba external stream."""
    handle = cs.addressof()
    return cuda.external_stream(handle)
|
||||
|
||||
|
||||
def make_throughput_kernel(items_per_thread: int) -> cuda.dispatcher.CUDADispatcher:
    """Build a JIT copy kernel that moves ``items_per_thread`` values per step."""

    @cuda.jit
    def kernel(stride: np.uintp, elements: np.uintp, in_arr, out_arr):
        tid = cuda.grid(1)
        step = cuda.gridDim.x * cuda.blockDim.x
        # strided sweep; each outer iteration copies items_per_thread values
        for i in range(stride * tid, stride * elements, stride * step):
            for j in range(items_per_thread):
                read_id = (items_per_thread * i + j) % elements
                write_id = tid + j * elements
                out_arr[write_id] = in_arr[read_id]

    return kernel
|
||||
|
||||
|
||||
def throughput_bench(state: bench.State) -> None:
    """Benchmark a strided copy kernel and report achieved memory throughput."""
    stride = state.get_int64("Stride")
    ipt = state.get_int64("ItemsPerThread")

    nbytes = 128 * 1024 * 1024
    elements = nbytes // np.dtype(np.int32).itemsize

    alloc_stream = as_cuda_stream(state.get_stream())
    inp_arr = cuda.device_array(elements, dtype=np.int32, stream=alloc_stream)
    out_arr = cuda.device_array(elements * ipt, dtype=np.int32, stream=alloc_stream)

    state.add_element_count(elements, column_name="Elements")
    state.add_global_memory_reads(inp_arr.nbytes, column_name="Datasize")
    # Fix: the kernel writes all of out_arr (elements * ipt int32 values);
    # previously only inp_arr.nbytes was counted, understating write traffic.
    state.add_global_memory_writes(out_arr.nbytes)

    threads_per_block = 256
    blocks_in_grid = (elements + threads_per_block - 1) // threads_per_block

    krn = make_throughput_kernel(ipt)

    def launcher(launch: bench.Launch):
        exec_stream = as_cuda_stream(launch.get_stream())
        krn[blocks_in_grid, threads_per_block, exec_stream, 0](
            stride, elements, inp_arr, out_arr
        )

    state.exec(launcher)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    bmark = bench.register(throughput_bench)
    bmark.add_int64_axis("Stride", [1, 2, 4])
    bmark.add_int64_axis("ItemsPerThread", [1, 2, 3, 4])

    bench.run_all_benchmarks(sys.argv)
|
||||
59
python/pyproject.toml
Normal file
59
python/pyproject.toml
Normal file
@@ -0,0 +1,59 @@
|
||||
[build-system]
|
||||
requires = ["scikit-build-core>=0.10", "setuptools_scm"]
|
||||
build-backend = "scikit_build_core.build"
|
||||
|
||||
[project]
|
||||
name = "pynvbench"
|
||||
description = "CUDA Kernel Benchmarking Package"
|
||||
authors = [{ name = "NVIDIA Corporation" }]
|
||||
classifiers = [
|
||||
"Programming Language :: Python :: 3 :: Only",
|
||||
"Environment :: GPU :: NVIDIA CUDA",
|
||||
"License :: OSI Approved :: Apache Software License",
|
||||
]
|
||||
requires-python = ">=3.9"
|
||||
dependencies = [
|
||||
# pathfinder
|
||||
"cuda-pathfinder",
|
||||
|
||||
# Library expects to find shared libraries
|
||||
# libcupti, libnvperf_target, libnvperf_host
|
||||
# pathfinder is used to find it in the Python layout
|
||||
"nvidia-cuda-cupti-cu12",
|
||||
|
||||
# The shared library
|
||||
# libnvidia-ml must be installed system-wide
|
||||
# (Debian package provider: libnvidia-compute)
|
||||
]
|
||||
dynamic = ["version"]
|
||||
readme = { file = "README.md", content-type = "text/markdown" }
|
||||
|
||||
[project.optional-dependencies]
|
||||
test = ["pytest", "cupy-cuda12x", "numba"]
|
||||
|
||||
[project.urls]
|
||||
Homepage = "https://developer.nvidia.com/"
|
||||
|
||||
[tool.scikit-build]
|
||||
minimum-version = "build-system.requires"
|
||||
build-dir = "build/{wheel_tag}"
|
||||
|
||||
[tool.scikit-build.cmake]
|
||||
version = ">=3.30.4"
|
||||
args = []
|
||||
build-type = "Release"
|
||||
source-dir = "."
|
||||
|
||||
[tool.scikit-build.ninja]
|
||||
version = ">=1.11"
|
||||
make-fallback = true
|
||||
|
||||
[tool.scikit-build.metadata.version]
|
||||
provider = "scikit_build_core.metadata.setuptools_scm"
|
||||
|
||||
[tool.setuptools_scm]
|
||||
root = ".."
|
||||
|
||||
[tool.scikit-build.wheel.packages]
|
||||
"cuda" = "cuda"
|
||||
"cuda/bench" = "cuda/bench"
|
||||
24
python/src/.BUILD_LOCALLY.md
Normal file
24
python/src/.BUILD_LOCALLY.md
Normal file
@@ -0,0 +1,24 @@
|
||||
# Building extension module locally
|
||||
|
||||
This may be useful for debugging purposes.
|
||||
Assuming NVBench is installed into `NVBENCH_PREFIX`,
|
||||
and pybind11 repo is cloned to `PYBIND11_DIR`,
|
||||
The Python extension can be built locally, using the host compiler,
|
||||
as follows:
|
||||
|
||||
```bash
|
||||
g++ py_nvbench.cpp \
|
||||
-shared -fPIC \
|
||||
-I ${PYBIND11_DIR}/include \
|
||||
-I ${NVBENCH_PREFIX}/include \
|
||||
-I /usr/local/cuda/include \
|
||||
$(python3-config --includes) \
|
||||
$(python3-config --libs) \
|
||||
-L ${NVBENCH_PREFIX}/lib/ \
|
||||
-lnvbench \
|
||||
-Wl,-rpath,${NVBENCH_PREFIX}/lib \
|
||||
-L /usr/local/cuda/lib64/ \
|
||||
-lcudart \
|
||||
-Wl,-rpath,/usr/local/cuda/lib64 \
|
||||
-o _nvbench$(python3-config --extension-suffix)
|
||||
```
|
||||
734
python/src/py_nvbench.cpp
Normal file
734
python/src/py_nvbench.cpp
Normal file
@@ -0,0 +1,734 @@
|
||||
/*
|
||||
* Copyright 2025 NVIDIA Corporation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 with the LLVM exception
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License.
|
||||
*
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://llvm.org/foundation/relicensing/LICENSE.txt
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// clang-format off
|
||||
// Include Pybind11 headers first thing
|
||||
#include <pybind11/pybind11.h>
|
||||
#include <pybind11/stl.h>
|
||||
// clang-format on
|
||||
|
||||
#include <nvbench/nvbench.cuh>
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
namespace py = pybind11;
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
struct PyObjectDeleter
|
||||
{
|
||||
void operator()(py::object *p)
|
||||
{
|
||||
const bool initialized = Py_IsInitialized();
|
||||
|
||||
#if PY_VERSION_HEX < 0x30d0000
|
||||
const bool finalizing = _Py_IsFinalizing();
|
||||
#else
|
||||
const bool finalizing = Py_IsFinalizing();
|
||||
#endif
|
||||
const bool guard = initialized && !finalizing;
|
||||
|
||||
// deleter only call ~object if interpreter is active and
|
||||
// not shutting down, let OS clean up resources after
|
||||
// interpreter tear-down
|
||||
if (guard)
|
||||
{
|
||||
delete p;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
struct benchmark_wrapper_t
|
||||
{
|
||||
|
||||
benchmark_wrapper_t() = default;
|
||||
|
||||
explicit benchmark_wrapper_t(py::object o)
|
||||
: m_fn{std::shared_ptr<py::object>(new py::object(std::move(o)), PyObjectDeleter{})}
|
||||
{
|
||||
if (!PyCallable_Check(m_fn->ptr()))
|
||||
{
|
||||
throw py::value_error("Argument must be a callable");
|
||||
}
|
||||
}
|
||||
|
||||
// Only copy constructor is used, delete copy-assign, and moves
|
||||
benchmark_wrapper_t(const benchmark_wrapper_t &other) = default;
|
||||
benchmark_wrapper_t &operator=(const benchmark_wrapper_t &other) = delete;
|
||||
benchmark_wrapper_t(benchmark_wrapper_t &&) noexcept = delete;
|
||||
benchmark_wrapper_t &operator=(benchmark_wrapper_t &&) noexcept = delete;
|
||||
|
||||
void operator()(nvbench::state &state, nvbench::type_list<>)
|
||||
{
|
||||
if (!m_fn)
|
||||
{
|
||||
throw std::runtime_error("No function to execute");
|
||||
}
|
||||
// box as Python object, using reference semantics
|
||||
auto arg = py::cast(std::ref(state), py::return_value_policy::reference);
|
||||
|
||||
// Execute Python callable
|
||||
(*m_fn)(arg);
|
||||
}
|
||||
|
||||
private:
|
||||
// Important to use shared pointer here rather than py::object directly,
|
||||
// since copy constructor must be const (benchmark::do_clone is const member method)
|
||||
std::shared_ptr<py::object> m_fn;
|
||||
};
|
||||
|
||||
// Use struct to ensure public inheritance
|
||||
// Dedicated exception type used to surface nvbench failures to Python
// (bound below as NVBenchRuntimeError). Declared as a struct to make the
// inheritance from std::runtime_error public.
struct nvbench_run_error : std::runtime_error
{
  // Pull in every constructor overload of the base class.
  using std::runtime_error::runtime_error;
};
|
||||
py::handle benchmark_exc{};
|
||||
|
||||
// Owner of the module-level benchmark lifecycle. Wraps the global
// nvbench::benchmark_manager singleton and enforces the ordering contract:
// all registrations must happen before the single run(), and neither
// registration-after-run nor a second run() is allowed.
class GlobalBenchmarkRegistry
{
  // Flips to true on the first (and only permitted) call to run().
  bool m_finalized;

public:
  GlobalBenchmarkRegistry()
      : m_finalized(false) {};

  // The registry is a unique global: forbid copies and moves.
  GlobalBenchmarkRegistry(const GlobalBenchmarkRegistry &) = delete;
  GlobalBenchmarkRegistry &operator=(const GlobalBenchmarkRegistry &) = delete;

  GlobalBenchmarkRegistry(GlobalBenchmarkRegistry &&) = delete;
  GlobalBenchmarkRegistry &operator=(GlobalBenchmarkRegistry &&) = delete;

  // True once run() has been entered; registration is closed afterwards.
  bool is_finalized() const { return m_finalized; }

  // Register a Python callable as an nvbench benchmark.
  // The benchmark name is taken from fn.__name__ when present, otherwise
  // from repr(fn). Returns a reference to the registered benchmark so axes
  // etc. can be configured on it from Python.
  // Throws std::runtime_error if run() already happened, py::value_error if
  // fn is not callable.
  nvbench::benchmark_base &add_bench(py::object fn)
  {
    if (m_finalized)
    {
      throw std::runtime_error("Can not register more benchmarks after benchmark was run");
    }
    if (!PyCallable_Check(fn.ptr()))
    {
      throw py::value_error("Benchmark should be a callable object");
    }
    std::string name;
    if (py::hasattr(fn, "__name__"))
    {
      py::str py_name = fn.attr("__name__");
      name = py::cast<std::string>(py_name);
    }
    else
    {
      // No __name__ (e.g. functools.partial): fall back to repr(fn).
      py::str py_name = py::repr(fn);
      name = py::cast<std::string>(py_name);
    }
    benchmark_wrapper_t executor(fn);

    return nvbench::benchmark_manager::get()
      .add(std::make_unique<nvbench::benchmark<benchmark_wrapper_t>>(executor))
      .set_name(std::move(name));
  }

  // Run all registered benchmarks exactly once, driving the nvbench CLI
  // machinery with the given argv. All C++ exceptions are translated into
  // the module's NVBenchRuntimeError (benchmark_exc); a Python exception
  // raised inside a benchmark callable is chained onto it via raise_from.
  void run(const std::vector<std::string> &argv)
  {
    if (nvbench::benchmark_manager::get().get_benchmarks().empty())
    {
      throw std::runtime_error("No benchmarks had been registered yet");
    }
    if (m_finalized)
    {
      throw std::runtime_error("Benchmarks were already executed");
    }
    m_finalized = true;

    try
    {
      // This line is mandatory for correctness to populate
      // benchmark with devices requested by user via CLI
      nvbench::benchmark_manager::get().initialize();
      {
        nvbench::option_parser parser{};
        parser.parse(argv);

        NVBENCH_MAIN_PRINT_PREAMBLE(parser);
        NVBENCH_MAIN_RUN_BENCHMARKS(parser);
        NVBENCH_MAIN_PRINT_EPILOGUE(parser);

        NVBENCH_MAIN_PRINT_RESULTS(parser);
      } /* Tear down parser before finalization */
    }
    catch (py::error_already_set &e)
    {
      // Chain the in-flight Python exception under NVBenchRuntimeError.
      py::raise_from(e, benchmark_exc.ptr(), "Python error raised ");
      throw py::error_already_set();
    }
    catch (const std::exception &e)
    {
      const std::string &exc_message = e.what();
      py::set_error(benchmark_exc, exc_message.c_str());
      throw py::error_already_set();
    }
    catch (...)
    {
      // Unknown (non-std) exception: still surface as NVBenchRuntimeError.
      py::set_error(benchmark_exc, "Caught unknown exception in nvbench_main");
      throw py::error_already_set();
    }
  }
};
|
||||
|
||||
// Convert the state's axis name/value pairs into a Python dict mapping
// axis name -> axis value for the current configuration.
py::dict py_get_axis_values(const nvbench::state &state)
{
  py::dict result;

  auto axis_values = state.get_axis_values();
  for (const auto &axis_name : axis_values.get_names())
  {
    if (!axis_values.has_value(axis_name))
    {
      continue;
    }
    result[axis_name.c_str()] = py::cast(axis_values.get_value(axis_name));
  }

  return result;
}
|
||||
|
||||
// Essentially a global variable, but allocated on the heap during module
// initialization. py::nodelete means the registry is intentionally leaked:
// its destructor could otherwise run after the interpreter has shut down.
std::unique_ptr<GlobalBenchmarkRegistry, py::nodelete> global_registry{};
|
||||
|
||||
} // end of anonymous namespace
|
||||
|
||||
// ==========================================
|
||||
// PLEASE KEEP IN SYNC WITH __init__.pyi FILE
|
||||
// ==========================================
|
||||
// If you modify these bindings, please be sure to update the
|
||||
// corresponding type hints in ``../cuda/nvbench/__init__.pyi``
|
||||
|
||||
PYBIND11_MODULE(_nvbench, m)
{
  // == STEP 1
  // Initialize the CUDA driver up front so device queries below succeed.
  // NOTE(review): the original comment said "Set environment variable
  // CUDA_MODULE_LOADING=EAGER", but no such call is visible here —
  // presumably handled by the Python package before import; TODO confirm.
  NVBENCH_DRIVER_API_CALL(cuInit(0));

  // This line ensures that benchmark_manager has been created during module init.
  // It is reinitialized before running all benchmarks to set devices to use.
  nvbench::benchmark_manager::get().initialize();

  // == STEP 2
  // Define CudaStream class wrapping nvbench::cuda_stream.
  // ATTN: nvbench::cuda_stream is a move-only class.
  // Exposes:
  //   __cuda_stream__ -> (0, address) pair consumed by CUDA-Python interop
  //   addressof       -> raw stream address as an integer
  auto py_cuda_stream_cls = py::class_<nvbench::cuda_stream>(m, "CudaStream");

  py_cuda_stream_cls.def("__cuda_stream__",
                         [](const nvbench::cuda_stream &s) -> std::pair<std::size_t, std::size_t> {
                           return std::make_pair(std::size_t{0},
                                                 reinterpret_cast<std::size_t>(s.get_stream()));
                         });
  py_cuda_stream_cls.def("addressof", [](const nvbench::cuda_stream &s) -> std::size_t {
    return reinterpret_cast<std::size_t>(s.get_stream());
  });

  // == STEP 3
  // Define Launch class wrapping nvbench::launch.
  // ATTN: nvbench::launch is a move-only class.
  // Exposes get_stream() -> CudaStream (by reference; stream is owned by
  // the launch, not by Python).
  auto py_launch_cls = py::class_<nvbench::launch>(m, "Launch");

  py_launch_cls.def(
    "get_stream",
    [](nvbench::launch &launch) { return std::ref(launch.get_stream()); },
    py::return_value_policy::reference);

  // == STEP 4
  // Define Benchmark class wrapping nvbench::benchmark_base.
  // ATTN: nvbench::benchmark_base is a move-only class.
  // Bound methods: get_name, add_int64_axis, add_int64_power_of_two_axis,
  // add_float64_axis, add_string_axis, set_name, set_is_cpu_only,
  // set_run_once, set_skip_time, set_timeout, set_throttle_threshold,
  // set_throttle_recovery_delay, set_stopping_criterion,
  // set_criterion_param_{int64,float64,string}, set_min_samples.
  // Every setter returns the benchmark itself (reference policy) so that
  // configuration calls can be chained from Python.
  auto py_benchmark_cls = py::class_<nvbench::benchmark_base>(m, "Benchmark");
  py_benchmark_cls.def("get_name", &nvbench::benchmark_base::get_name);
  py_benchmark_cls.def(
    "add_int64_axis",
    [](nvbench::benchmark_base &self, std::string name, std::vector<nvbench::int64_t> data) {
      self.add_int64_axis(std::move(name), std::move(data));
      return std::ref(self);
    },
    py::return_value_policy::reference,
    py::arg("name"),
    py::arg("values"));
  py_benchmark_cls.def(
    "add_int64_power_of_two_axis",
    [](nvbench::benchmark_base &self, std::string name, std::vector<nvbench::int64_t> data) {
      // Same underlying axis as add_int64_axis, values interpreted as exponents.
      self.add_int64_axis(std::move(name),
                          std::move(data),
                          nvbench::int64_axis_flags::power_of_two);
      return std::ref(self);
    },
    py::return_value_policy::reference,
    py::arg("name"),
    py::arg("values"));
  py_benchmark_cls.def(
    "add_float64_axis",
    [](nvbench::benchmark_base &self, std::string name, std::vector<nvbench::float64_t> data) {
      self.add_float64_axis(std::move(name), std::move(data));
      return std::ref(self);
    },
    py::return_value_policy::reference,
    py::arg("name"),
    py::arg("values"));
  py_benchmark_cls.def(
    "add_string_axis",
    [](nvbench::benchmark_base &self, std::string name, std::vector<std::string> data) {
      self.add_string_axis(std::move(name), std::move(data));
      return std::ref(self);
    },
    py::return_value_policy::reference,
    py::arg("name"),
    py::arg("values"));
  py_benchmark_cls.def(
    "set_name",
    [](nvbench::benchmark_base &self, std::string name) {
      self.set_name(std::move(name));
      return std::ref(self);
    },
    py::return_value_policy::reference,
    py::arg("name"));
  py_benchmark_cls.def(
    "set_is_cpu_only",
    [](nvbench::benchmark_base &self, bool is_cpu_only) {
      self.set_is_cpu_only(is_cpu_only);
      return std::ref(self);
    },
    py::return_value_policy::reference,
    py::arg("is_cpu_only"));
  // TODO: should this be exposed?
  py_benchmark_cls.def(
    "set_run_once",
    [](nvbench::benchmark_base &self, bool run_once) {
      self.set_run_once(run_once);
      return std::ref(self);
    },
    py::return_value_policy::reference,
    py::arg("run_once"));
  py_benchmark_cls.def(
    "set_skip_time",
    [](nvbench::benchmark_base &self, nvbench::float64_t skip_duration_seconds) {
      self.set_skip_time(skip_duration_seconds);
      return std::ref(self);
    },
    py::return_value_policy::reference,
    py::arg("duration_seconds"));
  py_benchmark_cls.def(
    "set_timeout",
    [](nvbench::benchmark_base &self, nvbench::float64_t duration_seconds) {
      self.set_timeout(duration_seconds);
      return std::ref(self);
    },
    py::return_value_policy::reference,
    py::arg("duration_seconds"));
  py_benchmark_cls.def(
    "set_throttle_threshold",
    [](nvbench::benchmark_base &self, nvbench::float32_t threshold) {
      self.set_throttle_threshold(threshold);
      return std::ref(self);
    },
    py::return_value_policy::reference,
    py::arg("threshold"));
  py_benchmark_cls.def(
    "set_throttle_recovery_delay",
    [](nvbench::benchmark_base &self, nvbench::float32_t delay) {
      self.set_throttle_recovery_delay(delay);
      return std::ref(self);
    },
    py::return_value_policy::reference,
    py::arg("delay_seconds"));
  py_benchmark_cls.def(
    "set_stopping_criterion",
    [](nvbench::benchmark_base &self, std::string criterion) {
      self.set_stopping_criterion(std::move(criterion));
      return std::ref(self);
    },
    py::return_value_policy::reference,
    py::arg("criterion"));
  py_benchmark_cls.def(
    "set_criterion_param_int64",
    [](nvbench::benchmark_base &self, std::string name, nvbench::int64_t value) {
      self.set_criterion_param_int64(std::move(name), value);
      return std::ref(self);
    },
    py::return_value_policy::reference,
    py::arg("name"),
    py::arg("value"));
  py_benchmark_cls.def(
    "set_criterion_param_float64",
    [](nvbench::benchmark_base &self, std::string name, nvbench::float64_t value) {
      self.set_criterion_param_float64(std::move(name), value);
      return std::ref(self);
    },
    py::return_value_policy::reference,
    py::arg("name"),
    py::arg("value"));
  py_benchmark_cls.def(
    "set_criterion_param_string",
    [](nvbench::benchmark_base &self, std::string name, std::string value) {
      self.set_criterion_param_string(std::move(name), std::move(value));
      return std::ref(self);
    },
    py::return_value_policy::reference,
    py::arg("name"),
    py::arg("value"));
  py_benchmark_cls.def(
    "set_min_samples",
    [](nvbench::benchmark_base &self, nvbench::int64_t count) {
      self.set_min_samples(count);
      return std::ref(self);
    },
    py::return_value_policy::reference,
    py::arg("min_samples_count"));

  // == STEP 5
  // Define State class wrapping nvbench::state.
  // ATTN: nvbench::state is a move-only class; instances are always created
  // by nvbench and handed to Python by reference, never constructed there.
  // The subset of nvbench::state bound below covers axis access, stream
  // access, memory/element accounting, skip control, sampling/timeout
  // knobs, CUPTI collection, summaries and exec.
  // PLEASE KEEP IN SYNC WITH ../cuda/nvbench/__init__.pyi.

  // NOTE(review): this alias is currently unused in the bindings below.
  using state_ref_t = std::reference_wrapper<nvbench::state>;
  auto pystate_cls = py::class_<nvbench::state>(m, "State");

  pystate_cls.def("has_device", [](const nvbench::state &state) -> bool {
    return static_cast<bool>(state.get_device());
  });
  pystate_cls.def("has_printers", [](const nvbench::state &state) -> bool {
    return state.get_benchmark().get_printer().has_value();
  });
  // Returns the device id as an int, or None when no device is associated.
  pystate_cls.def("get_device", [](const nvbench::state &state) {
    auto dev = state.get_device();
    if (dev.has_value())
    {
      return py::cast(dev.value().get_id());
    }
    return py::object(py::none());
  });

  pystate_cls.def(
    "get_stream",
    [](nvbench::state &state) { return std::ref(state.get_cuda_stream()); },
    py::return_value_policy::reference);

  pystate_cls.def("get_int64", &nvbench::state::get_int64, py::arg("name"));
  pystate_cls.def("get_int64_or_default",
                  &nvbench::state::get_int64_or_default,
                  py::arg("name"),
                  py::pos_only{},
                  py::arg("default_value"));

  pystate_cls.def("get_float64", &nvbench::state::get_float64, py::arg("name"));
  pystate_cls.def("get_float64_or_default",
                  &nvbench::state::get_float64_or_default,
                  py::arg("name"),
                  py::pos_only{},
                  py::arg("default_value"));

  pystate_cls.def("get_string", &nvbench::state::get_string, py::arg("name"));
  pystate_cls.def("get_string_or_default",
                  &nvbench::state::get_string_or_default,
                  py::arg("name"),
                  py::pos_only{},
                  py::arg("default_value"));

  pystate_cls.def("add_element_count",
                  &nvbench::state::add_element_count,
                  py::arg("count"),
                  py::arg("column_name") = py::str(""));
  pystate_cls.def("set_element_count", &nvbench::state::set_element_count, py::arg("count"));
  pystate_cls.def("get_element_count", &nvbench::state::get_element_count);

  pystate_cls.def("skip", &nvbench::state::skip, py::arg("reason"));
  pystate_cls.def("is_skipped", &nvbench::state::is_skipped);
  pystate_cls.def("get_skip_reason", &nvbench::state::get_skip_reason);

  pystate_cls.def(
    "add_global_memory_reads",
    [](nvbench::state &state, std::size_t nbytes, const std::string &column_name) -> void {
      state.add_global_memory_reads(nbytes, column_name);
    },
    "Add size, in bytes, of global memory reads",
    py::arg("nbytes"),
    py::pos_only{},
    py::arg("column_name") = py::str(""));
  pystate_cls.def(
    "add_global_memory_writes",
    [](nvbench::state &state, std::size_t nbytes, const std::string &column_name) -> void {
      state.add_global_memory_writes(nbytes, column_name);
    },
    "Add size, in bytes, of global memory writes",
    py::arg("nbytes"),
    py::pos_only{},
    py::arg("column_name") = py::str(""));
  pystate_cls.def(
    "get_benchmark",
    [](const nvbench::state &state) { return std::ref(state.get_benchmark()); },
    py::return_value_policy::reference);
  pystate_cls.def("get_throttle_threshold", &nvbench::state::get_throttle_threshold);
  pystate_cls.def("set_throttle_threshold",
                  &nvbench::state::set_throttle_threshold,
                  py::arg("throttle_fraction"));

  pystate_cls.def("get_min_samples", &nvbench::state::get_min_samples);
  pystate_cls.def("set_min_samples",
                  &nvbench::state::set_min_samples,
                  py::arg("min_samples_count"));

  pystate_cls.def("get_disable_blocking_kernel", &nvbench::state::get_disable_blocking_kernel);
  pystate_cls.def("set_disable_blocking_kernel",
                  &nvbench::state::set_disable_blocking_kernel,
                  py::arg("disable_blocking_kernel"));

  pystate_cls.def("get_run_once", &nvbench::state::get_run_once);
  pystate_cls.def("set_run_once", &nvbench::state::set_run_once, py::arg("run_once"));

  pystate_cls.def("get_timeout", &nvbench::state::get_timeout);
  pystate_cls.def("set_timeout", &nvbench::state::set_timeout, py::arg("duration"));

  pystate_cls.def("get_blocking_kernel_timeout", &nvbench::state::get_blocking_kernel_timeout);
  pystate_cls.def("set_blocking_kernel_timeout",
                  &nvbench::state::set_blocking_kernel_timeout,
                  py::arg("duration"));

  pystate_cls.def("collect_cupti_metrics", &nvbench::state::collect_cupti_metrics);
  pystate_cls.def("is_cupti_required", &nvbench::state::is_cupti_required);

  // Measurement driver: wraps the Python launcher callable and forwards it
  // to nvbench::state::exec with the exec_tag selected by (batched, sync).
  pystate_cls.def(
    "exec",
    [](nvbench::state &state, py::object py_launcher_fn, bool batched, bool sync) {
      if (!PyCallable_Check(py_launcher_fn.ptr()))
      {
        throw py::type_error("Argument of exec method must be a callable object");
      }

      // wrapper to invoke Python callable
      auto cpp_launcher_fn = [py_launcher_fn](nvbench::launch &launch_descr) -> void {
        // cast C++ object to python object (reference semantics)
        auto launch_pyarg = py::cast(std::ref(launch_descr), py::return_value_policy::reference);
        // call Python callable
        py_launcher_fn(launch_pyarg);
      };

      // exec_tag must be a compile-time constant, hence the four branches.
      if (sync)
      {
        if (batched)
        {
          constexpr auto tag = nvbench::exec_tag::sync;
          state.exec(tag, cpp_launcher_fn);
        }
        else
        {
          constexpr auto tag = nvbench::exec_tag::sync | nvbench::exec_tag::no_batch;
          state.exec(tag, cpp_launcher_fn);
        }
      }
      else
      {
        if (batched)
        {
          constexpr auto tag = nvbench::exec_tag::none;
          state.exec(tag, cpp_launcher_fn);
        }
        else
        {
          constexpr auto tag = nvbench::exec_tag::no_batch;
          state.exec(tag, cpp_launcher_fn);
        }
      }
    },
    "Executor for given launcher callable fn(state : Launch)",
    py::arg("launcher_fn"),
    py::pos_only{},
    py::arg("batched") = true,
    py::arg("sync") = false);

  pystate_cls.def("get_short_description",
                  [](const nvbench::state &state) { return state.get_short_description(); });

  // add_summary overloads: string / int64 / float64 values. Each records a
  // user column under the "nv/python/<name>" tag.
  pystate_cls.def(
    "add_summary",
    [](nvbench::state &state, std::string column_name, std::string value) {
      auto &summ = state.add_summary("nv/python/" + column_name);
      summ.set_string("description", "User tag: " + column_name);
      summ.set_string("name", std::move(column_name));
      summ.set_string("value", std::move(value));
    },
    py::arg("name"),
    py::arg("value"));
  pystate_cls.def(
    "add_summary",
    [](nvbench::state &state, std::string column_name, std::int64_t value) {
      auto &summ = state.add_summary("nv/python/" + column_name);
      summ.set_string("description", "User tag: " + column_name);
      summ.set_string("name", std::move(column_name));
      summ.set_int64("value", value);
    },
    py::arg("name"),
    py::arg("value"));
  pystate_cls.def(
    "add_summary",
    [](nvbench::state &state, std::string column_name, double value) {
      auto &summ = state.add_summary("nv/python/" + column_name);
      summ.set_string("description", "User tag: " + column_name);
      summ.set_string("name", std::move(column_name));
      summ.set_float64("value", value);
    },
    py::arg("name"),
    py::arg("value"));
  pystate_cls.def("get_axis_values_as_string",
                  [](const nvbench::state &state) { return state.get_axis_values_as_string(); });
  pystate_cls.def("get_axis_values", &py_get_axis_values);
  pystate_cls.def("get_stopping_criterion", &nvbench::state::get_stopping_criterion);

  // Use handle to take a memory leak here, since this object's destructor may
  // be called after interpreter has shut down.
  benchmark_exc =
    py::exception<nvbench_run_error>(m, "NVBenchRuntimeError", PyExc_RuntimeError).release();
  // == STEP 6
  // ATTN: nvbench::benchmark_manager is a singleton.
  // The registry is heap-allocated and deliberately never deleted
  // (py::nodelete), mirroring benchmark_exc above.
  global_registry =
    std::unique_ptr<GlobalBenchmarkRegistry, py::nodelete>(new GlobalBenchmarkRegistry(),
                                                           py::nodelete{});

  m.def(
    "register",
    [&](py::object fn) { return std::ref(global_registry->add_bench(fn)); },
    "Register benchmark function of type Callable[[nvbench.State], None]",
    py::return_value_policy::reference,
    py::arg("benchmark_fn"));

  m.def(
    "run_all_benchmarks",
    [&](py::object argv) -> void {
      if (!py::isinstance<py::list>(argv))
      {
        throw py::type_error("run_all_benchmarks expects a list of command-line arguments");
      }
      std::vector<std::string> args = py::cast<std::vector<std::string>>(argv);
      global_registry->run(args);
    },
    "Run all registered benchmarks",
    py::arg("argv") = py::list());

  // Internal helpers exercising the exception translation machinery.
  m.def("test_cpp_exception", []() { throw nvbench_run_error("Test"); });
  m.def("test_py_exception", []() {
    py::set_error(benchmark_exc, "Test");
    throw py::error_already_set();
  });
}
|
||||
119
python/test/run_1.py
Normal file
119
python/test/run_1.py
Normal file
@@ -0,0 +1,119 @@
|
||||
import sys
|
||||
|
||||
import cuda.bench as bench
|
||||
import numpy as np
|
||||
from numba import cuda
|
||||
|
||||
|
||||
@cuda.jit()
def kernel(a, b, c):
    # Elementwise addition on the device: c[i] = a[i] + b[i],
    # one thread per element; out-of-range threads do nothing.
    tid = cuda.grid(1)
    size = len(a)

    if tid < size:
        c[tid] = a[tid] + b[tid]
|
||||
|
||||
|
||||
def get_numba_stream(launch: bench.Launch):
    """Wrap the launch's CUDA stream as a Numba external-stream object."""
    stream_address = launch.get_stream().addressof()
    return cuda.external_stream(stream_address)
|
||||
|
||||
|
||||
def skipit(state: bench.State) -> None:
    """Benchmark that unconditionally skips itself (exercises State.skip)."""
    reason = "Skipping this benchmark for no reason"
    state.skip(reason)
|
||||
|
||||
|
||||
def add_two(state: bench.State):
    """Benchmark a + a elementwise addition over the "elements" axis."""
    num_elems = state.get_int64("elements")
    src = cuda.to_device(np.random.random(num_elems))
    dst = cuda.device_array_like(src)

    # Sanity-check axis introspection from inside a benchmark body.
    assert "elements" in state.get_axis_values()
    assert "elements=" in state.get_axis_values_as_string()

    state.add_global_memory_reads(src.nbytes)
    state.add_global_memory_writes(dst.nbytes)

    block_dim = 256
    grid_dim = (len(src) + block_dim - 1) // block_dim

    # First call locks, can't use async benchmarks until sync tag is supported
    kernel[grid_dim, block_dim](src, src, dst)
    cuda.synchronize()

    def launcher(launch):
        stream = get_numba_stream(launch)
        kernel[grid_dim, block_dim, stream](src, src, dst)

    state.exec(launcher, batched=True, sync=True)
|
||||
|
||||
|
||||
def add_float(state: bench.State):
    """Benchmark float32 a + b addition over int64/float64/string axes."""
    num_elems = state.get_int64("elements")
    v = state.get_float64("v")
    name = state.get_string("name")
    lhs = cuda.to_device(np.random.random(num_elems).astype(np.float32))
    rhs = cuda.to_device(np.random.random(num_elems).astype(np.float32))
    out = cuda.device_array_like(lhs)

    state.add_global_memory_reads(lhs.nbytes + rhs.nbytes)
    state.add_global_memory_writes(out.nbytes)

    block_dim = 64
    grid_dim = (len(lhs) + block_dim - 1) // block_dim

    # The axis dictionary must echo back every configured axis value.
    axis_values = state.get_axis_values()
    for key in ("elements", "v", "name"):
        assert key in axis_values
    assert axis_values["elements"] == num_elems
    assert axis_values["v"] == v
    assert axis_values["name"] == name

    def launcher(launch):
        # no-op reads keeping the axis values captured in the closure
        _ = v
        _ = name
        stream = get_numba_stream(launch)
        kernel[grid_dim, block_dim, stream](lhs, rhs, out)

    state.exec(launcher, batched=True, sync=True)
|
||||
|
||||
|
||||
def add_three(state: bench.State):
    """Benchmark float32 a + b addition over a power-of-two "elements" axis."""
    num_elems = state.get_int64("elements")
    lhs = cuda.to_device(np.random.random(num_elems).astype(np.float32))
    rhs = cuda.to_device(np.random.random(num_elems).astype(np.float32))
    out = cuda.device_array_like(lhs)

    state.add_global_memory_reads(lhs.nbytes + rhs.nbytes)
    state.add_global_memory_writes(out.nbytes)

    block_dim = 256
    grid_dim = (len(lhs) + block_dim - 1) // block_dim

    def launcher(launch):
        stream = get_numba_stream(launch)
        kernel[grid_dim, block_dim, stream](lhs, rhs, out)

    state.exec(launcher, batched=True, sync=True)
    cuda.synchronize()
|
||||
|
||||
|
||||
def register_benchmarks():
    """Register every benchmark in this module and configure its axes."""
    two_bench = bench.register(add_two)
    two_bench.add_int64_axis("elements", [2**pow2 - 1 for pow2 in range(20, 23)])

    float_bench = bench.register(add_float)
    float_bench.add_float64_axis("v", [0.1, 0.3])
    float_bench.add_string_axis("name", ["Anne", "Lynda"])
    float_bench.add_int64_power_of_two_axis("elements", range(20, 23))

    bench.register(add_three).add_int64_power_of_two_axis("elements", range(20, 22))
    bench.register(skipit)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Register everything, then hand the CLI arguments to nvbench's runner.
    register_benchmarks()
    bench.run_all_benchmarks(sys.argv)
|
||||
236
python/test/stubs.py
Normal file
236
python/test/stubs.py
Normal file
@@ -0,0 +1,236 @@
|
||||
# Copyright 2025 NVIDIA Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 with the LLVM exception
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License.
|
||||
#
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://llvm.org/foundation/relicensing/LICENSE.txt
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import ctypes
|
||||
import sys
|
||||
from typing import Dict, Optional, Tuple
|
||||
|
||||
import cuda.bench as bench
|
||||
import cuda.cccl.headers as headers
|
||||
import cuda.core.experimental as core
|
||||
|
||||
|
||||
def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
    """View a bench.CudaStream as a cuda.core Stream via its raw handle."""
    handle = cs.addressof()
    return core.Stream.from_handle(handle)
|
||||
|
||||
|
||||
def make_sleep_kernel():
    """JITs sleep_kernel(seconds)"""
    # CUDA C++ source: each launched thread spin-waits until `seconds`
    # (converted to nanoseconds) have elapsed on the high-resolution clock.
    src = r"""
    #include <cuda/std/cstdint>
    #include <cuda/std/chrono>

    // Each launched thread just sleeps for `seconds`.
    __global__ void sleep_kernel(double seconds) {
      namespace chrono = ::cuda::std::chrono;
      using hr_clock = chrono::high_resolution_clock;

      auto duration = static_cast<cuda::std::int64_t>(seconds * 1e9);
      const auto ns = chrono::nanoseconds(duration);

      const auto start = hr_clock::now();
      const auto finish = start + ns;

      auto now = hr_clock::now();
      while (now < finish)
      {
        now = hr_clock::now();
      }
    }
    """
    # Compile with libcudacxx headers from cuda.cccl and extract the kernel.
    incl = headers.get_include_paths()
    opts = core.ProgramOptions(include_path=str(incl.libcudacxx))
    prog = core.Program(src, code_type="c++", options=opts)
    mod = prog.compile("cubin", name_expressions=("sleep_kernel",))
    return mod.get_kernel("sleep_kernel")
|
||||
|
||||
|
||||
def no_axes(state: bench.State):
    """Benchmark with no axes: launches a ~1ms device sleep per sample."""
    state.set_min_samples(1000)
    sleep_dur = 1e-3
    sleep_krn = make_sleep_kernel()
    config = core.LaunchConfig(grid=1, block=1, shmem_size=0)

    criterion = state.get_stopping_criterion()
    print(f"Stopping criterion used: {criterion}")

    def launcher(launch: bench.Launch):
        stream = as_core_Stream(launch.get_stream())
        core.launch(stream, config, sleep_krn, sleep_dur)

    state.exec(launcher)
|
||||
|
||||
|
||||
def tags(state: bench.State):
    """Sleep benchmark driven by "Sync" / "Batched" int axes mapped to exec tags."""
    state.set_min_samples(1000)
    sleep_dur = 1e-3
    sleep_krn = make_sleep_kernel()
    config = core.LaunchConfig(grid=1, block=1, shmem_size=0)

    # Axis values are int64; interpret nonzero as True.
    use_sync = bool(state.get_int64("Sync"))
    use_batched = bool(state.get_int64("Batched"))

    def launcher(launch: bench.Launch):
        stream = as_core_Stream(launch.get_stream())
        core.launch(stream, config, sleep_krn, sleep_dur)

    state.exec(launcher, sync=use_sync, batched=use_batched)
|
||||
|
||||
|
||||
def single_float64_axis(state: bench.State):
    """Sleep benchmark whose duration comes from an optional "Duration" axis."""
    # Fall back to a fixed duration when the axis is not configured.
    default_sleep_dur = 3.14e-4
    sleep_dur = state.get_float64_or_default("Duration", default_sleep_dur)
    sleep_krn = make_sleep_kernel()
    config = core.LaunchConfig(grid=1, block=1, shmem_size=0)

    def launcher(launch: bench.Launch):
        stream = as_core_Stream(launch.get_stream())
        core.launch(stream, config, sleep_krn, sleep_dur)

    state.exec(launcher)
|
||||
|
||||
|
||||
def default_value(state: bench.State):
    """Axis-free benchmark; exercises the default-duration fallback of
    :func:`single_float64_axis`."""
    single_float64_axis(state)
|
||||
|
||||
|
||||
def make_copy_kernel(in_type: Optional[str] = None, out_type: Optional[str] = None):
    """Compile a naive element-wise copy kernel and return its handle.

    Both template arguments default to ``::cuda::std::int32_t`` when None.
    """
    src = r"""
#include <cuda/std/cstdint>
#include <cuda/std/cstddef>
/*!
 * Naive copy of `n` values from `in` -> `out`.
 */
template <typename T, typename U>
__global__ void copy_kernel(const T *in, U *out, ::cuda::std::size_t n)
{
  const auto init = blockIdx.x * blockDim.x + threadIdx.x;
  const auto step = blockDim.x * gridDim.x;

  for (auto i = init; i < n; i += step)
  {
    out[i] = static_cast<U>(in[i]);
  }
}
"""
    incl = headers.get_include_paths()
    options = core.ProgramOptions(include_path=str(incl.libcudacxx))
    program = core.Program(src, code_type="c++", options=options)

    # Fall back to 32-bit integers when either template argument is omitted.
    if in_type is None:
        in_type = "::cuda::std::int32_t"
    if out_type is None:
        out_type = "::cuda::std::int32_t"

    instance_name = f"copy_kernel<{in_type}, {out_type}>"
    module = program.compile("cubin", name_expressions=(instance_name,))
    return module.get_kernel(instance_name)
|
||||
|
||||
|
||||
def copy_sweep_grid_shape(state: bench.State):
    """Sweep block size and grid size for a fixed 256 MiB int32 copy."""
    block_size = state.get_int64("BlockSize")
    num_blocks = state.get_int64("NumBlocks")

    # Number of int32 elements in 256MiB
    nbytes = 256 * 1024 * 1024
    num_values = nbytes // ctypes.sizeof(ctypes.c_int32(0))

    # Report throughput figures for the summary tables.
    state.add_element_count(num_values)
    state.add_global_memory_reads(nbytes)
    state.add_global_memory_writes(nbytes)

    device = state.get_device()
    stream = as_core_Stream(state.get_stream())
    input_buf = core.DeviceMemoryResource(device).allocate(nbytes, stream)
    output_buf = core.DeviceMemoryResource(device).allocate(nbytes, stream)

    kernel = make_copy_kernel()
    config = core.LaunchConfig(grid=num_blocks, block=block_size, shmem_size=0)

    def run(launch: bench.Launch):
        exec_stream = as_core_Stream(launch.get_stream())
        core.launch(exec_stream, config, kernel, input_buf, output_buf, num_values)

    state.exec(run)
|
||||
|
||||
|
||||
def copy_type_sweep(state: bench.State):
    """Sweep element types for a fixed-size (256 MiB) device-to-device copy."""
    type_id = state.get_int64("TypeID")

    # Maps the integer axis value to (host ctype, CUDA C++ type name).
    types_map: Dict[int, Tuple[type, str]] = {
        0: (ctypes.c_uint8, "cuda::std::uint8_t"),
        1: (ctypes.c_uint16, "cuda::std::uint16_t"),
        2: (ctypes.c_uint32, "cuda::std::uint32_t"),
        3: (ctypes.c_uint64, "cuda::std::uint64_t"),
        4: (ctypes.c_float, "float"),
        5: (ctypes.c_double, "double"),
    }

    value_ctype, value_cuda_t = types_map[type_id]
    state.add_summary("Type", value_cuda_t)

    # Number of elements in 256MiB
    nbytes = 256 * 1024 * 1024
    num_values = nbytes // ctypes.sizeof(value_ctype)

    state.add_element_count(num_values)
    state.add_global_memory_reads(nbytes)
    state.add_global_memory_writes(nbytes)

    device = state.get_device()
    stream = as_core_Stream(state.get_stream())
    input_buf = core.DeviceMemoryResource(device).allocate(nbytes, stream)
    output_buf = core.DeviceMemoryResource(device).allocate(nbytes, stream)

    kernel = make_copy_kernel(value_cuda_t, value_cuda_t)
    config = core.LaunchConfig(grid=256, block=256, shmem_size=0)

    def run(launch: bench.Launch):
        exec_stream = as_core_Stream(launch.get_stream())
        core.launch(exec_stream, config, kernel, input_buf, output_buf, num_values)

    state.exec(run)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Benchmark without axes
    bench_no_axes = bench.register(no_axes)
    bench_no_axes.set_stopping_criterion("entropy")
    bench_no_axes.set_criterion_param_int64("unused_int", 100)

    # Benchmark whose axes toggle exec() flags.
    bench_tags = bench.register(tags)
    bench_tags.add_int64_axis("Sync", [0, 1])
    bench_tags.add_int64_axis("Batched", [0, 1])

    # benchmark with no axes, that uses default value
    bench_default = bench.register(default_value)
    bench_default.set_min_samples(7)

    # specify axis
    bench_axes = bench.register(single_float64_axis)
    bench_axes.add_float64_axis("Duration", [7e-5, 1e-4, 5e-4])
    bench_axes.set_timeout(20)
    bench_axes.set_skip_time(1e-5)
    bench_axes.set_throttle_threshold(0.2)
    bench_axes.set_throttle_recovery_delay(0.1)

    # Copy benchmark sweeping the launch geometry.
    bench_copy_grid = bench.register(copy_sweep_grid_shape)
    bench_copy_grid.add_int64_power_of_two_axis("BlockSize", range(6, 10, 2))
    bench_copy_grid.add_int64_axis("NumBlocks", [2**x for x in range(6, 10, 2)])

    # Copy benchmark sweeping the element type.
    bench_copy_types = bench.register(copy_type_sweep)
    bench_copy_types.add_int64_axis("TypeID", range(0, 6))

    bench.run_all_benchmarks(sys.argv)
|
||||
39
python/test/test_nvbench.py
Normal file
39
python/test/test_nvbench.py
Normal file
@@ -0,0 +1,39 @@
|
||||
import json
|
||||
|
||||
import cuda.bench as bench
|
||||
import pytest
|
||||
|
||||
|
||||
def test_cpp_exception():
    """A C++ exception thrown inside the extension surfaces as RuntimeError."""
    with pytest.raises(RuntimeError, match="Test") as _excinfo:
        bench._nvbench.test_cpp_exception()
|
||||
|
||||
|
||||
def test_py_exception():
    """The extension's own error type maps to bench.NVBenchRuntimeError."""
    expected = bench.NVBenchRuntimeError
    with pytest.raises(expected, match="Test"):
        bench._nvbench.test_py_exception()
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "cls", [bench.CudaStream, bench.State, bench.Launch, bench.Benchmark]
)
def test_api_ctor(cls):
    """Opaque API classes are not constructible directly from Python."""
    with pytest.raises(TypeError, match="No constructor defined!"):
        cls()
|
||||
|
||||
|
||||
def t_bench(state: bench.State):
    """CPU-only benchmark body: repeatedly JSON-encodes a small dict."""
    payload = {"a": 1, "b": 0.5, "c": "test", "d": {"a": 1}}

    def launcher(launch: bench.Launch):
        # Pure-CPU busy work; no kernel launches.
        for _ in range(10000):
            _ = json.dumps(payload)

    state.exec(launcher)
|
||||
|
||||
|
||||
def test_cpu_only():
    """A benchmark flagged CPU-only runs to completion under the harness."""
    registered = bench.register(t_bench)
    registered.set_is_cpu_only(True)

    bench.run_all_benchmarks(["-q", "--profile"])
|
||||
Reference in New Issue
Block a user