From d5d188eed401055dba04e2d0289c7bab70be35c6 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Fri, 13 Mar 2026 10:58:54 -0500 Subject: [PATCH] Changed title, fixed references, added intro borrowed from README --- docs/sphinx-combined/cli_overview.rst | 3 ++ docs/sphinx-combined/conf.py | 2 +- docs/sphinx-combined/cpp_benchmarks.md | 3 ++ docs/sphinx-combined/index.rst | 44 ++++++++++++++++++++++++-- docs/sphinx-combined/py_benchmarks.md | 10 ++++-- docs/sphinx-combined/python_api.rst | 13 ++++++-- 6 files changed, 66 insertions(+), 9 deletions(-) diff --git a/docs/sphinx-combined/cli_overview.rst b/docs/sphinx-combined/cli_overview.rst index 6d01ed2..6d94eb0 100644 --- a/docs/sphinx-combined/cli_overview.rst +++ b/docs/sphinx-combined/cli_overview.rst @@ -4,9 +4,12 @@ CLI Options Every benchmark created with NVBench supports command-line interface, with a variety of options. +.. _cli-overview: + .. include:: ../cli_help.md :parser: myst_parser.sphinx_ +.. _cli-overview-axes: .. include:: ../cli_help_axis.md :parser: myst_parser.sphinx_ diff --git a/docs/sphinx-combined/conf.py b/docs/sphinx-combined/conf.py index 26cc004..5f64edb 100644 --- a/docs/sphinx-combined/conf.py +++ b/docs/sphinx-combined/conf.py @@ -1,6 +1,6 @@ import os -project = "NVBench API" +project = "NVBench: CUDA Kernel Benchmarking Library" author = "NVIDIA Corporation" extensions = [ diff --git a/docs/sphinx-combined/cpp_benchmarks.md b/docs/sphinx-combined/cpp_benchmarks.md index 46c6125..111a149 100644 --- a/docs/sphinx-combined/cpp_benchmarks.md +++ b/docs/sphinx-combined/cpp_benchmarks.md @@ -74,6 +74,7 @@ NVBENCH_BENCH(my_benchmark); A full example can be found in [examples/stream.cu][CppExample_Stream]. 
+(parameter-axes)= ## Parameter Axes Some kernels will be used with a variety of options, input data types/sizes, and @@ -166,6 +167,7 @@ NVBENCH_BENCH(benchmark).add_string_axis("RNG Distribution", {"Uniform", "Gaussi A common use for string axes is to encode enum values, as shown in [examples/enums.cu][CppExample_Enums]. +(type-axes)= ### Type Axes Another common situation involves benchmarking a templated kernel with multiple @@ -244,6 +246,7 @@ times. Keep the rapid growth of these combinations in mind when choosing the number of values in an axis. See the section about combinatorial explosion for more examples and information. +(throughput-measurements)= ## Throughput Measurements In additional to raw timing information, NVBench can track a kernel's diff --git a/docs/sphinx-combined/index.rst b/docs/sphinx-combined/index.rst index e48acbc..8c66e6d 100644 --- a/docs/sphinx-combined/index.rst +++ b/docs/sphinx-combined/index.rst @@ -1,7 +1,45 @@ -NVBench: CUDA Kernel Benchmarking Library -========================================= +CUDA Kernel Benchmarking Library +================================ -The library presently supports kernel benchmarking in C++ and in Python. +The library, NVBench, presently supports writing benchmarks in C++ and in Python. +It is designed to simplify CUDA kernel benchmarking. It features: + +* :ref:`Parameter sweeps <parameter-axes>`: a powerful and + flexible "axis" system explores a kernel's configuration space. Parameters may + be dynamic numbers/strings or :ref:`static types <type-axes>`. +* :ref:`Runtime customization <cli-overview>`: A rich command-line interface + allows :ref:`redefinition of parameter axes <cli-overview-axes>`, CUDA device + selection, locking GPU clocks (Volta+), changing output formats, and more. +* :ref:`Throughput calculations <throughput-measurements>`: Compute + and report: + + * Item throughput (elements/second) + * Global memory bandwidth usage (bytes/second and per-device %-of-peak-bw) + +* Multiple output formats: Currently supports markdown (default) and CSV output. 
+* :ref:`Manual timer mode <manual-timer-mode>`: + (optional) Explicitly start/stop timing in a benchmark implementation. +* Multiple measurement types: + + * Cold Measurements: + + * Each sample runs the benchmark once with a clean device L2 cache. + * GPU and CPU times are reported. + + * Batch Measurements: + + * Executes the benchmark multiple times back-to-back and records total time. + * Reports the average execution time (total time / number of executions). + + * :ref:`CPU-only Measurements <cpu-only-measurements>`: + + * Measures the host-side execution time of a non-GPU benchmark. + * Not suitable for microbenchmarking. + +Check out `GPU Mode talk #56 `_ for an overview +of the challenges inherent to CUDA kernel benchmarking and how NVBench solves them for you! + +------- .. toctree:: :maxdepth: 2 diff --git a/docs/sphinx-combined/py_benchmarks.md b/docs/sphinx-combined/py_benchmarks.md index 788d4e7..3eb086d 100644 --- a/docs/sphinx-combined/py_benchmarks.md +++ b/docs/sphinx-combined/py_benchmarks.md @@ -21,14 +21,18 @@ def benchmark_impl(state: State) -> None: data = generate(n, state.get_stream()) # body that is being timed. Must execute - # on the stream handed over by NVBench - launchable_fn : Callable[[Launch], None] = 
+ # Typically launches a kernel of interest + launch_fn : Callable[[Launch], None] = lambda launch: impl(data, launch.get_stream()) - state.exec(launchable_fn) + state.exec(launch_fn) bench = register(benchmark_impl) +# give the kernel a name +bench.set_name("my_package_kernel") +# specify the default parameter values to run the benchmark with +bench.add_int64_axis("Elements", [1000, 10000, 100000]) diff --git a/docs/sphinx-combined/python_api.rst b/docs/sphinx-combined/python_api.rst index 22e24ef..1eb618f 100644 --- a/docs/sphinx-combined/python_api.rst +++ b/docs/sphinx-combined/python_api.rst @@ -1,5 +1,14 @@ -cuda.bench Python API Reference -=============================== +`cuda.bench` Python API Reference +================================= + +The Python package ``cuda.bench`` is designed to empower +users to write CUDA kernel benchmarks in Python. + +Alignment with behavior of benchmarks written in C++ +allows for meaningful comparison between them. + +Classes and functions +--------------------- .. automodule:: cuda.bench :members: