mirror of
https://github.com/NVIDIA/nvbench.git
synced 2026-04-20 06:48:53 +00:00
Add scaffolding to build C++/Python docs
Add sphinx-combined folder that builds combined C++ & Python docs Fixed relative text alignment in docstrings to fix autodoc warnings Renamed cuda.bench.test_cpp_exception and cuda.bench.test_py_exception functions to start with underscore, signaling that these functions are internal and should not be documented Account for test_cpp_exceptions -> _test_cpp_exception, same for *_py_* Fix cpp_benchmarks, add py_benchmarks 1. Fixed xrefs in docs/sphinx-combined/cpp_benchmarks.md, which is built on top of docs/benchmarks.md Added level-1 heading, and pushed existing headings one level down. 2. Added py_benchmarks.md to document benchmarking of Python scripts. 3. Rearranged entries in index.rst so that overview documents come before API enumeration. Make sure to reset __module__ of reexported symbols to be cuda.bench Enumerate free functions in nvbench:: namespace Tweak to index.rst intro sentence and title Changed title, fixed references, added intro borrowed from README Fix punctuation in one of the itemlist item text Hide TOC from the index page. It is too long and confusing
This commit is contained in:
2
docs/.gitignore
vendored
Normal file
2
docs/.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
sphinx-combined/_build
|
||||
sphinx-combined/_doxygen
|
||||
16
docs/build_combined_docs.sh
Executable file
16
docs/build_combined_docs.sh
Executable file
@@ -0,0 +1,16 @@
|
||||
#!/usr/bin/env bash
# Build the combined C++ & Python documentation: run Doxygen to generate
# XML for Breathe, then build the HTML pages with Sphinx.
set -euo pipefail

# Resolve paths relative to this script so it works from any CWD.
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
build_dir="${script_dir}/sphinx-combined/_build"
doxygen_dir="${script_dir}/sphinx-combined/_doxygen"

# Make sure both output directories exist before either tool runs.
mkdir -p "${build_dir}" "${doxygen_dir}"

echo "Running Doxygen for combined C++ API..."
# Run in a subshell so the working-directory change does not leak.
(cd "${script_dir}/sphinx-combined" && doxygen Doxyfile)

echo "Building combined Sphinx docs..."
# -E forces a full re-read of all source files (no stale doctrees).
sphinx-build -E -b html "${script_dir}/sphinx-combined" "${build_dir}"

echo "Combined docs available at ${build_dir}/index.html"
|
||||
@@ -69,8 +69,7 @@
|
||||
|
||||
* `--axis <axis specification>`, `-a <axis specification>`
|
||||
* Override an axis specification.
|
||||
* See `--help-axis`
|
||||
for [details on axis specifications](./cli_help_axis.md).
|
||||
* See `--help-axis` for details on axis specifications.
|
||||
* Applies to the most recent `--benchmark`, or all benchmarks if specified
|
||||
before any `--benchmark` arguments.
|
||||
|
||||
|
||||
45
docs/sphinx-combined/Doxyfile
Normal file
45
docs/sphinx-combined/Doxyfile
Normal file
@@ -0,0 +1,45 @@
|
||||
PROJECT_NAME = "NVBench"
|
||||
PROJECT_BRIEF = "C++ NVBench Library"
|
||||
OUTPUT_DIRECTORY = _doxygen
|
||||
GENERATE_XML = YES
|
||||
GENERATE_HTML = NO
|
||||
GENERATE_LATEX = NO
|
||||
QUIET = YES
|
||||
WARN_IF_UNDOCUMENTED = NO
|
||||
WARN_IF_DOC_ERROR = YES
|
||||
WARN_LOGFILE = _doxygen/warnings.log
|
||||
INPUT = ../../nvbench
|
||||
EXCLUDE = ../../nvbench/cupti_profiler.cxx
|
||||
EXCLUDE_SYMBOLS = type_strings \
|
||||
nvbench::detail \
|
||||
nvbench::internal \
|
||||
nvbench::tl \
|
||||
UNUSED \
|
||||
M_PI \
|
||||
NVBENCH_UNIQUE_IDENTIFIER_IMPL1 \
|
||||
NVBENCH_UNIQUE_IDENTIFIER_IMPL2 \
|
||||
main \
|
||||
NVBENCH_STATE_EXEC_GUARD \
|
||||
wrapped_type
|
||||
FILE_PATTERNS = *.cuh *.cxx *.cu *.h *.hpp
|
||||
EXTENSION_MAPPING = cuh=C++ cu=C++
|
||||
RECURSIVE = YES
|
||||
EXTRACT_ALL = YES
|
||||
EXTRACT_PRIVATE = YES
|
||||
EXTRACT_STATIC = YES
|
||||
JAVADOC_AUTOBRIEF = YES
|
||||
MULTILINE_CPP_IS_BRIEF = YES
|
||||
STRIP_FROM_PATH = ../../
|
||||
ENABLE_PREPROCESSING = YES
|
||||
MACRO_EXPANSION = YES
|
||||
EXPAND_ONLY_PREDEF = NO
|
||||
GENERATE_TAGFILE =
|
||||
XML_PROGRAMLISTING = NO
|
||||
PREDEFINED = __device__= \
|
||||
__host__= \
|
||||
__global__= \
|
||||
__forceinline__= \
|
||||
__shared__= \
|
||||
__align__(x)= \
|
||||
__launch_bounds__(x)= \
|
||||
NVBENCH_HAS_CUDA=1
|
||||
BIN
docs/sphinx-combined/_static/nvidia-logo.png
Normal file
BIN
docs/sphinx-combined/_static/nvidia-logo.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 126 KiB |
15
docs/sphinx-combined/cli_overview.rst
Normal file
15
docs/sphinx-combined/cli_overview.rst
Normal file
@@ -0,0 +1,15 @@
|
||||
CLI Options
|
||||
===========
|
||||
|
||||
Every benchmark created with NVBench supports a command-line interface,
|
||||
with a variety of options.
|
||||
|
||||
.. _cli-overview:
|
||||
|
||||
.. include:: ../cli_help.md
|
||||
:parser: myst_parser.sphinx_
|
||||
|
||||
.. _cli-overview-axes:
|
||||
|
||||
.. include:: ../cli_help_axis.md
|
||||
:parser: myst_parser.sphinx_
|
||||
104
docs/sphinx-combined/conf.py
Normal file
104
docs/sphinx-combined/conf.py
Normal file
@@ -0,0 +1,104 @@
|
||||
import os
|
||||
|
||||
project = "NVBench: CUDA Kernel Benchmarking Library"
|
||||
author = "NVIDIA Corporation"
|
||||
|
||||
extensions = [
|
||||
"breathe",
|
||||
"sphinx.ext.autodoc",
|
||||
"sphinx.ext.napoleon",
|
||||
"sphinx.ext.autosummary",
|
||||
"myst_parser",
|
||||
]
|
||||
|
||||
templates_path = ["_templates"]
|
||||
exclude_patterns = ["_build", "_doxygen"]
|
||||
|
||||
autosummary_generate = True
|
||||
autodoc_default_options = {"members": True, "undoc-members": True}
|
||||
|
||||
release = "0.2.0"
|
||||
|
||||
_here = os.path.abspath(os.path.dirname(__file__))
|
||||
_doxygen_xml = os.path.join(_here, "_doxygen", "xml")
|
||||
|
||||
breathe_projects = {"nvbench": _doxygen_xml}
|
||||
breathe_default_project = "nvbench"
|
||||
breathe_domain_by_extension = {"cuh": "cpp", "cxx": "cpp", "cu": "cpp"}
|
||||
|
||||
|
||||
def _patch_breathe_namespace_declarations() -> None:
|
||||
try:
|
||||
import breathe.renderer.sphinxrenderer as sphinxrenderer
|
||||
from docutils import nodes
|
||||
from sphinx import addnodes
|
||||
except Exception:
|
||||
return
|
||||
|
||||
original = sphinxrenderer.SphinxRenderer.handle_declaration
|
||||
|
||||
def handle_declaration(self, nodeDef, declaration, *args, **kwargs):
|
||||
is_namespace = getattr(nodeDef, "kind", None) == "namespace"
|
||||
if not is_namespace:
|
||||
return original(self, nodeDef, declaration, *args, **kwargs)
|
||||
|
||||
name = (declaration or "").strip()
|
||||
if name.startswith("namespace "):
|
||||
name = name[len("namespace ") :].strip()
|
||||
if not name:
|
||||
name = "<anonymous>"
|
||||
|
||||
keyword = addnodes.desc_sig_keyword("namespace", "namespace")
|
||||
sig_name = addnodes.desc_sig_name(name, name)
|
||||
return [keyword, nodes.Text(" "), sig_name]
|
||||
|
||||
sphinxrenderer.SphinxRenderer.handle_declaration = handle_declaration
|
||||
|
||||
|
||||
def setup(app):
    """Sphinx extension entry point: apply the Breathe namespace patch."""
    _patch_breathe_namespace_declarations()
|
||||
|
||||
|
||||
######################################################
|
||||
|
||||
# -- Options for HTML output -------------------------------------------------
|
||||
|
||||
html_theme = "nvidia_sphinx_theme"
|
||||
|
||||
html_logo = "_static/nvidia-logo.png"
|
||||
|
||||
html_baseurl = (
|
||||
os.environ.get("NVBENCH_DOCS_BASE_URL", "https://nvidia.github.io/nvbench/").rstrip(
|
||||
"/"
|
||||
)
|
||||
+ "/"
|
||||
)
|
||||
|
||||
html_theme_options = {
|
||||
"icon_links": [
|
||||
{
|
||||
"name": "GitHub",
|
||||
"url": "https://github.com/NVIDIA/nvbench",
|
||||
"icon": "fa-brands fa-github",
|
||||
"type": "fontawesome",
|
||||
}
|
||||
],
|
||||
"navigation_depth": 4,
|
||||
"show_toc_level": 2,
|
||||
"navbar_start": ["navbar-logo"],
|
||||
"navbar_end": ["theme-switcher", "navbar-icon-links"],
|
||||
"footer_start": ["copyright"],
|
||||
"footer_end": ["sphinx-version"],
|
||||
"sidebar_includehidden": True,
|
||||
"collapse_navigation": False,
|
||||
# "switcher": {
|
||||
# "json_url": f"{html_baseurl}nv-versions.json",
|
||||
# "version_match": release,
|
||||
# },
|
||||
}
|
||||
|
||||
html_static_path = ["_static"] if os.path.exists("_static") else []
|
||||
|
||||
# Images directory
|
||||
if os.path.exists("img"):
|
||||
html_static_path.append("img")
|
||||
40
docs/sphinx-combined/cpp_api.rst
Normal file
40
docs/sphinx-combined/cpp_api.rst
Normal file
@@ -0,0 +1,40 @@
|
||||
NVBench C++ API Reference
|
||||
=========================
|
||||
|
||||
Index
|
||||
-----
|
||||
|
||||
.. doxygenindex::
|
||||
:project: nvbench
|
||||
|
||||
|
||||
Free Functions
|
||||
--------------
|
||||
|
||||
.. doxygenfunction:: nvbench::make_cuda_stream_view
|
||||
:project: nvbench
|
||||
|
||||
.. doxygenfunction:: nvbench::axis_type_to_string
|
||||
:project: nvbench
|
||||
|
||||
.. doxygenfunction:: nvbench::add_devices_section
|
||||
:project: nvbench
|
||||
|
||||
.. doxygenfunction:: nvbench::range
|
||||
:project: nvbench
|
||||
|
||||
.. doxygenfunction:: nvbench::sleep_kernel
|
||||
:project: nvbench
|
||||
|
||||
.. doxygenfunction:: nvbench::copy_kernel
|
||||
:project: nvbench
|
||||
|
||||
.. doxygenfunction:: nvbench::mod2_kernel
|
||||
:project: nvbench
|
||||
|
||||
.. doxygenfunction:: nvbench::demangle(const std::string &str)
|
||||
:project: nvbench
|
||||
|
||||
.. cpp:function:: template <typename T> std::string nvbench::demangle()
|
||||
|
||||
Returns demangled type name.
|
||||
533
docs/sphinx-combined/cpp_benchmarks.md
Normal file
533
docs/sphinx-combined/cpp_benchmarks.md
Normal file
@@ -0,0 +1,533 @@
|
||||
# NVBench: benchmarking in C++
|
||||
|
||||
(minimal-benchmark)=
|
||||
## Minimal Benchmark
|
||||
|
||||
A basic kernel benchmark can be created with just a few lines of CUDA C++:
|
||||
|
||||
```cpp
|
||||
void my_benchmark(nvbench::state& state) {
|
||||
state.exec([](nvbench::launch& launch) {
|
||||
my_kernel<<<num_blocks, 256, 0, launch.get_stream()>>>();
|
||||
});
|
||||
}
|
||||
NVBENCH_BENCH(my_benchmark);
|
||||
```
|
||||
|
||||
There are three main components in the definition of a benchmark:
|
||||
|
||||
- A `KernelGenerator` callable (`my_benchmark` above)
|
||||
- A `KernelLauncher` callable (the lambda passed to `nvbench::exec`), and
|
||||
- A `BenchmarkDeclaration` using `NVBENCH_BENCH` or similar macros.
|
||||
|
||||
The `KernelGenerator` is called with an `nvbench::state` object that provides
|
||||
configuration information, as shown in later sections. The generator is
|
||||
responsible for configuring and instantiating a `KernelLauncher`, which is
|
||||
(unsurprisingly) responsible for launching a kernel. The launcher should contain
|
||||
only the minimum amount of code necessary to start the CUDA kernel,
|
||||
since `nvbench::exec` will execute it repeatedly to gather timing information.
|
||||
An `nvbench::launch` object is provided to the launcher to specify kernel
|
||||
execution details, such as the CUDA stream to use. `NVBENCH_BENCH` registers
|
||||
the benchmark with NVBench and initializes various attributes, including its
|
||||
name and parameter axes.
|
||||
|
||||
## Benchmark Name
|
||||
|
||||
By default, a benchmark is named by converting the first argument
|
||||
of `NVBENCH_BENCH` into a string.
|
||||
|
||||
This can be changed to something more descriptive if desired.
|
||||
The `NVBENCH_BENCH` macro produces a customization object that allows such
|
||||
attributes to be modified.
|
||||
|
||||
```cpp
|
||||
NVBENCH_BENCH(my_benchmark).set_name("my_kernel<<<num_blocks, 256>>>");
|
||||
```
|
||||
|
||||
## CUDA Streams
|
||||
|
||||
NVBench records GPU execution times on a specific CUDA stream. By default, a new
|
||||
stream is created and passed to the `KernelLauncher` via the
|
||||
`nvbench::launch::get_stream()` method, as shown in
|
||||
[Minimal Benchmark](#minimal-benchmark). All benchmarked kernels and other
|
||||
stream-ordered work must be launched on this stream for NVBench to capture it.
|
||||
|
||||
In some instances, it may be inconvenient or impossible to specify an explicit
|
||||
CUDA stream for the benchmarked operation to use. For example, a library may
|
||||
manage and use its own streams, or an opaque API may always launch work on the
|
||||
default stream. In these situations, users may provide NVBench with an explicit
|
||||
stream via `nvbench::state::set_cuda_stream` and `nvbench::make_cuda_stream_view`.
|
||||
It is assumed that all work of interest executes on or synchronizes with this
|
||||
stream.
|
||||
|
||||
```cpp
|
||||
void my_benchmark(nvbench::state& state) {
|
||||
cudaStream_t default_stream = 0;
|
||||
state.set_cuda_stream(nvbench::make_cuda_stream_view(default_stream));
|
||||
state.exec([](nvbench::launch&) {
|
||||
my_func(); // a host API invoking GPU kernels on the default stream
|
||||
my_kernel<<<num_blocks, 256>>>(); // or a kernel launched with the default stream
|
||||
});
|
||||
}
|
||||
NVBENCH_BENCH(my_benchmark);
|
||||
```
|
||||
|
||||
A full example can be found in [examples/stream.cu][CppExample_Stream].
|
||||
|
||||
(parameter-axes)=
|
||||
## Parameter Axes
|
||||
|
||||
Some kernels will be used with a variety of options, input data types/sizes, and
|
||||
other factors that impact performance. NVBench explores these different
|
||||
scenarios by sweeping through a set of user-defined parameter axes.
|
||||
|
||||
A parameter axis defines a set of interesting values for a single kernel
|
||||
parameter — for example, the size of the input, or the type of values being
|
||||
processed. These parameter axes are used to customize a `KernelGenerator` with
|
||||
static and runtime configurations. There are four supported types of parameters:
|
||||
int64, float64, string, and type.
|
||||
|
||||
More examples can be found in [examples/axes.cu][CppExample_Axes].
|
||||
|
||||
### Int64 Axes
|
||||
|
||||
A common example of a parameter axis is to vary the number of input values a
|
||||
kernel should process during a benchmark measurement. An `int64_axis` is ideal
|
||||
for this:
|
||||
|
||||
```cpp
|
||||
void benchmark(nvbench::state& state)
|
||||
{
|
||||
const auto num_inputs = state.get_int64("NumInputs");
|
||||
thrust::device_vector<int> data = generate_input(num_inputs);
|
||||
|
||||
state.exec([&data](nvbench::launch& launch) {
|
||||
my_kernel<<<blocks, threads, 0, launch.get_stream()>>>(data.begin(), data.end());
|
||||
});
|
||||
}
|
||||
NVBENCH_BENCH(benchmark).add_int64_axis("NumInputs", {16, 64, 256, 1024, 4096});
|
||||
```
|
||||
|
||||
NVBench will run the `benchmark` kernel generator once for each specified value
|
||||
in the "NumInputs" axis. The `state` object provides the current parameter value
|
||||
to `benchmark`.
|
||||
|
||||
### Int64 Power-Of-Two Axes
|
||||
|
||||
Using powers-of-two is quite common for these sorts of axes. `int64_axis` has a
|
||||
unique power-of-two mode that simplifies how such axes are defined and helps
|
||||
provide more readable output. A power-of-two int64 axis is defined using the
|
||||
integer exponents, but the benchmark will be run with the computed 2^N value.
|
||||
|
||||
```cpp
|
||||
// Equivalent to above, {16, 64, 256, 1024, 4096} = {2^4, 2^6, 2^8, 2^10, 2^12}
|
||||
NVBENCH_BENCH(benchmark).add_int64_power_of_two_axis("NumInputs",
|
||||
{4, 6, 8, 10, 12});
|
||||
// Or, as shown in a later section:
|
||||
NVBENCH_BENCH(benchmark).add_int64_power_of_two_axis("NumInputs",
|
||||
nvbench::range(4, 12, 2));
|
||||
```
|
||||
|
||||
### Float64 Axes
|
||||
|
||||
For floating point numbers, a `float64_axis` is available:
|
||||
|
||||
```cpp
|
||||
void benchmark(nvbench::state& state)
|
||||
{
|
||||
const auto quality = state.get_float64("Quality");
|
||||
|
||||
state.exec([&quality](nvbench::launch& launch)
|
||||
{
|
||||
my_kernel<<<blocks, threads, 0, launch.get_stream()>>>(quality);
|
||||
});
|
||||
}
|
||||
NVBENCH_BENCH(benchmark).add_float64_axis("Quality", {0.05, 0.1, 0.25, 0.5, 0.75, 1.});
|
||||
```
|
||||
|
||||
### String Axes
|
||||
|
||||
For non-numeric data, an axis of arbitrary strings provides additional
|
||||
flexibility:
|
||||
|
||||
```cpp
|
||||
void benchmark(nvbench::state& state)
|
||||
{
|
||||
const auto rng_dist = state.get_string("RNG Distribution");
|
||||
thrust::device_vector<int> data = generate_input(rng_dist);
|
||||
|
||||
state.exec([&data](nvbench::launch& launch)
|
||||
{
|
||||
my_kernel<<<blocks, threads, 0, launch.get_stream()>>>(data.begin(), data.end());
|
||||
});
|
||||
}
|
||||
NVBENCH_BENCH(benchmark).add_string_axis("RNG Distribution", {"Uniform", "Gaussian"});
|
||||
```
|
||||
|
||||
A common use for string axes is to encode enum values, as shown in
|
||||
[examples/enums.cu][CppExample_Enums].
|
||||
|
||||
(type-axes)=
|
||||
### Type Axes
|
||||
|
||||
Another common situation involves benchmarking a templated kernel with multiple
|
||||
compile-time configurations. NVBench strives to make such benchmarks as easy to
|
||||
write as possible through the use of type axes.
|
||||
|
||||
A `type_axis` is a list of types (`T1`, `T2`, `Ts`...) wrapped in
|
||||
a `nvbench::type_list<T1, T2, Ts...>`. The kernel generator becomes a template
|
||||
function and will be instantiated using types defined by the axis. The current
|
||||
configuration's type is passed into the kernel generator using
|
||||
a `nvbench::type_list<T>`.
|
||||
|
||||
```cpp
|
||||
template <typename T>
|
||||
void my_benchmark(nvbench::state& state, nvbench::type_list<T>)
|
||||
{
|
||||
thrust::device_vector<T> data = generate_input<T>();
|
||||
|
||||
state.exec([&data](nvbench::launch& launch)
|
||||
{
|
||||
my_kernel<<<blocks, threads, 0, launch.get_stream()>>>(data.begin(), data.end());
|
||||
});
|
||||
}
|
||||
using my_types = nvbench::type_list<int, float, double>;
|
||||
NVBENCH_BENCH_TYPES(my_benchmark, NVBENCH_TYPE_AXES(my_types))
|
||||
.set_type_axes_names({"ValueType"});
|
||||
```
|
||||
|
||||
The `NVBENCH_TYPE_AXES` macro is unfortunately necessary to prevent commas in
|
||||
the `type_list<...>` from breaking macro parsing.
|
||||
|
||||
Type axes can be used to encode compile-time enum and integral constants using
|
||||
the `nvbench::enum_type_list` helper. See
|
||||
[examples/enums.cu][CppExample_Enums] for detail.
|
||||
|
||||
### Parameter sweeping
|
||||
|
||||
Since parameter sweeps often explore a range of evenly-spaced numeric values, a
|
||||
strided range can be generated using the `nvbench::range(start, end, stride=1)`
|
||||
helper.
|
||||
|
||||
```cpp
|
||||
assert(nvbench::range(2, 5) == {2, 3, 4, 5});
|
||||
assert(nvbench::range(2.0, 5.0) == {2.0, 3.0, 4.0, 5.0});
|
||||
assert(nvbench::range(2, 12, 2) == {2, 4, 6, 8, 10, 12});
|
||||
assert(nvbench::range(2, 12, 5) == {2, 7, 12});
|
||||
assert(nvbench::range(2, 12, 6) == {2, 8});
|
||||
assert(nvbench::range(0.0, 10.0, 2.5) == { 0.0, 2.5, 5.0, 7.5, 10.0});
|
||||
```
|
||||
|
||||
Note that start and end are inclusive. This utility can be used to define axis
|
||||
values for all numeric axes.
|
||||
|
||||
### Multiple Parameter Axes
|
||||
|
||||
If more than one axis is defined, the complete cartesian product of all axes
|
||||
will be benchmarked. For example, consider a benchmark with two type axes, one
|
||||
int64 axis, and one float64 axis:
|
||||
|
||||
```cpp
|
||||
// InputTypes: {char, int, unsigned int}
|
||||
// OutputTypes: {float, double}
|
||||
// NumInputs: {2^10, 2^20, 2^30}
|
||||
// Quality: {0.5, 1.0}
|
||||
|
||||
using input_types = nvbench::type_list<char, int, unsigned int>;
|
||||
using output_types = nvbench::type_list<float, double>;
|
||||
NVBENCH_BENCH_TYPES(benchmark, NVBENCH_TYPE_AXES(input_types, output_types))
|
||||
.set_type_axes_names({"InputType", "OutputType"})
|
||||
.add_int64_power_of_two_axis("NumInputs", nvbench::range(10, 30, 10))
|
||||
.add_float64_axis("Quality", {0.5, 1.0});
|
||||
```
|
||||
|
||||
This would generate a total of 36 configurations and instantiate the benchmark 6
|
||||
times. Keep the rapid growth of these combinations in mind when choosing the
|
||||
number of values in an axis. See the section about combinatorial explosion for
|
||||
more examples and information.
|
||||
|
||||
(throughput-measurements)=
|
||||
## Throughput Measurements
|
||||
|
||||
In addition to raw timing information, NVBench can track a kernel's
|
||||
throughput, reporting the amount of data processed as:
|
||||
|
||||
- Number of items per second
|
||||
- Number of bytes per second
|
||||
- Percentage of device's peak memory bandwidth utilized
|
||||
|
||||
To enable throughput measurements, the kernel generator can specify the number
|
||||
of items and/or bytes handled in a single kernel execution using
|
||||
the `nvbench::state` API.
|
||||
|
||||
```cpp
|
||||
state.add_element_count(size);
|
||||
state.add_global_memory_reads<InputType>(size);
|
||||
state.add_global_memory_writes<OutputType>(size);
|
||||
```
|
||||
|
||||
In general:
|
||||
- Add only the input element count (no outputs).
|
||||
- Add all reads and writes to global memory.
|
||||
|
||||
More examples can be found in [examples/throughput.cu][CppExample_Throughput].
|
||||
|
||||
(skip-uninteresting-or-invalid-benchmarks)=
|
||||
## Skip Uninteresting / Invalid Benchmarks
|
||||
|
||||
Sometimes particular combinations of parameters aren't useful or interesting —
|
||||
or for type axes, some configurations may not even compile.
|
||||
|
||||
The `nvbench::state` object provides a `skip("Reason")` method that can be used
|
||||
to avoid running these benchmarks. To skip uncompilable type axis
|
||||
configurations, create an overload for the kernel generator that selects for the
|
||||
invalid type combination:
|
||||
|
||||
```cpp
|
||||
template <typename T, typename U>
|
||||
void my_benchmark(nvbench::state& state, nvbench::type_list<T, U>)
|
||||
{
|
||||
// Skip benchmarks at runtime:
|
||||
if (should_skip_this_config)
|
||||
{
|
||||
state.skip("Reason for skip.");
|
||||
return;
|
||||
}
|
||||
|
||||
/* ... */
|
||||
}
|
||||
|
||||
// Skip benchmarks at compile time -- for example, always skip when T == U
|
||||
// (Note that the `type_list` argument defines the same type twice).
|
||||
template <typename SameType>
|
||||
void my_benchmark(nvbench::state& state,
|
||||
nvbench::type_list<SameType, SameType>)
|
||||
{
|
||||
state.skip("T must not be the same type as U.");
|
||||
}
|
||||
using Ts = nvbench::type_list<...>;
|
||||
using Us = nvbench::type_list<...>;
|
||||
NVBENCH_BENCH_TYPES(my_benchmark, NVBENCH_TYPE_AXES(Ts, Us));
|
||||
```
|
||||
|
||||
More examples can be found in [examples/skip.cu][CppExample_Skip].
|
||||
|
||||
## Execution Tags For Special Cases
|
||||
|
||||
By default, NVBench assumes that the entire execution time of the
|
||||
`KernelLauncher` should be measured, and that no syncs are performed
|
||||
(e.g. `cudaDeviceSynchronize`, `cudaStreamSynchronize`, `cudaEventSynchronize`,
|
||||
etc. are not called).
|
||||
|
||||
Execution tags may be passed to `state.exec` when these assumptions are not
|
||||
true:
|
||||
|
||||
- `nvbench::exec_tag::sync` tells NVBench that the kernel launcher will
|
||||
synchronize internally.
|
||||
- `nvbench::exec_tag::timer` requests a timer object that can be used to
|
||||
restrict the timed region.
|
||||
- `nvbench::exec_tag::no_batch` disables batch measurements. This both disables
|
||||
them during execution to reduce runtime, and prevents their compilation to
|
||||
reduce compile-time and binary size.
|
||||
- `nvbench::exec_tag::gpu` is an optional hint that prevents non-GPU benchmarking
|
||||
code from being compiled for a particular benchmark. A runtime error is emitted
|
||||
if the benchmark is defined with `set_is_cpu_only(true)`.
|
||||
- `nvbench::exec_tag::no_gpu` is an optional hint that prevents GPU benchmarking
|
||||
code from being compiled for a particular benchmark. A runtime error is emitted
|
||||
if the benchmark does not also define `set_is_cpu_only(true)`.
|
||||
|
||||
Multiple execution tags may be combined using `operator|`, e.g.
|
||||
|
||||
```cpp
|
||||
state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer,
|
||||
[](nvbench::launch &launch, auto& timer) { /*...*/ });
|
||||
```
|
||||
|
||||
The following sections provide more details on these features.
|
||||
|
||||
### Benchmarks that sync
|
||||
|
||||
If a `KernelLauncher` synchronizes the CUDA device internally without passing
|
||||
this tag, **the benchmark will deadlock at runtime**. Passing the `sync` tag
|
||||
will fix this issue. Note that this disables batch measurements.
|
||||
|
||||
```cpp
|
||||
void sync_example(nvbench::state& state)
|
||||
{
|
||||
// Pass the `sync` exec tag to tell NVBench that this benchmark will sync:
|
||||
state.exec(nvbench::exec_tag::sync, [](nvbench::launch& launch) {
|
||||
/* Benchmark that implicitly syncs here. */
|
||||
});
|
||||
}
|
||||
NVBENCH_BENCH(sync_example);
|
||||
```
|
||||
|
||||
See [examples/exec_tag_sync.cu][CppExample_ExecTagSync] for a complete
|
||||
example.
|
||||
|
||||
(explicit-timer-mode)=
|
||||
### Explicit timer mode: `nvbench::exec_tag::timer`
|
||||
|
||||
For some kernels, the working data may need to be reset between launches. This
|
||||
is particularly common for kernels that modify their input in-place.
|
||||
|
||||
Resetting the input data to prepare for a new trial shouldn't be included in the
|
||||
benchmark's execution time. NVBench provides a manual timer mode that allows the
|
||||
kernel launcher to specify the critical section to be measured and exclude any
|
||||
per-trial reset operations.
|
||||
|
||||
To enable the manual timer mode, pass the tag object `nvbench::exec_tag::timer`
|
||||
to `state.exec`, and declare the kernel launcher with an
|
||||
additional `auto& timer` argument.
|
||||
|
||||
Note that using manual timer mode disables batch measurements.
|
||||
|
||||
```cpp
|
||||
void timer_example(nvbench::state& state)
|
||||
{
|
||||
// Pass the `timer` exec tag to request a timer:
|
||||
state.exec(nvbench::exec_tag::timer,
|
||||
// Lambda now accepts a timer:
|
||||
[](nvbench::launch& launch, auto& timer)
|
||||
{
|
||||
/* Reset code here, excluded from timing */
|
||||
|
||||
/* Timed region is explicitly marked.
|
||||
* The timer handles any synchronization, flushes, etc when/if
|
||||
* needed for the current measurement.
|
||||
*/
|
||||
timer.start();
|
||||
/* Launch kernel on `launch.get_stream()` here */
|
||||
timer.stop();
|
||||
});
|
||||
}
|
||||
NVBENCH_BENCH(timer_example);
|
||||
```
|
||||
|
||||
See [examples/exec_tag_timer.cu][CppExample_ExecTagTimer] for a complete
|
||||
example.
|
||||
|
||||
### Compilation hints
|
||||
|
||||
These execution tags are optional hints that disable the compilation of various
|
||||
code paths when they are not needed. They apply only to a single benchmark.
|
||||
|
||||
- `nvbench::exec_tag::no_batch` prevents the execution and instantiation of the batch measurement backend.
|
||||
- `nvbench::exec_tag::gpu` prevents the instantiation of CPU-only benchmarking backends.
|
||||
- Requires that the benchmark does not define `set_is_cpu_only(true)`.
|
||||
- Optional; this has no effect on runtime measurements, but reduces compile-time and binary size.
|
||||
- Host-side CPU measurements of GPU kernel execution time are still provided.
|
||||
- `nvbench::exec_tag::no_gpu` prevents the instantiation of GPU benchmarking backends.
|
||||
- Requires that the benchmark defines `set_is_cpu_only(true)`.
|
||||
- Optional; this has no effect on runtime measurements, but reduces compile-time and binary size.
|
||||
- See also [CPU-only Benchmarks](#cpu-only-benchmarks).
|
||||
|
||||
(cpu-only-benchmarks)=
|
||||
## CPU-only Benchmarks
|
||||
|
||||
NVBench provides CPU-only benchmarking facilities that are intended for measuring
|
||||
significant CPU workloads. We do not recommend using these features for high-resolution
|
||||
CPU benchmarking -- other libraries (such as Google Benchmark) are more appropriate for
|
||||
such applications. Examples are provided in [examples/cpu_only.cu][CppExample_CpuOnly].
|
||||
|
||||
Note that NVBench still requires a CUDA compiler and runtime even if a project only contains
|
||||
CPU-only benchmarks.
|
||||
|
||||
The `is_cpu_only` property of the benchmark toggles between GPU and CPU-only measurements:
|
||||
|
||||
```cpp
|
||||
void my_cpu_benchmark(nvbench::state &state)
|
||||
{
|
||||
state.exec([](nvbench::launch &) { /* workload */ });
|
||||
}
|
||||
NVBENCH_BENCH(my_cpu_benchmark)
|
||||
.set_is_cpu_only(true); // Mark as CPU-only.
|
||||
```
|
||||
|
||||
The optional `nvbench::exec_tag::no_gpu` hint may be used to reduce the compilation time and
|
||||
binary size of CPU-only benchmarks. An error is emitted at runtime if this tag is used while
|
||||
`is_cpu_only` is false.
|
||||
|
||||
```cpp
|
||||
void my_cpu_benchmark(nvbench::state &state)
|
||||
{
|
||||
state.exec(nvbench::exec_tag::no_gpu, // Prevent compilation of GPU backends
|
||||
[](nvbench::launch &) { /* workload */ });
|
||||
}
|
||||
NVBENCH_BENCH(my_cpu_benchmark)
|
||||
.set_is_cpu_only(true); // Mark as CPU-only.
|
||||
```
|
||||
|
||||
The `nvbench::exec_tag::timer` execution tag is also supported by CPU-only benchmarks. This
|
||||
is useful for benchmarks that require additional per-sample setup/teardown. See the
|
||||
[`nvbench::exec_tag::timer`](#explicit-timer-mode) section for more
|
||||
details.
|
||||
|
||||
```cpp
|
||||
void my_cpu_benchmark(nvbench::state &state)
|
||||
{
|
||||
state.exec(nvbench::exec_tag::no_gpu | // Prevent compilation of GPU backends
|
||||
nvbench::exec_tag::timer, // Request a timer object
|
||||
[](nvbench::launch &, auto &timer)
|
||||
{
|
||||
// Setup here
|
||||
timer.start();
|
||||
// timed workload
|
||||
timer.stop();
|
||||
// teardown here
|
||||
});
|
||||
}
|
||||
NVBENCH_BENCH(my_cpu_benchmark)
|
||||
.set_is_cpu_only(true); // Mark as CPU-only.
|
||||
```
|
||||
|
||||
## Beware of Combinatorial Explosion
|
||||
|
||||
Be very careful of how quickly the configuration space can grow. The following
|
||||
example generates 960 total runtime benchmark configurations, and will compile
|
||||
192 different static parametrizations of the kernel generator. This is likely
|
||||
excessive, especially for routine regression testing.
|
||||
|
||||
```cpp
|
||||
using value_types = nvbench::type_list<nvbench::uint8_t,
|
||||
nvbench::int32_t,
|
||||
nvbench::float32_t,
|
||||
nvbench::float64_t>;
|
||||
using op_types = nvbench::type_list<thrust::plus<>,
|
||||
thrust::multiplies<>,
|
||||
thrust::maximum<>>;
|
||||
|
||||
NVBENCH_BENCH_TYPES(my_benchmark,
|
||||
NVBENCH_TYPE_AXES(value_types,
|
||||
value_types,
|
||||
value_types,
|
||||
op_types))
|
||||
.set_type_axes_names({"T", "U", "V", "Op"})
|
||||
.add_int64_power_of_two_axis("NumInputs", nvbench::range(10, 30, 5));
|
||||
```
|
||||
|
||||
```
|
||||
960 total configs
|
||||
= 4 [T=(U8, I32, F32, F64)]
|
||||
* 4 [U=(U8, I32, F32, F64)]
|
||||
* 4 [V=(U8, I32, F32, F64)]
|
||||
* 3 [Op=(plus, multiplies, max)]
|
||||
* 5 [NumInputs=(2^10, 2^15, 2^20, 2^25, 2^30)]
|
||||
```
|
||||
|
||||
For large configuration spaces like this, pruning some of the less useful
|
||||
combinations (e.g. `sizeof(init_type) < sizeof(output)`) using the techniques
|
||||
described in the [Skip Uninteresting / Invalid Benchmarks](#skip-uninteresting-or-invalid-benchmarks)
|
||||
section can help immensely with keeping compile / run times manageable.
|
||||
|
||||
Splitting a single large configuration space into multiple, more focused
|
||||
benchmarks with reduced dimensionality will likely be worth the effort as well.
|
||||
|
||||
[CppExample_Stream]: https://github.com/NVIDIA/nvbench/blob/main/examples/stream.cu
|
||||
[CppExample_Axes]: https://github.com/NVIDIA/nvbench/blob/main/examples/axes.cu
|
||||
[CppExample_Enums]: https://github.com/NVIDIA/nvbench/blob/main/examples/enums.cu
|
||||
[CppExample_Throughput]: https://github.com/NVIDIA/nvbench/blob/main/examples/throughput.cu
|
||||
[CppExample_Skip]: https://github.com/NVIDIA/nvbench/blob/main/examples/skip.cu
|
||||
[CppExample_CpuOnly]: https://github.com/NVIDIA/nvbench/blob/main/examples/cpu_only.cu
|
||||
[CppExample_ExecTagSync]: https://github.com/NVIDIA/nvbench/blob/main/examples/exec_tag_sync.cu
|
||||
[CppExample_ExecTagTimer]: https://github.com/NVIDIA/nvbench/blob/main/examples/exec_tag_timer.cu
|
||||
52
docs/sphinx-combined/index.rst
Normal file
52
docs/sphinx-combined/index.rst
Normal file
@@ -0,0 +1,52 @@
|
||||
CUDA Kernel Benchmarking Library
|
||||
================================
|
||||
|
||||
The library, NVBench, presently supports writing benchmarks in C++ and in Python.
|
||||
It is designed to simplify CUDA kernel benchmarking. It features:
|
||||
|
||||
* :ref:`Parameter sweeps <parameter-axes>`: a powerful and
|
||||
flexible "axis" system explores a kernel's configuration space. Parameters may
|
||||
be dynamic numbers/strings or :ref:`static types <type-axes>`.
|
||||
* :ref:`Runtime customization <cli-overview>`: A rich command-line interface
|
||||
allows :ref:`redefinition of parameter axes <cli-overview-axes>`, CUDA device
|
||||
selection, locking GPU clocks (Volta+), changing output formats, and more.
|
||||
* :ref:`Throughput calculations <throughput-measurements>`: Compute
|
||||
and report:
|
||||
|
||||
* Item throughput (elements/second)
|
||||
* Global memory bandwidth usage (bytes/second and per-device %-of-peak-bw)
|
||||
|
||||
* Multiple output formats: Currently supports markdown (default) and CSV output.
|
||||
* :ref:`Manual timer mode <explicit-timer-mode>`:
|
||||
(optional) Explicitly start/stop timing in a benchmark implementation.
|
||||
* Multiple measurement types:
|
||||
|
||||
* Cold Measurements:
|
||||
|
||||
* Each sample runs the benchmark once with a clean device L2 cache.
|
||||
* GPU and CPU times are reported.
|
||||
|
||||
* Batch Measurements:
|
||||
|
||||
* Executes the benchmark multiple times back-to-back and records total time.
|
||||
* Reports the average execution time (total time / number of executions).
|
||||
|
||||
* :ref:`CPU-only Measurements <cpu-only-benchmarks>`:
|
||||
|
||||
* Measures the host-side execution time of a non-GPU benchmark.
|
||||
* Not suitable for microbenchmarking.
|
||||
|
||||
Check out `GPU Mode talk #56 <https://www.youtube.com/watch?v=CtrqBmYtSEk>`_ for an overview
|
||||
of the challenges inherent to CUDA kernel benchmarking and how NVBench solves them for you!
|
||||
|
||||
-------
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
:hidden:
|
||||
|
||||
cpp_benchmarks
|
||||
py_benchmarks
|
||||
cli_overview
|
||||
cpp_api
|
||||
python_api
|
||||
42
docs/sphinx-combined/py_benchmarks.md
Normal file
42
docs/sphinx-combined/py_benchmarks.md
Normal file
@@ -0,0 +1,42 @@
|
||||
# NVBench: benchmarking in Python
|
||||
|
||||
The `cuda.bench` Python module provides a Python API powered by the C++ NVBench
|
||||
library to benchmark GPU-aware Python code.
|
||||
|
||||
## Minimal benchmark
|
||||
|
||||
```python
|
||||
from cuda.bench import State, Launch
|
||||
from cuda.bench import register, run_all_registered
|
||||
from typing import Callable
|
||||
|
||||
from my_package import impl
|
||||
|
||||
def benchmark_impl(state: State) -> None:
|
||||
|
||||
# get state parameters
|
||||
n = state.get_int64("Elements")
|
||||
|
||||
# prepare inputs
|
||||
data = generate(n, state.get_stream())
|
||||
|
||||
# body that is being timed. Must execute
|
||||
# on the stream handed over by NVBench.
|
||||
# Typically launches a kernel of interest
|
||||
launch_fn: Callable[[Launch], None] = (
    lambda launch: impl(data, launch.get_stream())
)
|
||||
|
||||
state.exec(launch_fn)
|
||||
|
||||
|
||||
bench = register(benchmark_impl)
|
||||
# give the benchmark a name
|
||||
bench.set_name("my_package_kernel")
|
||||
# specify the default values of the parameter to run the benchmark with
|
||||
bench.add_int64_axis("Elements", [1000, 10000, 100000])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
run_all_registered(sys.argv)
|
||||
```
|
||||
17
docs/sphinx-combined/python_api.rst
Normal file
17
docs/sphinx-combined/python_api.rst
Normal file
@@ -0,0 +1,17 @@
|
||||
`cuda.bench` Python API Reference
|
||||
=================================
|
||||
|
||||
The Python package ``cuda.bench`` is designed to empower
|
||||
users to write CUDA kernel benchmarks in Python.
|
||||
|
||||
Alignment with behavior of benchmarks written in C++
|
||||
allows for meaningful comparison between them.
|
||||
|
||||
Classes and functions
|
||||
---------------------
|
||||
|
||||
.. automodule:: cuda.bench
|
||||
:members:
|
||||
:imported-members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
@@ -69,12 +69,21 @@ NVBenchRuntimeError = _nvbench_module.NVBenchRuntimeError
|
||||
State = _nvbench_module.State
|
||||
register = _nvbench_module.register
|
||||
run_all_benchmarks = _nvbench_module.run_all_benchmarks
|
||||
test_cpp_exception = _nvbench_module.test_cpp_exception
|
||||
test_py_exception = _nvbench_module.test_py_exception
|
||||
_test_cpp_exception = _nvbench_module._test_cpp_exception
|
||||
_test_py_exception = _nvbench_module._test_py_exception
|
||||
|
||||
# Expose the module as _nvbench for backward compatibility (e.g., for tests)
|
||||
_nvbench = _nvbench_module
|
||||
|
||||
# Set module of exposed objects
|
||||
Benchmark.__module__ = __name__
|
||||
CudaStream.__module__ = __name__
|
||||
Launch.__module__ = __name__
|
||||
NVBenchRuntimeError.__module__ = __name__
|
||||
State.__module__ = __name__
|
||||
register.__module__ = __name__
|
||||
run_all_benchmarks.__module__ = __name__
|
||||
|
||||
# Clean up internal symbols
|
||||
del (
|
||||
_nvbench_module,
|
||||
|
||||
@@ -273,7 +273,7 @@ static void def_class_CudaStream(py::module_ m)
|
||||
// nvbench::cuda_stream::get_stream
|
||||
|
||||
static constexpr const char *class_CudaStream_doc = R"XXX(
|
||||
Represents CUDA stream
|
||||
Represents CUDA stream
|
||||
|
||||
Note
|
||||
----
|
||||
@@ -321,7 +321,7 @@ void def_class_Launch(py::module_ m)
|
||||
// nvbench::launch::get_stream -> nvbench::cuda_stream
|
||||
|
||||
static constexpr const char *class_Launch_doc = R"XXXX(
|
||||
Configuration object for function launch.
|
||||
Configuration object for function launch.
|
||||
|
||||
Note
|
||||
----
|
||||
@@ -363,13 +363,13 @@ static void def_class_Benchmark(py::module_ m)
|
||||
// nvbench::benchmark_base::set_min_samples
|
||||
|
||||
static constexpr const char *class_Benchmark_doc = R"XXXX(
|
||||
Represents NVBench benchmark.
|
||||
Represents NVBench benchmark.
|
||||
|
||||
Note
|
||||
----
|
||||
The class is not user-constructible.
|
||||
|
||||
Use `~register` function to create Benchmark and register
|
||||
Use `register` function to create Benchmark and register
|
||||
it with NVBench.
|
||||
)XXXX";
|
||||
auto py_benchmark_cls = py::class_<nvbench::benchmark_base>(m, "Benchmark", class_Benchmark_doc);
|
||||
@@ -691,7 +691,7 @@ void def_class_State(py::module_ m)
|
||||
|
||||
using state_ref_t = std::reference_wrapper<nvbench::state>;
|
||||
static constexpr const char *class_State_doc = R"XXXX(
|
||||
Represent benchmark configuration state.
|
||||
Represents benchmark configuration state.
|
||||
|
||||
Note
|
||||
----
|
||||
@@ -736,7 +736,7 @@ Get device_id of the device from this configuration
|
||||
return std::ref(state.get_cuda_stream());
|
||||
};
|
||||
static constexpr const char *method_get_stream_doc = R"XXXX(
|
||||
Get `~CudaStream` object from this configuration"
|
||||
Get `CudaStream` object from this configuration
|
||||
)XXXX";
|
||||
pystate_cls.def("get_stream",
|
||||
method_get_stream_impl,
|
||||
@@ -1014,10 +1014,10 @@ Use argument True to disable use of blocking kernel by NVBench"
|
||||
}
|
||||
};
|
||||
static constexpr const char *method_exec_doc = R"XXXX(
|
||||
Execute callable running the benchmark.
|
||||
Execute callable running the benchmark.
|
||||
|
||||
The callable may be executed multiple times. The callable
|
||||
will be passed `~Launch` object argument.
|
||||
will be passed `Launch` object argument.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
@@ -1194,8 +1194,8 @@ Register benchmark function of type Callable[[nvbench.State], None]
|
||||
py::arg("argv") = py::list());
|
||||
|
||||
// Testing utilities
|
||||
m.def("test_cpp_exception", []() { throw nvbench_run_error("Test"); });
|
||||
m.def("test_py_exception", []() {
|
||||
m.def("_test_cpp_exception", []() { throw nvbench_run_error("Test"); });
|
||||
m.def("_test_py_exception", []() {
|
||||
py::set_error(exc_storage.get_stored(), "Test");
|
||||
throw py::error_already_set();
|
||||
});
|
||||
|
||||
@@ -6,12 +6,12 @@ import pytest
|
||||
|
||||
def test_cpp_exception():
|
||||
with pytest.raises(RuntimeError, match="Test"):
|
||||
bench._nvbench.test_cpp_exception()
|
||||
bench._nvbench._test_cpp_exception()
|
||||
|
||||
|
||||
def test_py_exception():
|
||||
with pytest.raises(bench.NVBenchRuntimeError, match="Test"):
|
||||
bench._nvbench.test_py_exception()
|
||||
bench._nvbench._test_py_exception()
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
|
||||
Reference in New Issue
Block a user