From d5d188eed401055dba04e2d0289c7bab70be35c6 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Fri, 13 Mar 2026 10:58:54 -0500 Subject: [PATCH] Changed title, fixed references, added intro borrowed from README --- docs/sphinx-combined/cli_overview.rst | 3 ++ docs/sphinx-combined/conf.py | 2 +- docs/sphinx-combined/cpp_benchmarks.md | 3 ++ docs/sphinx-combined/index.rst | 44 ++++++++++++++++++++++++-- docs/sphinx-combined/py_benchmarks.md | 10 ++++-- docs/sphinx-combined/python_api.rst | 13 ++++++-- 6 files changed, 66 insertions(+), 9 deletions(-) diff --git a/docs/sphinx-combined/cli_overview.rst b/docs/sphinx-combined/cli_overview.rst index 6d01ed2..6d94eb0 100644 --- a/docs/sphinx-combined/cli_overview.rst +++ b/docs/sphinx-combined/cli_overview.rst @@ -4,9 +4,12 @@ CLI Options Every benchmark created with NVBench supports command-line interface, with a variety of options. +.. _cli-overview: + .. include:: ../cli_help.md :parser: myst_parser.sphinx_ +.. _cli-overview-axes: .. include:: ../cli_help_axis.md :parser: myst_parser.sphinx_ diff --git a/docs/sphinx-combined/conf.py b/docs/sphinx-combined/conf.py index 26cc004..5f64edb 100644 --- a/docs/sphinx-combined/conf.py +++ b/docs/sphinx-combined/conf.py @@ -1,6 +1,6 @@ import os -project = "NVBench API" +project = "NVBench: CUDA Kernel Benchmarking Library" author = "NVIDIA Corporation" extensions = [ diff --git a/docs/sphinx-combined/cpp_benchmarks.md b/docs/sphinx-combined/cpp_benchmarks.md index 46c6125..111a149 100644 --- a/docs/sphinx-combined/cpp_benchmarks.md +++ b/docs/sphinx-combined/cpp_benchmarks.md @@ -74,6 +74,7 @@ NVBENCH_BENCH(my_benchmark); A full example can be found in [examples/stream.cu][CppExample_Stream]. 
+(parameter-axes)= ## Parameter Axes Some kernels will be used with a variety of options, input data types/sizes, and @@ -166,6 +167,7 @@ NVBENCH_BENCH(benchmark).add_string_axis("RNG Distribution", {"Uniform", "Gaussi A common use for string axes is to encode enum values, as shown in [examples/enums.cu][CppExample_Enums]. +(type-axes)= ### Type Axes Another common situation involves benchmarking a templated kernel with multiple @@ -244,6 +246,7 @@ times. Keep the rapid growth of these combinations in mind when choosing the number of values in an axis. See the section about combinatorial explosion for more examples and information. +(throughput-measurements)= ## Throughput Measurements In additional to raw timing information, NVBench can track a kernel's diff --git a/docs/sphinx-combined/index.rst b/docs/sphinx-combined/index.rst index e48acbc..8c66e6d 100644 --- a/docs/sphinx-combined/index.rst +++ b/docs/sphinx-combined/index.rst @@ -1,7 +1,45 @@ -NVBench: CUDA Kernel Benchmarking Library -========================================= +CUDA Kernel Benchmarking Library +================================ -The library presently supports kernel benchmarking in C++ and in Python. +The library, NVBench, presently supports writing benchmarks in C++ and in Python. +It is designed to simplify CUDA kernel benchmarking. It features: + +* :ref:`Parameter sweeps <parameter-axes>`: a powerful and + flexible "axis" system explores a kernel's configuration space. Parameters may + be dynamic numbers/strings or :ref:`static types <type-axes>`. +* :ref:`Runtime customization <cli-overview>`: A rich command-line interface + allows :ref:`redefinition of parameter axes <cli-overview-axes>`, CUDA device + selection, locking GPU clocks (Volta+), changing output formats, and more. +* :ref:`Throughput calculations <throughput-measurements>`: Compute + and report: + + * Item throughput (elements/second) + * Global memory bandwidth usage (bytes/second and per-device %-of-peak-bw) + +* Multiple output formats: Currently supports markdown (default) and CSV output. 
+* :ref:`Manual timer mode <manual-timer-mode>`: + (optional) Explicitly start/stop timing in a benchmark implementation. +* Multiple measurement types: + + * Cold Measurements: + + * Each sample runs the benchmark once with a clean device L2 cache. + * GPU and CPU times are reported. + + * Batch Measurements: + + * Executes the benchmark multiple times back-to-back and records total time. + * Reports the average execution time (total time / number of executions). + + * :ref:`CPU-only Measurements <cpu-only-measurements>`: + + * Measures the host-side execution time of a non-GPU benchmark. + * Not suitable for microbenchmarking. + +Check out `GPU Mode talk #56 `_ for an overview +of the challenges inherent to CUDA kernel benchmarking and how NVBench solves them for you! + +------- .. toctree:: :maxdepth: 2 diff --git a/docs/sphinx-combined/py_benchmarks.md b/docs/sphinx-combined/py_benchmarks.md index 788d4e7..3eb086d 100644 --- a/docs/sphinx-combined/py_benchmarks.md +++ b/docs/sphinx-combined/py_benchmarks.md @@ -21,14 +21,18 @@ def benchmark_impl(state: State) -> None: data = generate(n, state.get_stream()) # body that is being timed. Must execute - # on the stream handed over by NVBench - launchable_fn : Callable[[Launch], None] = 
+ # Typically launches a kernel of interest + launch_fn : Callable[[Launch], None] = lambda launch: impl(data, launch.get_stream()) - state.exec(launchable_fn) + state.exec(launch_fn) bench = register(benchmark_impl) +# give the kernel a name +bench.set_name("my_package_kernel") +# specify the default parameter values to run the benchmark with +bench.add_int64_axis("Elements", [1000, 10000, 100000]) diff --git a/docs/sphinx-combined/python_api.rst b/docs/sphinx-combined/python_api.rst index 22e24ef..1eb618f 100644 --- a/docs/sphinx-combined/python_api.rst +++ b/docs/sphinx-combined/python_api.rst @@ -1,5 +1,14 @@ -cuda.bench Python API Reference -=============================== +`cuda.bench` Python API Reference +================================= + +The Python package ``cuda.bench`` is designed to empower +users to write CUDA kernel benchmarks in Python. + +Alignment with behavior of benchmarks written in C++ +allows for meaningful comparison between them. + +Classes and functions +--------------------- .. automodule:: cuda.bench :members: