Merge branch 'main' into remove-cupti-python

This commit is contained in:
Nader Al Awar
2026-02-03 08:58:41 -06:00
11 changed files with 870 additions and 557 deletions

View File

@@ -115,6 +115,12 @@
* Applies to the most recent `--benchmark`, or all benchmarks if specified
before any `--benchmark` arguments.
* `--no-batch`
* Do not run batched measurements even if enabled.
* Intended to shorten run-time when batched measurements are not of interest.
* Applies to the most recent `--benchmark`, or all benchmarks if specified
before any `--benchmark` arguments.
## Stopping Criteria
* `--timeout <seconds>`

View File

@@ -183,6 +183,18 @@ struct benchmark_base
}
/// @}
/// If true, the batched measurements for benchmark are not run. This is intended for use to
/// save resources when only non-batched measurements are of interest, although batched
/// measurements are meaningful and code to exercise them is compiled. This option has no
/// effect for CPU only benchmarks and for benchmarks tagged with no_batch tag. @{
[[nodiscard]] bool get_skip_batched() const { return m_skip_batched; }
benchmark_base &set_skip_batched(bool v)
{
m_skip_batched = v;
return *this;
}
/// @}
/// If true, the benchmark does not use the blocking_kernel. This is intended
/// for use with external profiling tools. @{
[[nodiscard]] bool get_disable_blocking_kernel() const { return m_disable_blocking_kernel; }
@@ -304,6 +316,7 @@ protected:
bool m_is_cpu_only{false};
bool m_run_once{false};
bool m_disable_blocking_kernel{false};
bool m_skip_batched{false};
nvbench::int64_t m_min_samples{10};

View File

@@ -185,7 +185,7 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
static_assert(!(tags & no_batch), "Hot measurement doesn't support the `no_batch` exec_tag.");
static_assert(!(tags & no_gpu), "Hot measurement doesn't support the `no_gpu` exec_tag.");
if (!this->get_run_once())
if (!this->skip_hot_measurement())
{
using measure_t = nvbench::detail::measure_hot<KL>;
measure_t measure{*this, kernel_launcher};

View File

@@ -467,6 +467,11 @@ void option_parser::parse_range(option_parser::arg_iterator_t first,
this->enable_profile();
first += 1;
}
else if (arg == "--no-batch")
{
this->disable_batched();
first += 1;
}
else if (arg == "--quiet" || arg == "-q")
{
// Setting this flag prevents the default stdout printer from being
@@ -762,6 +767,18 @@ void option_parser::enable_profile()
bench.set_run_once(true);
}
// Handle the `--no-batch` CLI flag: mark the most recently declared benchmark
// so its batched measurements are skipped. When no benchmark has been declared
// yet, record the flag globally so it is replayed for every later benchmark.
void option_parser::disable_batched()
{
  if (m_benchmarks.empty())
  {
    // No active benchmark yet -- defer by saving the flag as a global arg.
    m_global_benchmark_args.push_back("--no-batch");
  }
  else
  {
    m_benchmarks.back()->set_skip_batched(true);
  }
}
void option_parser::add_benchmark(const std::string &name)
try
{

View File

@@ -90,6 +90,7 @@ private:
void set_stopping_criterion(const std::string &criterion);
void enable_profile();
void disable_batched();
void add_benchmark(const std::string &name);
void replay_global_args();

View File

@@ -161,6 +161,14 @@ struct state
void set_run_once(bool v) { m_run_once = v; }
/// @}
/// If true, the batched measurements of benchmark are not run. This is intended for use to
/// save resources when only non-batched measurements are of interest, although batched
/// measurements are meaningful and code to exercise them is compiled. This option has no
/// effect for CPU only benchmarks and for benchmarks tagged with no_batch tag. @{
[[nodiscard]] bool get_skip_batched() const { return m_skip_batched; }
void set_skip_batched(bool v) { m_skip_batched = v; }
/// @}
/// If true, the benchmark does not use the blocking_kernel. This is intended
/// for use with external profiling tools. @{
[[nodiscard]] bool get_disable_blocking_kernel() const { return m_disable_blocking_kernel; }
@@ -298,6 +306,8 @@ private:
std::optional<nvbench::device_info> device,
std::size_t type_config_index);
[[nodiscard]] bool skip_hot_measurement() const { return get_run_once() || get_skip_batched(); }
std::reference_wrapper<const nvbench::benchmark_base> m_benchmark;
nvbench::named_values m_axis_values;
std::optional<nvbench::device_info> m_device;
@@ -306,6 +316,7 @@ private:
bool m_is_cpu_only{false};
bool m_run_once{false};
bool m_disable_blocking_kernel{false};
bool m_skip_batched{false};
nvbench::criterion_params m_criterion_params;
std::string m_stopping_criterion;

View File

@@ -57,6 +57,7 @@ state::state(const benchmark_base &bench,
, m_is_cpu_only(bench.get_is_cpu_only())
, m_run_once{bench.get_run_once()}
, m_disable_blocking_kernel{bench.get_disable_blocking_kernel()}
, m_skip_batched{bench.get_skip_batched()}
, m_criterion_params{bench.get_criterion_params()}
, m_stopping_criterion(bench.get_stopping_criterion())
, m_min_samples{bench.get_min_samples()}

View File

@@ -83,3 +83,7 @@ del (
_module_fullname,
_get_cuda_major_version,
)
__doc__ = """
CUDA Kernel Benchmarking Library Python API
"""

View File

@@ -29,217 +29,73 @@ from collections.abc import Callable, Sequence
from typing import Optional, Self, SupportsFloat, SupportsInt, Union
class CudaStream:
"""Represents CUDA stream
Note
----
The class is not user-constructible.
"""
def __cuda_stream__(self) -> tuple[int, int]:
"""
Special method implement CUDA stream protocol
from `cuda.core`. Returns a pair of integers:
(protocol_version, integral_value_of_cudaStream_t pointer)
Example
-------
import cuda.core.experimental as core
import cuda.bench as bench
def bench(state: bench.State):
dev = core.Device(state.get_device())
dev.set_current()
# converts CudaStream to core.Stream
# using __cuda_stream__ protocol
dev.create_stream(state.get_stream())
"""
...
def addressof(self) -> int:
"Integral value of address of driver's CUDA stream struct"
...
def __cuda_stream__(self) -> tuple[int, int]: ...
def addressof(self) -> int: ...
class Benchmark:
"""Represents NVBench benchmark.
Note
----
The class is not user-constructible.
Use `~register` function to create Benchmark and register
it with NVBench.
"""
def get_name(self) -> str:
"Get benchmark name"
...
def add_int64_axis(self, name: str, values: Sequence[SupportsInt]) -> Self:
"Add integral type parameter axis with given name and values to sweep over"
...
def get_name(self) -> str: ...
def add_int64_axis(self, name: str, values: Sequence[SupportsInt]) -> Self: ...
def add_int64_power_of_two_axis(
self, name: str, values: Sequence[SupportsInt]
) -> Self:
"Add integral type parameter axis with given name and values to sweep over"
...
def add_float64_axis(self, name: str, values: Sequence[SupportsFloat]) -> Self:
"Add floating-point type parameter axis with given name and values to sweep over"
...
def add_string_axis(self, name: str, values: Sequence[str]) -> Self:
"Add string type parameter axis with given name and values to sweep over"
...
def set_name(self, name: str) -> Self:
"Set benchmark name"
...
def set_is_cpu_only(self, is_cpu_only: bool) -> Self:
"Set whether this benchmark only executes on CPU"
...
def set_run_once(self, v: bool) -> Self:
"Set whether all benchmark configurations are executed only once"
...
def set_skip_time(self, duration_seconds: SupportsFloat) -> Self:
"Set run durations, in seconds, that should be skipped"
...
def set_throttle_recovery_delay(self, delay_seconds: SupportsFloat) -> Self:
"Set throttle recovery delay, in seconds"
...
def set_throttle_threshold(self, threshold: SupportsFloat) -> Self:
"Set throttle threshold, as a fraction of maximal GPU frequency"
...
def set_timeout(self, duration_seconds: SupportsFloat) -> Self:
"Set benchmark run duration timeout value, in seconds"
...
def set_stopping_criterion(self, criterion: str) -> Self:
"Set stopping criterion to be used"
...
def set_criterion_param_float64(self, name: str, value: SupportsFloat) -> Self:
"Set stopping criterion floating point parameter value"
...
def set_criterion_param_int64(self, name: str, value: SupportsInt) -> Self:
"Set stopping criterion integer parameter value"
...
def set_criterion_param_string(self, name: str, value: str) -> Self:
"Set stopping criterion string parameter value"
...
def set_min_samples(self, count: SupportsInt) -> Self:
"Set minimal samples count before stopping criterion applies"
...
) -> Self: ...
def add_float64_axis(self, name: str, values: Sequence[SupportsFloat]) -> Self: ...
def add_string_axis(self, name: str, values: Sequence[str]) -> Self: ...
def set_name(self, name: str) -> Self: ...
def set_run_once(self, v: bool) -> Self: ...
def set_skip_time(self, duration_seconds: SupportsFloat) -> Self: ...
def set_throttle_recovery_delay(self, delay_seconds: SupportsFloat) -> Self: ...
def set_throttle_threshold(self, threshold: SupportsFloat) -> Self: ...
def set_timeout(self, duration_seconds: SupportsFloat) -> Self: ...
def set_stopping_criterion(self, criterion: str) -> Self: ...
def set_criterion_param_float64(self, name: str, value: SupportsFloat) -> Self: ...
def set_criterion_param_int64(self, name: str, value: SupportsInt) -> Self: ...
def set_criterion_param_string(self, name: str, value: str) -> Self: ...
def set_min_samples(self, count: SupportsInt) -> Self: ...
def set_is_cpu_only(self, is_cpu_only: bool) -> Self: ...
class Launch:
"""Configuration object for function launch.
Note
----
The class is not user-constructible.
"""
def get_stream(self) -> CudaStream:
"Get CUDA stream of this configuration"
...
def get_stream(self) -> CudaStream: ...
class State:
"""Represent benchmark configuration state.
Note
----
The class is not user-constructible.
"""
def has_device(self) -> bool:
"True if configuration has a device"
...
def has_printers(self) -> bool:
"True if configuration has a printer"
...
def get_device(self) -> Union[int, None]:
"Get device_id of the device from this configuration"
...
def get_stream(self) -> CudaStream:
"CudaStream object from this configuration"
...
def get_int64(self, name: str) -> int:
"Get value for given Int64 axis from this configuration"
...
def get_int64_or_default(self, name: str, default_value: SupportsInt) -> int:
"Get value for given Int64 axis from this configuration"
...
def get_float64(self, name: str) -> float:
"Get value for given Float64 axis from this configuration"
...
def get_float64_or_default(self, name: str, default_value: SupportsFloat) -> float:
"Get value for given Float64 axis from this configuration"
...
def get_string(self, name: str) -> str:
"Get value for given String axis from this configuration"
...
def get_string_or_default(self, name: str, default_value: str) -> str:
"Get value for given String axis from this configuration"
...
def has_device(self) -> bool: ...
def has_printers(self) -> bool: ...
def get_device(self) -> Union[int, None]: ...
def get_stream(self) -> CudaStream: ...
def get_int64(self, name: str) -> int: ...
def get_int64_or_default(self, name: str, default_value: SupportsInt) -> int: ...
def get_float64(self, name: str) -> float: ...
def get_float64_or_default(
self, name: str, default_value: SupportsFloat
) -> float: ...
def get_string(self, name: str) -> str: ...
def get_string_or_default(self, name: str, default_value: str) -> str: ...
def add_element_count(
self, count: SupportsInt, column_name: Optional[str] = None
) -> None:
"Add element count"
...
def set_element_count(self, count: SupportsInt) -> None:
"Set element count"
...
def get_element_count(self) -> int:
"Get element count"
...
def skip(self, reason: str) -> None:
"Skip this configuration"
...
def is_skipped(self) -> bool:
"Has this configuration been skipped"
...
def get_skip_reason(self) -> str:
"Get reason provided for skipping this configuration"
...
) -> None: ...
def set_element_count(self, count: SupportsInt) -> None: ...
def get_element_count(self) -> int: ...
def skip(self, reason: str) -> None: ...
def is_skipped(self) -> bool: ...
def get_skip_reason(self) -> str: ...
def add_global_memory_reads(
self, nbytes: SupportsInt, /, column_name: str = ""
) -> None:
"Inform NVBench that given amount of bytes is being read by the benchmark from global memory"
...
) -> None: ...
def add_global_memory_writes(
self, nbytes: SupportsInt, /, column_name: str = ""
) -> None:
"Inform NVBench that given amount of bytes is being written by the benchmark into global memory"
...
def get_benchmark(self) -> Benchmark:
"Get Benchmark this configuration is a part of"
...
def get_throttle_threshold(self) -> float:
"Get throttle threshold value, as fraction of maximal frequency"
...
def set_throttle_threshold(self, threshold_fraction: SupportsFloat) -> None:
"Set throttle threshold fraction to specified value, expected to be between 0 and 1"
...
def get_min_samples(self) -> int:
"Get the number of benchmark timings NVBench performs before stopping criterion begins being used"
...
def set_min_samples(self, min_samples_count: SupportsInt) -> None:
"Set the number of benchmark timings for NVBench to perform before stopping criterion begins being used"
...
def get_disable_blocking_kernel(self) -> bool:
"True if use of blocking kernel by NVBench is disabled, False otherwise"
...
def set_disable_blocking_kernel(self, flag: bool) -> None:
"Use flag = True to disable use of blocking kernel by NVBench"
...
def get_run_once(self) -> bool:
"Boolean flag whether configuration should only run once"
...
def set_run_once(self, run_once_flag: bool) -> None:
"Set run-once flag for this configuration"
...
def get_timeout(self) -> float:
"Get time-out value for benchmark execution of this configuration, in seconds"
...
def set_timeout(self, duration: SupportsFloat) -> None:
"Set time-out value for benchmark execution of this configuration, in seconds"
...
def get_blocking_kernel_timeout(self) -> float:
"Get time-out value for execution of blocking kernel, in seconds"
...
def set_blocking_kernel_timeout(self, duration: SupportsFloat) -> None:
"Set time-out value for execution of blocking kernel, in seconds"
...
) -> None: ...
def get_benchmark(self) -> Benchmark: ...
def get_throttle_threshold(self) -> float: ...
def set_throttle_threshold(self, threshold_fraction: SupportsFloat) -> None: ...
def get_min_samples(self) -> int: ...
def set_min_samples(self, min_samples_count: SupportsInt) -> None: ...
def get_disable_blocking_kernel(self) -> bool: ...
def set_disable_blocking_kernel(self, flag: bool) -> None: ...
def get_run_once(self) -> bool: ...
def set_run_once(self, run_once_flag: bool) -> None: ...
def get_timeout(self) -> float: ...
def set_timeout(self, duration: SupportsFloat) -> None: ...
def get_blocking_kernel_timeout(self) -> float: ...
def set_blocking_kernel_timeout(self, duration: SupportsFloat) -> None: ...
def exec(
self,
fn: Callable[[Launch], None],
@@ -247,60 +103,16 @@ class State:
*,
batched: Optional[bool] = True,
sync: Optional[bool] = False,
):
"""Execute callable running the benchmark.
The callable may be executed multiple times.
Parameters
----------
fn: Callable
Python callable with signature fn(Launch) -> None that executes the benchmark.
batched: bool, optional
If `True`, no cache flushing is performed between callable invocations.
Default: `True`.
sync: bool, optional
True value indicates that callable performs device synchronization.
NVBench disables use of blocking kernel in this case.
Default: `False`.
"""
...
def get_short_description(self) -> str:
"Get short description for this configuration"
...
): ...
def get_short_description(self) -> str: ...
def add_summary(
self, column_name: str, value: Union[SupportsInt, SupportsFloat, str]
) -> None:
"Add summary column with a value"
...
def get_axis_values(self) -> dict[str, int | float | str]:
"Get dictionary with axis values for this configuration"
...
def get_axis_values_as_string(self) -> str:
"Get string of space-separated name=value pairs for this configuration"
...
def get_stopping_criterion(self) -> str:
"Get string name of stopping criterion used"
...
) -> None: ...
def get_axis_values(self) -> dict[str, int | float | str]: ...
def get_axis_values_as_string(self) -> str: ...
def get_stopping_criterion(self) -> str: ...
def register(fn: Callable[[State], None]) -> Benchmark:
"""
Register given benchmarking function with NVBench.
"""
...
def register(fn: Callable[[State], None]) -> Benchmark: ...
def run_all_benchmarks(argv: Sequence[str]) -> None: ...
def run_all_benchmarks(argv: Sequence[str]) -> None:
"""
Run all benchmarks registered with NVBench.
Parameters
----------
argv: List[str]
Sequence of CLI arguments controlling NVBench. Usually, it is `sys.argv`.
"""
...
class NVBenchRuntimeError(RuntimeError):
"""An exception raised if running benchmarks encounters an error"""
...
class NVBenchRuntimeError(RuntimeError): ...

File diff suppressed because it is too large Load Diff

View File

@@ -37,3 +37,54 @@ def test_cpu_only():
b.set_is_cpu_only(True)
bench.run_all_benchmarks(["-q", "--profile"])
def docstring_check(doc_str: str) -> None:
    """Fail unless ``doc_str`` is a non-empty ``str``."""
    assert isinstance(doc_str, str)
    assert doc_str != ""
def obj_has_docstring_check(o: object) -> None:
    """Assert that object ``o`` carries a non-empty docstring."""
    doc = o.__doc__
    docstring_check(doc)
def test_module_doc():
    # The cuda.bench module itself must expose a non-empty __doc__.
    obj_has_docstring_check(bench)
def test_register_doc():
    # The register() entry point must expose a non-empty docstring.
    obj_has_docstring_check(bench.register)
def test_run_all_benchmarks_doc():
    # run_all_benchmarks() must expose a non-empty docstring.
    obj_has_docstring_check(bench.run_all_benchmarks)
def test_State_doc():
    """Check that bench.State and a sample of its methods are documented."""
    state_cls = bench.State
    obj_has_docstring_check(state_cls)
    for member in (
        state_cls.exec,
        state_cls.get_int64,
        state_cls.get_float64,
        state_cls.get_string,
        state_cls.skip,
    ):
        obj_has_docstring_check(member)
def test_Launch_doc():
    """Check that bench.Launch and its get_stream method are documented."""
    launch_cls = bench.Launch
    for target in (launch_cls, launch_cls.get_stream):
        obj_has_docstring_check(target)
def test_CudaStream_doc():
    """Check that the bench.CudaStream class is documented."""
    obj_has_docstring_check(bench.CudaStream)
def test_Benchmark_doc():
cl = bench.Benchmark
obj_has_docstring_check(cl)
obj_has_docstring_check(cl.add_int64_axis)
obj_has_docstring_check(cl.add_int64_power_of_two_axis)
obj_has_docstring_check(cl.add_float64_axis)
obj_has_docstring_check(cl.add_string_axis)