Remove cupti from cuda-bench dependencies (#311)

2026-03-14 20:27:24 +00:00 · 2026-02-03 14:16:26 -06:00
parent 90ad8bcbc7 d75fc74162
commit dc59f98ecd
6 changed files with 5 additions and 35 deletions
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -19,7 +19,9 @@ include(${_cpm_download_location})
 CPMAddPackage(
   NAME nvbench
   SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/..
-   OPTIONS "NVBench_INSTALL_RULES ON"
+   OPTIONS
+     "NVBench_INSTALL_RULES ON"
+     "NVBench_ENABLE_CUPTI OFF"
   FIND_PACKAGE_ARGS CONFIG REQUIRED
 )

--- a/python/cuda/bench/__init__.py
+++ b/python/cuda/bench/__init__.py
@@ -18,10 +18,6 @@ import importlib
 import importlib.metadata
 import warnings

-from cuda.pathfinder import (  # type: ignore[import-not-found]
-    load_nvidia_dynamic_lib,
-)
-
 try:
    __version__ = importlib.metadata.version("cuda-bench")
 except Exception as e:
@@ -65,10 +61,6 @@ except ImportError as e:
        f"Original error: {e}"
    )

-# Load required NVIDIA libraries
-for libname in ("cupti", "nvperf_target", "nvperf_host"):
-    load_nvidia_dynamic_lib(libname)
-
 # Import and expose all public symbols from the CUDA-specific extension
 Benchmark = _nvbench_module.Benchmark
 CudaStream = _nvbench_module.CudaStream
@@ -85,7 +77,6 @@ _nvbench = _nvbench_module

 # Clean up internal symbols
 del (
-    load_nvidia_dynamic_lib,
    _nvbench_module,
    _cuda_major,
    _extra_name,
--- a/python/cuda/bench/__init__.pyi
+++ b/python/cuda/bench/__init__.pyi
@@ -96,8 +96,6 @@ class State:
    def set_timeout(self, duration: SupportsFloat) -> None: ...
    def get_blocking_kernel_timeout(self) -> float: ...
    def set_blocking_kernel_timeout(self, duration: SupportsFloat) -> None: ...
-    def collect_cupti_metrics(self) -> None: ...
-    def is_cupti_required(self) -> bool: ...
    def exec(
        self,
        fn: Callable[[Launch], None],
--- a/python/examples/auto_throughput.py
+++ b/python/examples/auto_throughput.py
@@ -51,7 +51,6 @@ def throughput_bench(state: bench.State) -> None:
    out_arr = cuda.device_array(elements * ipt, dtype=np.int32, stream=alloc_stream)

    state.add_element_count(elements, column_name="Elements")
-    state.collect_cupti_metrics()

    threads_per_block = 256
    blocks_in_grid = (elements + threads_per_block - 1) // threads_per_block
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -21,19 +21,15 @@ classifiers = [
  "Operating System :: POSIX :: Linux",
 ]
 requires-python = ">=3.10"
-dependencies = [
-  # pathfinder for finding CUDA libraries
-  "cuda-pathfinder",
-]
 dynamic = ["version"]
 readme = { file = "README.md", content-type = "text/markdown" }

 [project.optional-dependencies]
 # CUDA 12.x dependencies
-cu12 = ["cuda-bindings>=12.0.0,<13.0.0", "nvidia-cuda-cupti-cu12"]
+cu12 = ["cuda-bindings>=12.0.0,<13.0.0"]

 # CUDA 13.x dependencies
-cu13 = ["cuda-bindings>=13.0.0,<14.0.0", "nvidia-cuda-cupti>=13.0"]
+cu13 = ["cuda-bindings>=13.0.0,<14.0.0"]

 # Test dependencies for CUDA 12
 test-cu12 = ["cuda-bench[cu12]", "pytest", "cupy-cuda12x", "numba"]
--- a/python/src/py_nvbench.cpp
+++ b/python/src/py_nvbench.cpp
@@ -676,13 +676,11 @@ void def_class_State(py::module_ m)
  //        nvbench::state::collect_stores_efficiency
  //        nvbench::state::collect_loads_efficiency
  //        nvbench::state::collect_dram_throughput
-  //        nvbench::state::collect_cupti_metrics
  //        nvbench::state::is_l1_hit_rate_collected
  //        nvbench::state::is_l2_hit_rate_collected
  //        nvbench::state::is_stores_efficiency_collected
  //        nvbench::state::is_loads_efficiency_collected
  //        nvbench::state::is_dram_throughput_collected
-  //        nvbench::state::is_cupti_required
  //        nvbench::state::add_summary
  //        nvbench::state::get_summary
  //        nvbench::state::get_summaries
@@ -972,20 +970,6 @@ Use argument True to disable use of blocking kernel by NVBench"
                  method_set_blocking_kernel_timeout_doc,
                  py::arg("duration_seconds"));

-  // method State.collect_cupti_metrics
-  static constexpr const char *method_collect_cupti_metrics_doc =
-    R"XXXX(Request NVBench to record CUPTI metrics while running benchmark for this configuration)XXXX";
-  pystate_cls.def("collect_cupti_metrics",
-                  &nvbench::state::collect_cupti_metrics,
-                  method_collect_cupti_metrics_doc);
-
-  // method State.is_cupti_required
-  static constexpr const char *method_is_cupti_required_doc =
-    R"XXXX(True if (some) CUPTI metrics are being collected)XXXX";
-  pystate_cls.def("is_cupti_required",
-                  &nvbench::state::is_cupti_required,
-                  method_is_cupti_required_doc);
-
  // method State.exec
  auto method_exec_impl =
    [](nvbench::state &state, py::object py_launcher_fn, bool batched, bool sync) -> void {