From 6df5fc8c677efced31113b134ff0b35fc1430c7a Mon Sep 17 00:00:00 2001 From: Nader Al Awar Date: Mon, 2 Feb 2026 15:37:13 -0600 Subject: [PATCH 1/3] Remove cupti from cuda-bench dependencies --- python/cuda/bench/__init__.py | 8 -------- python/cuda/bench/__init__.pyi | 6 ------ python/examples/auto_throughput.py | 1 - python/pyproject.toml | 8 ++------ python/src/py_nvbench.cpp | 6 ------ 5 files changed, 2 insertions(+), 27 deletions(-) diff --git a/python/cuda/bench/__init__.py b/python/cuda/bench/__init__.py index c02c14c..f3f7201 100644 --- a/python/cuda/bench/__init__.py +++ b/python/cuda/bench/__init__.py @@ -18,10 +18,6 @@ import importlib import importlib.metadata import warnings -from cuda.pathfinder import ( # type: ignore[import-not-found] - load_nvidia_dynamic_lib, -) - try: __version__ = importlib.metadata.version("cuda-bench") except Exception as e: @@ -65,10 +61,6 @@ except ImportError as e: f"Original error: {e}" ) -# Load required NVIDIA libraries -for libname in ("cupti", "nvperf_target", "nvperf_host"): - load_nvidia_dynamic_lib(libname) - # Import and expose all public symbols from the CUDA-specific extension Benchmark = _nvbench_module.Benchmark CudaStream = _nvbench_module.CudaStream diff --git a/python/cuda/bench/__init__.pyi b/python/cuda/bench/__init__.pyi index 86681fc..25f7fca 100644 --- a/python/cuda/bench/__init__.pyi +++ b/python/cuda/bench/__init__.pyi @@ -240,12 +240,6 @@ class State: def set_blocking_kernel_timeout(self, duration: SupportsFloat) -> None: "Set time-out value for execution of blocking kernel, in seconds" ... - def collect_cupti_metrics(self) -> None: - "Request NVBench to record CUPTI metrics while running benchmark for this configuration" - ... - def is_cupti_required(self) -> bool: - "True if (some) CUPTI metrics are being collected" - ... def exec( self, fn: Callable[[Launch], None], diff --git a/python/examples/auto_throughput.py b/python/examples/auto_throughput.py index db4fa19..5d41b09 100644 --- a/python/examples/auto_throughput.py +++ b/python/examples/auto_throughput.py @@ -51,7 +51,6 @@ def throughput_bench(state: bench.State) -> None: out_arr = cuda.device_array(elements * ipt, dtype=np.int32, stream=alloc_stream) state.add_element_count(elements, column_name="Elements") - state.collect_cupti_metrics() threads_per_block = 256 blocks_in_grid = (elements + threads_per_block - 1) // threads_per_block diff --git a/python/pyproject.toml b/python/pyproject.toml index 4d288ac..f7ddf78 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -21,19 +21,15 @@ classifiers = [ "Operating System :: POSIX :: Linux", ] requires-python = ">=3.10" -dependencies = [ - # pathfinder for finding CUDA libraries - "cuda-pathfinder", -] dynamic = ["version"] readme = { file = "README.md", content-type = "text/markdown" } [project.optional-dependencies] # CUDA 12.x dependencies -cu12 = ["cuda-bindings>=12.0.0,<13.0.0", "nvidia-cuda-cupti-cu12"] +cu12 = ["cuda-bindings>=12.0.0,<13.0.0"] # CUDA 13.x dependencies -cu13 = ["cuda-bindings>=13.0.0,<14.0.0", "nvidia-cuda-cupti>=13.0"] +cu13 = ["cuda-bindings>=13.0.0,<14.0.0"] # Test dependencies for CUDA 12 test-cu12 = ["cuda-bench[cu12]", "pytest", "cupy-cuda12x", "numba"] diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index 68a2f26..67c20e3e 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -550,13 +550,11 @@ PYBIND11_MODULE(PYBIND11_MODULE_NAME, m) // nvbench::state::collect_stores_efficiency // nvbench::state::collect_loads_efficiency // nvbench::state::collect_dram_throughput - // nvbench::state::collect_cupti_metrics // nvbench::state::is_l1_hit_rate_collected // nvbench::state::is_l2_hit_rate_collected // nvbench::state::is_stores_efficiency_collected // nvbench::state::is_loads_efficiency_collected // nvbench::state::is_dram_throughput_collected - // nvbench::state::is_cupti_required // nvbench::state::add_summary // nvbench::state::get_summary // nvbench::state::get_summaries @@ -678,10 +676,6 @@ PYBIND11_MODULE(PYBIND11_MODULE_NAME, m) &nvbench::state::set_blocking_kernel_timeout, py::arg("duration")); - pystate_cls.def("collect_cupti_metrics", &nvbench::state::collect_cupti_metrics); - - pystate_cls.def("is_cupti_required", &nvbench::state::is_cupti_required); - pystate_cls.def( "exec", [](nvbench::state &state, py::object py_launcher_fn, bool batched, bool sync) { From f2d57301043ed712c6830213111fadd00d95876b Mon Sep 17 00:00:00 2001 From: Nader Al Awar Date: Mon, 2 Feb 2026 16:03:15 -0600 Subject: [PATCH 2/3] Disable CUPTI in cmake file --- python/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 7f8548c..6d37350 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -19,7 +19,9 @@ include(${_cpm_download_location}) CPMAddPackage( NAME nvbench SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/.. - OPTIONS "NVBench_INSTALL_RULES ON" + OPTIONS + "NVBench_INSTALL_RULES ON" + "NVBench_ENABLE_CUPTI OFF" FIND_PACKAGE_ARGS CONFIG REQUIRED ) From 4fa4296810b97f4349f954e588fcaecb6b47f945 Mon Sep 17 00:00:00 2001 From: Nader Al Awar Date: Mon, 2 Feb 2026 16:43:45 -0600 Subject: [PATCH 3/3] Remove cuda.pathfinder function --- python/cuda/bench/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cuda/bench/__init__.py b/python/cuda/bench/__init__.py index f3f7201..2a3aa0a 100644 --- a/python/cuda/bench/__init__.py +++ b/python/cuda/bench/__init__.py @@ -77,7 +77,6 @@ _nvbench = _nvbench_module # Clean up internal symbols del ( - load_nvidia_dynamic_lib, _nvbench_module, _cuda_major, _extra_name,