cuda.nvbench -> cuda.bench

Per PR review suggestion:
   - `cuda.parallel`    - device-wide algorithms/Thrust
   - `cuda.cooperative` - Cooperative algorithms/CUB
   - `cuda.bench`       - Benchmarking/NVBench
This commit is contained in:
Oleksandr Pavlyk
2025-08-04 13:42:43 -05:00
parent c2a2acc9b6
commit b5e4b4ba31
19 changed files with 136 additions and 140 deletions

View File

@@ -1,18 +1,18 @@
import sys
import cuda.nvbench as nvbench
import cuda.bench as bench
import torch
def as_torch_cuda_Stream(
cs: nvbench.CudaStream, dev: int | None
cs: bench.CudaStream, dev: int | None
) -> torch.cuda.ExternalStream:
return torch.cuda.ExternalStream(
stream_ptr=cs.addressof(), device=torch.cuda.device(dev)
)
def torch_bench(state: nvbench.State) -> None:
def torch_bench(state: bench.State) -> None:
state.set_throttle_threshold(0.25)
dev_id = state.get_device()
@@ -31,7 +31,7 @@ def torch_bench(state: nvbench.State) -> None:
learning_rate = 1e-4
def launcher(launch: nvbench.Launch) -> None:
def launcher(launch: bench.Launch) -> None:
tc_s = as_torch_cuda_Stream(launch.get_stream(), dev_id)
with torch.cuda.stream(tc_s):
x2 = torch.square(x)
@@ -53,6 +53,6 @@ def torch_bench(state: nvbench.State) -> None:
if __name__ == "__main__":
nvbench.register(torch_bench)
bench.register(torch_bench)
nvbench.run_all_benchmarks(sys.argv)
bench.run_all_benchmarks(sys.argv)