Fix type annotations in cuda.nvbench and in the examples

2026-07-16 16:51:21 +00:00 · 2025-07-22 13:02:22 -05:00
parent 13ad115ca3
commit a535a1d173
6 changed files with 52 additions and 43 deletions
--- a/python/examples/auto_throughput.py
+++ b/python/examples/auto_throughput.py
@@ -15,7 +15,6 @@
 #  limitations under the License.

 import sys
-from collections.abc import Callable

 import cuda.nvbench as nvbench
 import numpy as np
@@ -26,7 +25,7 @@ def as_cuda_Stream(cs: nvbench.CudaStream) -> cuda.cudadrv.driver.Stream:
    return cuda.external_stream(cs.addressof())


-def make_kernel(items_per_thread: int) -> Callable:
+def make_kernel(items_per_thread: int) -> cuda.compiler.AutoJitCUDAKernel:
    @cuda.jit
    def kernel(stride: np.uintp, elements: np.uintp, in_arr, out_arr):
        tid = cuda.grid(1)
--- a/python/examples/axes.py
+++ b/python/examples/axes.py
@@ -1,6 +1,6 @@
 import ctypes
 import sys
-from typing import Optional
+from typing import Dict, Optional, Tuple

 import cuda.cccl.headers as headers
 import cuda.core.experimental as core
@@ -134,7 +134,7 @@ def copy_sweep_grid_shape(state: nvbench.State):
 def copy_type_sweep(state: nvbench.State):
    type_id = state.get_int64("TypeID")

-    types_map = {
+    types_map: Dict[int, Tuple[type, str]] = {
        0: (ctypes.c_uint8, "cuda::std::uint8_t"),
        1: (ctypes.c_uint16, "cuda::std::uint16_t"),
        2: (ctypes.c_uint32, "cuda::std::uint32_t"),
@@ -148,7 +148,7 @@ def copy_type_sweep(state: nvbench.State):

    # Number of elements in 256MiB
    nbytes = 256 * 1024 * 1024
-    num_values = nbytes // ctypes.sizeof(value_ctype(0))
+    num_values = nbytes // ctypes.sizeof(value_ctype)

    state.add_element_count(num_values)
    state.add_global_memory_reads(nbytes)
--- a/python/examples/cccl_parallel_segmented_reduce.py
+++ b/python/examples/cccl_parallel_segmented_reduce.py
@@ -27,7 +27,7 @@ def as_cccl_Stream(cs: nvbench.CudaStream) -> CCCLStream:


 def as_cp_ExternalStream(
-    cs: nvbench.CudaStream, dev_id: int = -1
+    cs: nvbench.CudaStream, dev_id: int | None = -1
 ) -> cp.cuda.ExternalStream:
    h = cs.addressof()
    return cp.cuda.ExternalStream(h, dev_id)
--- a/python/examples/cupy_extract.py
+++ b/python/examples/cupy_extract.py
@@ -5,7 +5,7 @@ import cupy as cp


 def as_cp_ExternalStream(
-    cs: nvbench.CudaStream, dev_id: int = -1
+    cs: nvbench.CudaStream, dev_id: int | None = -1
 ) -> cp.cuda.ExternalStream:
    h = cs.addressof()
    return cp.cuda.ExternalStream(h, dev_id)
--- a/python/examples/throughput.py
+++ b/python/examples/throughput.py
@@ -15,7 +15,6 @@
 #  limitations under the License.

 import sys
-from collections.abc import Callable

 import cuda.nvbench as nvbench
 import numpy as np
@@ -26,7 +25,7 @@ def as_cuda_Stream(cs: nvbench.CudaStream) -> cuda.cudadrv.driver.Stream:
    return cuda.external_stream(cs.addressof())


-def make_kernel(items_per_thread: int) -> Callable:
+def make_kernel(items_per_thread: int) -> cuda.compiler.AutoJitCUDAKernel:
    @cuda.jit
    def kernel(stride: np.uintp, elements: np.uintp, in_arr, out_arr):
        tid = cuda.grid(1)