Change tests and examples from camelCase to snake_case method names, following the implementation change

2026-04-19 22:38:52 +00:00 · 2025-07-21 12:50:28 -05:00
parent 81fff085b9
commit e589518376
9 changed files with 101 additions and 105 deletions
--- a/python/examples/auto_throughput.py
+++ b/python/examples/auto_throughput.py
@@ -41,18 +41,18 @@ def make_kernel(items_per_thread: int) -> Callable:


 def throughput_bench(state: nvbench.State) -> None:
-    stride = state.getInt64("Stride")
-    ipt = state.getInt64("ItemsPerThread")
+    stride = state.get_int64("Stride")
+    ipt = state.get_int64("ItemsPerThread")

    nbytes = 128 * 1024 * 1024
    elements = nbytes // np.dtype(np.int32).itemsize

-    alloc_stream = as_cuda_Stream(state.getStream())
+    alloc_stream = as_cuda_Stream(state.get_stream())
    inp_arr = cuda.device_array(elements, dtype=np.int32, stream=alloc_stream)
    out_arr = cuda.device_array(elements * ipt, dtype=np.int32, stream=alloc_stream)

-    state.addElementCount(elements, column_name="Elements")
-    state.collectCUPTIMetrics()
+    state.add_element_count(elements, column_name="Elements")
+    state.collect_cupti_metrics()

    threads_per_block = 256
    blocks_in_grid = (elements + threads_per_block - 1) // threads_per_block
@@ -66,7 +66,7 @@ def throughput_bench(state: nvbench.State) -> None:
    )

    def launcher(launch: nvbench.Launch):
-        exec_stream = as_cuda_Stream(launch.getStream())
+        exec_stream = as_cuda_Stream(launch.get_stream())
        krn[blocks_in_grid, threads_per_block, exec_stream, 0](
            stride, elements, inp_arr, out_arr
        )
@@ -76,7 +76,7 @@ def throughput_bench(state: nvbench.State) -> None:

 if __name__ == "__main__":
    b = nvbench.register(throughput_bench)
-    b.addInt64Axis("Stride", [1, 2, 4])
-    b.addInt64Axis("ItemsPerThread", [1, 2, 3, 4])
+    b.add_int64_axis("Stride", [1, 2, 4])
+    b.add_int64_axis("ItemsPerThread", [1, 2, 3, 4])

    nvbench.run_all_benchmarks(sys.argv)
--- a/python/examples/axes.py
+++ b/python/examples/axes.py
@@ -43,13 +43,13 @@ __global__ void sleep_kernel(double seconds) {


 def simple(state: nvbench.State):
-    state.setMinSamples(1000)
+    state.set_min_samples(1000)
    sleep_dur = 1e-3
    krn = make_sleep_kernel()
    launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0)

    def launcher(launch: nvbench.Launch):
-        s = as_core_Stream(launch.getStream())
+        s = as_core_Stream(launch.get_stream())
        core.launch(s, launch_config, krn, sleep_dur)

    state.exec(launcher)
@@ -57,12 +57,13 @@ def simple(state: nvbench.State):

 def single_float64_axis(state: nvbench.State):
    # get axis value, or default
-    sleep_dur = state.getFloat64("Duration", 3.14e-4)
+    default_sleep_dur = 3.14e-4
+    sleep_dur = state.get_float64("Duration", default_sleep_dur)
    krn = make_sleep_kernel()
    launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0)

    def launcher(launch: nvbench.Launch):
-        s = as_core_Stream(launch.getStream())
+        s = as_core_Stream(launch.get_stream())
        core.launch(s, launch_config, krn, sleep_dur)

    state.exec(launcher)
@@ -104,19 +105,19 @@ __global__ void copy_kernel(const T *in, U *out, ::cuda::std::size_t n)


 def copy_sweep_grid_shape(state: nvbench.State):
-    block_size = state.getInt64("BlockSize")
-    num_blocks = state.getInt64("NumBlocks")
+    block_size = state.get_int64("BlockSize")
+    num_blocks = state.get_int64("NumBlocks")

    # Number of int32 elements in 256MiB
    nbytes = 256 * 1024 * 1024
    num_values = nbytes // ctypes.sizeof(ctypes.c_int32(0))

-    state.addElementCount(num_values)
-    state.addGlobalMemoryReads(nbytes)
-    state.addGlobalMemoryWrites(nbytes)
+    state.add_element_count(num_values)
+    state.add_global_memory_reads(nbytes)
+    state.add_global_memory_writes(nbytes)

-    dev_id = state.getDevice()
-    alloc_s = as_core_Stream(state.getStream())
+    dev_id = state.get_device()
+    alloc_s = as_core_Stream(state.get_stream())
    input_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_s)
    output_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_s)

@@ -124,20 +125,20 @@ def copy_sweep_grid_shape(state: nvbench.State):
    launch_config = core.LaunchConfig(grid=num_blocks, block=block_size, shmem_size=0)

    def launcher(launch: nvbench.Launch):
-        s = as_core_Stream(launch.getStream())
+        s = as_core_Stream(launch.get_stream())
        core.launch(s, launch_config, krn, input_buf, output_buf, num_values)

    state.exec(launcher)


 def copy_type_sweep(state: nvbench.State):
-    type_id = state.getInt64("TypeID")
+    type_id = state.get_int64("TypeID")

    types_map = {
-        0: (ctypes.c_uint8, "::cuda::std::uint8_t"),
-        1: (ctypes.c_uint16, "::cuda::std::uint16_t"),
-        2: (ctypes.c_uint32, "::cuda::std::uint32_t"),
-        3: (ctypes.c_uint64, "::cuda::std::uint64_t"),
+        0: (ctypes.c_uint8, "cuda::std::uint8_t"),
+        1: (ctypes.c_uint16, "cuda::std::uint16_t"),
+        2: (ctypes.c_uint32, "cuda::std::uint32_t"),
+        3: (ctypes.c_uint64, "cuda::std::uint64_t"),
        4: (ctypes.c_float, "float"),
        5: (ctypes.c_double, "double"),
    }
@@ -149,12 +150,12 @@ def copy_type_sweep(state: nvbench.State):
    nbytes = 256 * 1024 * 1024
    num_values = nbytes // ctypes.sizeof(value_ctype(0))

-    state.addElementCount(num_values)
-    state.addGlobalMemoryReads(nbytes)
-    state.addGlobalMemoryWrites(nbytes)
+    state.add_element_count(num_values)
+    state.add_global_memory_reads(nbytes)
+    state.add_global_memory_writes(nbytes)

-    dev_id = state.getDevice()
-    alloc_s = as_core_Stream(state.getStream())
+    dev_id = state.get_device()
+    alloc_s = as_core_Stream(state.get_stream())
    input_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_s)
    output_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_s)

@@ -162,7 +163,7 @@ def copy_type_sweep(state: nvbench.State):
    launch_config = core.LaunchConfig(grid=256, block=256, shmem_size=0)

    def launcher(launch: nvbench.Launch):
-        s = as_core_Stream(launch.getStream())
+        s = as_core_Stream(launch.get_stream())
        core.launch(s, launch_config, krn, input_buf, output_buf, num_values)

    state.exec(launcher)
@@ -175,13 +176,15 @@ if __name__ == "__main__":
    # benchmark with no axes, that uses default value
    nvbench.register(default_value)
    # specify axis
-    nvbench.register(single_float64_axis).addFloat64Axis("Duration", [7e-5, 1e-4, 5e-4])
+    nvbench.register(single_float64_axis).add_float64_axis(
+        "Duration", [7e-5, 1e-4, 5e-4]
+    )

    copy1_bench = nvbench.register(copy_sweep_grid_shape)
-    copy1_bench.addInt64Axis("BlockSize", [2**x for x in range(6, 10, 2)])
-    copy1_bench.addInt64Axis("NumBlocks", [2**x for x in range(6, 10, 2)])
+    copy1_bench.add_int64_axis("BlockSize", [2**x for x in range(6, 10, 2)])
+    copy1_bench.add_int64_axis("NumBlocks", [2**x for x in range(6, 10, 2)])

    copy2_bench = nvbench.register(copy_type_sweep)
-    copy2_bench.addInt64Axis("TypeID", range(0, 6))
+    copy2_bench.add_int64_axis("TypeID", range(0, 6))

    nvbench.run_all_benchmarks(sys.argv)
--- a/python/examples/cccl_parallel_segmented_reduce.py
+++ b/python/examples/cccl_parallel_segmented_reduce.py
@@ -35,15 +35,15 @@ def as_cp_ExternalStream(

 def segmented_reduce(state: nvbench.State):
    "Benchmark segmented_reduce example"
-    n_elems = state.getInt64("numElems")
-    n_cols = state.getInt64("numCols")
+    n_elems = state.get_int64("numElems")
+    n_cols = state.get_int64("numCols")
    n_rows = n_elems // n_cols

    state.add_summary("numRows", n_rows)
-    state.collectCUPTIMetrics()
+    state.collect_cupti_metrics()

-    dev_id = state.getDevice()
-    cp_stream = as_cp_ExternalStream(state.getStream(), dev_id)
+    dev_id = state.get_device()
+    cp_stream = as_cp_ExternalStream(state.get_stream(), dev_id)

    with cp_stream:
        rng = cp.random.default_rng()
@@ -75,20 +75,19 @@ def segmented_reduce(state: nvbench.State):
        d_input, d_output, start_offsets, end_offsets, add_op, h_init
    )

-    # print(1)
-    cccl_stream = as_cccl_Stream(state.getStream())
-    # print(2, core_stream, core_stream.__cuda_stream__())
+    cccl_stream = as_cccl_Stream(state.get_stream())
+
    # query size of temporary storage and allocate
    temp_nbytes = alg(
        None, d_input, d_output, n_rows, start_offsets, end_offsets, h_init, cccl_stream
    )
    h_init = np.zeros(tuple(), dtype=np.int32)
-    # print(3)
+
    with cp_stream:
        temp_storage = cp.empty(temp_nbytes, dtype=cp.uint8)

    def launcher(launch: nvbench.Launch):
-        s = as_cccl_Stream(launch.getStream())
+        s = as_cccl_Stream(launch.get_stream())
        alg(
            temp_storage,
            d_input,
@@ -105,7 +104,7 @@ def segmented_reduce(state: nvbench.State):

 if __name__ == "__main__":
    b = nvbench.register(segmented_reduce)
-    b.addInt64Axis("numElems", [2**20, 2**22, 2**24])
-    b.addInt64Axis("numCols", [1024, 2048, 4096, 8192])
+    b.add_int64_axis("numElems", [2**20, 2**22, 2**24])
+    b.add_int64_axis("numCols", [1024, 2048, 4096, 8192])

    nvbench.run_all_benchmarks(sys.argv)
--- a/python/examples/cpu_only.py
+++ b/python/examples/cpu_only.py
@@ -13,6 +13,6 @@ def throughput_bench(state: nvbench.State) -> None:

 if __name__ == "__main__":
    b = nvbench.register(throughput_bench)
-    b.setIsCPUOnly(True)
+    b.set_is_cpu_only(True)

    nvbench.run_all_benchmarks(sys.argv)
--- a/python/examples/cupy_extract.py
+++ b/python/examples/cupy_extract.py
@@ -12,18 +12,18 @@ def as_cp_ExternalStream(


 def cupy_extract_by_mask(state: nvbench.State):
-    n_cols = state.getInt64("numCols")
-    n_rows = state.getInt64("numRows")
+    n_cols = state.get_int64("numCols")
+    n_rows = state.get_int64("numRows")

-    dev_id = state.getDevice()
-    cp_s = as_cp_ExternalStream(state.getStream(), dev_id)
+    dev_id = state.get_device()
+    cp_s = as_cp_ExternalStream(state.get_stream(), dev_id)

-    state.collectCUPTIMetrics()
-    state.addElementCount(n_rows * n_cols, "# Elements")
-    state.addGlobalMemoryReads(
+    state.collect_cupti_metrics()
+    state.add_element_count(n_rows * n_cols, "# Elements")
+    state.add_global_memory_reads(
        n_rows * n_cols * (cp.dtype(cp.int32).itemsize + cp.dtype("?").itemsize)
    )
-    state.addGlobalMemoryWrites(n_rows * n_cols * (cp.dtype(cp.int32).itemsize))
+    state.add_global_memory_writes(n_rows * n_cols * (cp.dtype(cp.int32).itemsize))

    with cp_s:
        X = cp.full((n_cols, n_rows), fill_value=3, dtype=cp.int32)
@@ -31,7 +31,7 @@ def cupy_extract_by_mask(state: nvbench.State):
        _ = X[mask]

    def launcher(launch: nvbench.Launch):
-        with as_cp_ExternalStream(launch.getStream(), dev_id):
+        with as_cp_ExternalStream(launch.get_stream(), dev_id):
            _ = X[mask]

    state.exec(launcher, sync=True)
@@ -39,7 +39,7 @@ def cupy_extract_by_mask(state: nvbench.State):

 if __name__ == "__main__":
    b = nvbench.register(cupy_extract_by_mask)
-    b.addInt64Axis("numCols", [1024, 2048, 4096, 2 * 4096])
-    b.addInt64Axis("numRows", [1024, 2048, 4096, 2 * 4096])
+    b.add_int64_axis("numCols", [1024, 2048, 4096, 2 * 4096])
+    b.add_int64_axis("numRows", [1024, 2048, 4096, 2 * 4096])

    nvbench.run_all_benchmarks(sys.argv)
--- a/python/examples/exec_tag_sync.py
+++ b/python/examples/exec_tag_sync.py
@@ -45,17 +45,17 @@ def synchronizing_bench(state: nvbench.State):
    n_values = 64 * 1024 * 1024
    n_bytes = n_values * ctypes.sizeof(ctypes.c_int32(0))

-    alloc_s = as_core_Stream(state.getStream())
-    buffer = core.DeviceMemoryResource(state.getDevice()).allocate(n_bytes, alloc_s)
+    alloc_s = as_core_Stream(state.get_stream())
+    buffer = core.DeviceMemoryResource(state.get_device()).allocate(n_bytes, alloc_s)

-    state.addElementCount(n_values, "Items")
-    state.addGlobalMemoryWrites(n_bytes, "Size")
+    state.add_element_count(n_values, "Items")
+    state.add_global_memory_writes(n_bytes, "Size")

    krn = make_fill_kernel()
    launch_config = core.LaunchConfig(grid=256, block=256, shmem_size=0)

    def launcher(launch: nvbench.Launch):
-        s = as_core_Stream(launch.getStream())
+        s = as_core_Stream(launch.get_stream())
        core.launch(s, launch_config, krn, buffer, 0, n_values)
        s.sync()

--- a/python/examples/skip.py
+++ b/python/examples/skip.py
@@ -42,8 +42,8 @@ __global__ void sleep_kernel(double seconds) {


 def runtime_skip(state: nvbench.State):
-    duration = state.getFloat64("Duration")
-    kramble = state.getString("Kramble")
+    duration = state.get_float64("Duration")
+    kramble = state.get_string("Kramble")

    # Skip Baz benchmarks with 0.8 ms duration
    if kramble == "Baz" and duration < 0.8e-3:
@@ -59,7 +59,7 @@ def runtime_skip(state: nvbench.State):
    launch_cfg = core.LaunchConfig(grid=1, block=1, shmem_size=0)

    def launcher(launch: nvbench.Launch):
-        s = as_core_Stream(launch.getStream())
+        s = as_core_Stream(launch.get_stream())
        core.launch(s, launch_cfg, krn, duration)

    state.exec(launcher)
@@ -67,7 +67,7 @@ def runtime_skip(state: nvbench.State):

 if __name__ == "__main__":
    b = nvbench.register(runtime_skip)
-    b.addFloat64Axis("Duration", [1e-4 + k * 0.25e-3 for k in range(5)])
-    b.addStringAxis("Kramble", ["Foo", "Bar", "Baz"])
+    b.add_float64_axis("Duration", [1e-4 + k * 0.25e-3 for k in range(5)])
+    b.add_string_axis("Kramble", ["Foo", "Bar", "Baz"])

    nvbench.run_all_benchmarks(sys.argv)
--- a/python/examples/throughput.py
+++ b/python/examples/throughput.py
@@ -41,33 +41,27 @@ def make_kernel(items_per_thread: int) -> Callable:


 def throughput_bench(state: nvbench.State) -> None:
-    stride = state.getInt64("Stride")
-    ipt = state.getInt64("ItemsPerThread")
+    stride = state.get_int64("Stride")
+    ipt = state.get_int64("ItemsPerThread")

    nbytes = 128 * 1024 * 1024
    elements = nbytes // np.dtype(np.int32).itemsize

-    alloc_stream = as_cuda_Stream(state.getStream())
+    alloc_stream = as_cuda_Stream(state.get_stream())
    inp_arr = cuda.device_array(elements, dtype=np.int32, stream=alloc_stream)
    out_arr = cuda.device_array(elements * ipt, dtype=np.int32, stream=alloc_stream)

-    state.addElementCount(elements, column_name="Elements")
-    state.addGlobalMemoryReads(inp_arr.nbytes, column_name="Datasize")
-    state.addGlobalMemoryWrites(inp_arr.nbytes)
+    state.add_element_count(elements, column_name="Elements")
+    state.add_global_memory_reads(inp_arr.nbytes, column_name="Datasize")
+    state.add_global_memory_writes(inp_arr.nbytes)

    threads_per_block = 256
    blocks_in_grid = (elements + threads_per_block - 1) // threads_per_block

    krn = make_kernel(ipt)

-    # warm-up call ensures that kernel is loaded into context
-    # before blocking kernel is launched
-    krn[blocks_in_grid, threads_per_block, alloc_stream, 0](
-        stride, elements, inp_arr, out_arr
-    )
-
    def launcher(launch: nvbench.Launch):
-        exec_stream = as_cuda_Stream(launch.getStream())
+        exec_stream = as_cuda_Stream(launch.get_stream())
        krn[blocks_in_grid, threads_per_block, exec_stream, 0](
            stride, elements, inp_arr, out_arr
        )
@@ -77,7 +71,7 @@ def throughput_bench(state: nvbench.State) -> None:

 if __name__ == "__main__":
    b = nvbench.register(throughput_bench)
-    b.addInt64Axis("Stride", [1, 2, 4])
-    b.addInt64Axis("ItemsPerThread", [1, 2, 3, 4])
+    b.add_int64_axis("Stride", [1, 2, 4])
+    b.add_int64_axis("ItemsPerThread", [1, 2, 3, 4])

    nvbench.run_all_benchmarks(sys.argv)
--- a/python/test/run_1.py
+++ b/python/test/run_1.py
@@ -14,18 +14,18 @@ def kernel(a, b, c):
        c[tid] = a[tid] + b[tid]


-def getNumbaStream(launch):
-    return cuda.external_stream(launch.getStream().addressof())
+def get_numba_stream(launch):
+    return cuda.external_stream(launch.get_stream().addressof())


 def add_two(state):
    # state.skip("Skipping this benchmark for no reason")
-    N = state.getInt64("elements")
+    N = state.get_int64("elements")
    a = cuda.to_device(np.random.random(N))
    c = cuda.device_array_like(a)

-    state.addGlobalMemoryReads(a.nbytes)
-    state.addGlobalMemoryWrites(c.nbytes)
+    state.add_global_memory_reads(a.nbytes)
+    state.add_global_memory_writes(c.nbytes)

    nthreads = 256
    nblocks = (len(a) + nthreads - 1) // nthreads
@@ -35,22 +35,22 @@ def add_two(state):
    cuda.synchronize()

    def kernel_launcher(launch):
-        stream = getNumbaStream(launch)
+        stream = get_numba_stream(launch)
        kernel[nblocks, nthreads, stream](a, a, c)

    state.exec(kernel_launcher, batched=True, sync=True)


 def add_float(state):
-    N = state.getInt64("elements")
-    v = state.getFloat64("v")
-    name = state.getString("name")
+    N = state.get_int64("elements")
+    v = state.get_float64("v")
+    name = state.get_string("name")
    a = cuda.to_device(np.random.random(N).astype(np.float32))
    b = cuda.to_device(np.random.random(N).astype(np.float32))
    c = cuda.device_array_like(a)

-    state.addGlobalMemoryReads(a.nbytes + b.nbytes)
-    state.addGlobalMemoryWrites(c.nbytes)
+    state.add_global_memory_reads(a.nbytes + b.nbytes)
+    state.add_global_memory_writes(c.nbytes)

    nthreads = 64
    nblocks = (len(a) + nthreads - 1) // nthreads
@@ -58,26 +58,26 @@ def add_float(state):
    def kernel_launcher(launch):
        _ = v
        _ = name
-        stream = getNumbaStream(launch)
+        stream = get_numba_stream(launch)
        kernel[nblocks, nthreads, stream](a, b, c)

    state.exec(kernel_launcher, batched=True, sync=True)


 def add_three(state):
-    N = state.getInt64("elements")
+    N = state.get_int64("elements")
    a = cuda.to_device(np.random.random(N).astype(np.float32))
    b = cuda.to_device(np.random.random(N).astype(np.float32))
    c = cuda.device_array_like(a)

-    state.addGlobalMemoryReads(a.nbytes + b.nbytes)
-    state.addGlobalMemoryWrites(c.nbytes)
+    state.add_global_memory_reads(a.nbytes + b.nbytes)
+    state.add_global_memory_writes(c.nbytes)

    nthreads = 256
    nblocks = (len(a) + nthreads - 1) // nthreads

    def kernel_launcher(launch):
-        stream = getNumbaStream(launch)
+        stream = get_numba_stream(launch)
        kernel[nblocks, nthreads, stream](a, b, c)

    state.exec(kernel_launcher, batched=True, sync=True)
@@ -86,18 +86,18 @@ def add_three(state):

 def register_benchmarks():
    (
-        nvbench.register(add_two).addInt64Axis(
+        nvbench.register(add_two).add_int64_axis(
            "elements", [2**pow2 for pow2 in range(20, 23)]
        )
    )
    (
        nvbench.register(add_float)
-        .addFloat64Axis("v", [0.1, 0.3])
-        .addStringAxis("name", ["Anne", "Lynda"])
-        .addInt64Axis("elements", [2**pow2 for pow2 in range(20, 23)])
+        .add_float64_axis("v", [0.1, 0.3])
+        .add_string_axis("name", ["Anne", "Lynda"])
+        .add_int64_axis("elements", [2**pow2 for pow2 in range(20, 23)])
    )
    (
-        nvbench.register(add_three).addInt64Axis(
+        nvbench.register(add_three).add_int64_axis(
            "elements", [2**pow2 for pow2 in range(20, 22)]
        )
    )