From d13a0fde32147478d19dc8eaf60f184321c7ea3b Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Wed, 6 May 2026 13:30:44 -0500 Subject: [PATCH] Correct cuda cccl examples per change in api (#353) --- .../examples/cuda_compute_segmented_reduce.py | 43 +++++++++++-------- python/examples/cuda_coop_block_reduce.py | 4 +- 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/python/examples/cuda_compute_segmented_reduce.py b/python/examples/cuda_compute_segmented_reduce.py index d2140b9..d1b53f2 100644 --- a/python/examples/cuda_compute_segmented_reduce.py +++ b/python/examples/cuda_compute_segmented_reduce.py @@ -67,21 +67,26 @@ def segmented_reduce(state: bench.State): add_op = OpKind.PLUS alg = algorithms.make_segmented_reduce( - d_input, d_output, start_offsets, end_offsets, add_op, h_init + d_in=d_input, + d_out=d_output, + start_offsets_in=start_offsets, + end_offsets_in=end_offsets, + op=add_op, + h_init=h_init, ) cccl_stream = state.get_stream() # query size of temporary storage and allocate temp_nbytes = alg( - None, - d_input, - d_output, - add_op, - n_rows, - start_offsets, - end_offsets, - h_init, - cccl_stream, + temp_storage=None, + d_in=d_input, + d_out=d_output, + op=add_op, + num_segments=n_rows, + start_offsets_in=start_offsets, + end_offsets_in=end_offsets, + h_init=h_init, + stream=cccl_stream, ) h_init = np.zeros(tuple(), dtype=np.int32) @@ -91,15 +96,15 @@ def segmented_reduce(state: bench.State): def launcher(launch: bench.Launch): s = launch.get_stream() alg( - temp_storage, - d_input, - d_output, - add_op, - n_rows, - start_offsets, - end_offsets, - h_init, - s, + temp_storage=temp_storage, + d_in=d_input, + d_out=d_output, + op=add_op, + num_segments=n_rows, + start_offsets_in=start_offsets, + end_offsets_in=end_offsets, + h_init=h_init, + stream=s, ) state.exec(launcher) diff --git a/python/examples/cuda_coop_block_reduce.py b/python/examples/cuda_coop_block_reduce.py index c0f0138..349db82 100644 --- a/python/examples/cuda_coop_block_reduce.py +++ b/python/examples/cuda_coop_block_reduce.py @@ -58,7 +58,9 @@ def multi_block_bench(state: bench.State): return ring = BitsetRing() - block_reduce = coop.block.reduce(numba.uint64, threads_per_block, BitsetRing.add) + block_reduce = coop.block.make_reduce( + numba.uint64, threads_per_block, BitsetRing.add + ) @cuda.jit(link=block_reduce.files) def kernel(inp_arr, out_arr):