Correct CUDA CCCL examples per change in API (#353)

2026-05-13 01:35:40 +00:00 · 2026-05-06 13:30:44 -05:00
parent f392725015
commit d13a0fde32
2 changed files with 27 additions and 20 deletions
--- a/python/examples/cuda_compute_segmented_reduce.py
+++ b/python/examples/cuda_compute_segmented_reduce.py
@@ -67,21 +67,26 @@ def segmented_reduce(state: bench.State):
    add_op = OpKind.PLUS

    alg = algorithms.make_segmented_reduce(
-        d_input, d_output, start_offsets, end_offsets, add_op, h_init
+        d_in=d_input,
+        d_out=d_output,
+        start_offsets_in=start_offsets,
+        end_offsets_in=end_offsets,
+        op=add_op,
+        h_init=h_init,
    )

    cccl_stream = state.get_stream()
    # query size of temporary storage and allocate
    temp_nbytes = alg(
-        None,
-        d_input,
-        d_output,
-        add_op,
-        n_rows,
-        start_offsets,
-        end_offsets,
-        h_init,
-        cccl_stream,
+        temp_storage=None,
+        d_in=d_input,
+        d_out=d_output,
+        op=add_op,
+        num_segments=n_rows,
+        start_offsets_in=start_offsets,
+        end_offsets_in=end_offsets,
+        h_init=h_init,
+        stream=cccl_stream,
    )
    h_init = np.zeros(tuple(), dtype=np.int32)

@@ -91,15 +96,15 @@ def segmented_reduce(state: bench.State):
    def launcher(launch: bench.Launch):
        s = launch.get_stream()
        alg(
-            temp_storage,
-            d_input,
-            d_output,
-            add_op,
-            n_rows,
-            start_offsets,
-            end_offsets,
-            h_init,
-            s,
+            temp_storage=temp_storage,
+            d_in=d_input,
+            d_out=d_output,
+            op=add_op,
+            num_segments=n_rows,
+            start_offsets_in=start_offsets,
+            end_offsets_in=end_offsets,
+            h_init=h_init,
+            stream=s,
        )

    state.exec(launcher)
--- a/python/examples/cuda_coop_block_reduce.py
+++ b/python/examples/cuda_coop_block_reduce.py
@@ -58,7 +58,9 @@ def multi_block_bench(state: bench.State):
        return

    ring = BitsetRing()
-    block_reduce = coop.block.reduce(numba.uint64, threads_per_block, BitsetRing.add)
+    block_reduce = coop.block.make_reduce(
+        numba.uint64, threads_per_block, BitsetRing.add
+    )

    @cuda.jit(link=block_reduce.files)
    def kernel(inp_arr, out_arr):