From d13a0fde32147478d19dc8eaf60f184321c7ea3b Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Wed, 6 May 2026 13:30:44 -0500
Subject: [PATCH] Correct CUDA CCCL examples per change in API (#353)

---
 .../examples/cuda_compute_segmented_reduce.py | 43 +++++++++++--------
 python/examples/cuda_coop_block_reduce.py     |  4 +-
 2 files changed, 27 insertions(+), 20 deletions(-)

diff --git a/python/examples/cuda_compute_segmented_reduce.py b/python/examples/cuda_compute_segmented_reduce.py
index d2140b9..d1b53f2 100644
--- a/python/examples/cuda_compute_segmented_reduce.py
+++ b/python/examples/cuda_compute_segmented_reduce.py
@@ -67,21 +67,26 @@ def segmented_reduce(state: bench.State):
     add_op = OpKind.PLUS
 
     alg = algorithms.make_segmented_reduce(
-        d_input, d_output, start_offsets, end_offsets, add_op, h_init
+        d_in=d_input,
+        d_out=d_output,
+        start_offsets_in=start_offsets,
+        end_offsets_in=end_offsets,
+        op=add_op,
+        h_init=h_init,
     )
 
     cccl_stream = state.get_stream()
     # query size of temporary storage and allocate
     temp_nbytes = alg(
-        None,
-        d_input,
-        d_output,
-        add_op,
-        n_rows,
-        start_offsets,
-        end_offsets,
-        h_init,
-        cccl_stream,
+        temp_storage=None,
+        d_in=d_input,
+        d_out=d_output,
+        op=add_op,
+        num_segments=n_rows,
+        start_offsets_in=start_offsets,
+        end_offsets_in=end_offsets,
+        h_init=h_init,
+        stream=cccl_stream,
     )
     h_init = np.zeros(tuple(), dtype=np.int32)
 
@@ -91,15 +96,15 @@ def segmented_reduce(state: bench.State):
     def launcher(launch: bench.Launch):
         s = launch.get_stream()
         alg(
-            temp_storage,
-            d_input,
-            d_output,
-            add_op,
-            n_rows,
-            start_offsets,
-            end_offsets,
-            h_init,
-            s,
+            temp_storage=temp_storage,
+            d_in=d_input,
+            d_out=d_output,
+            op=add_op,
+            num_segments=n_rows,
+            start_offsets_in=start_offsets,
+            end_offsets_in=end_offsets,
+            h_init=h_init,
+            stream=s,
         )
 
     state.exec(launcher)
diff --git a/python/examples/cuda_coop_block_reduce.py b/python/examples/cuda_coop_block_reduce.py
index c0f0138..349db82 100644
--- a/python/examples/cuda_coop_block_reduce.py
+++ b/python/examples/cuda_coop_block_reduce.py
@@ -58,7 +58,9 @@ def multi_block_bench(state: bench.State):
         return
 
     ring = BitsetRing()
-    block_reduce = coop.block.reduce(numba.uint64, threads_per_block, BitsetRing.add)
+    block_reduce = coop.block.make_reduce(
+        numba.uint64, threads_per_block, BitsetRing.add
+    )
 
     @cuda.jit(link=block_reduce.files)
     def kernel(inp_arr, out_arr):