Update to cccl_parallel_segmented_reduce example per changes in API

Update namespace changes. Use make_segmented_reduce factory function, and update call signatures.
2026-05-12 09:15:47 +00:00 · 2026-04-01 08:18:15 -05:00
parent d8739fc208
commit 5bdb30f4b6
1 changed files with 22 additions and 33 deletions
--- a/python/examples/cccl_parallel_segmented_reduce.py
+++ b/python/examples/cccl_parallel_segmented_reduce.py
@@ -17,36 +17,20 @@
 import sys

 import cuda.bench as bench
-import cuda.cccl.parallel.experimental.algorithms as algorithms
-import cuda.cccl.parallel.experimental.iterators as iterators
-import cuda.core.experimental as core
+import cuda.compute.algorithms as algorithms
+import cuda.compute.iterators as iterators
+import cuda.core as core
 import cupy as cp
 import numpy as np
-
-
-class CCCLStream:
-    "Class to work around https://github.com/NVIDIA/cccl/issues/5144"
-
-    def __init__(self, ptr):
-        self._ptr = ptr
-
-    def __cuda_stream__(self):
-        return (0, self._ptr)
+from cuda.compute import OpKind


 def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
    return core.Stream.from_handle(cs.addressof())


-def as_cccl_Stream(cs: bench.CudaStream) -> CCCLStream:
-    return CCCLStream(cs.addressof())
-
-
-def as_cp_ExternalStream(
-    cs: bench.CudaStream, dev_id: int | None = -1
-) -> cp.cuda.ExternalStream:
-    h = cs.addressof()
-    return cp.cuda.ExternalStream(h, dev_id)
+def as_cp_ExternalStream(cs: bench.CudaStream) -> cp.cuda.ExternalStream:
+    return cp.cuda.Stream.from_external(cs)


 def segmented_reduce(state: bench.State):
@@ -56,13 +40,8 @@ def segmented_reduce(state: bench.State):
    n_rows = n_elems // n_cols

    state.add_summary("numRows", n_rows)
-    state.collect_cupti_metrics()

-    dev_id = state.get_device()
-    cp_stream = as_cp_ExternalStream(state.get_stream(), dev_id)
-
-    def add_op(a, b):
-        return a + b
+    cp_stream = as_cp_ExternalStream(state.get_stream())

    def make_scaler(step):
        def scale(row_id):
@@ -85,15 +64,24 @@ def segmented_reduce(state: bench.State):
        d_input = mat
        d_output = cp.empty(n_rows, dtype=d_input.dtype)

-    alg = algorithms.segmented_reduce(
+    add_op = OpKind.PLUS
+
+    alg = algorithms.make_segmented_reduce(
        d_input, d_output, start_offsets, end_offsets, add_op, h_init
    )

-    cccl_stream = as_cccl_Stream(state.get_stream())
-
+    cccl_stream = state.get_stream()
    # query size of temporary storage and allocate
    temp_nbytes = alg(
-        None, d_input, d_output, n_rows, start_offsets, end_offsets, h_init, cccl_stream
+        None,
+        d_input,
+        d_output,
+        add_op,
+        n_rows,
+        start_offsets,
+        end_offsets,
+        h_init,
+        cccl_stream,
    )
    h_init = np.zeros(tuple(), dtype=np.int32)

@@ -101,11 +89,12 @@ def segmented_reduce(state: bench.State):
        temp_storage = cp.empty(temp_nbytes, dtype=cp.uint8)

    def launcher(launch: bench.Launch):
-        s = as_cccl_Stream(launch.get_stream())
+        s = launch.get_stream()
        alg(
            temp_storage,
            d_input,
            d_output,
+            add_op,
            n_rows,
            start_offsets,
            end_offsets,