diff --git a/python/examples/cccl_parallel_segmented_reduce.py b/python/examples/cccl_parallel_segmented_reduce.py index e54a77b..d2140b9 100644 --- a/python/examples/cccl_parallel_segmented_reduce.py +++ b/python/examples/cccl_parallel_segmented_reduce.py @@ -17,36 +17,20 @@ import sys import cuda.bench as bench -import cuda.cccl.parallel.experimental.algorithms as algorithms -import cuda.cccl.parallel.experimental.iterators as iterators -import cuda.core.experimental as core +import cuda.compute.algorithms as algorithms +import cuda.compute.iterators as iterators +import cuda.core as core import cupy as cp import numpy as np - - -class CCCLStream: - "Class to work around https://github.com/NVIDIA/cccl/issues/5144" - - def __init__(self, ptr): - self._ptr = ptr - - def __cuda_stream__(self): - return (0, self._ptr) +from cuda.compute import OpKind def as_core_Stream(cs: bench.CudaStream) -> core.Stream: return core.Stream.from_handle(cs.addressof()) -def as_cccl_Stream(cs: bench.CudaStream) -> CCCLStream: - return CCCLStream(cs.addressof()) - - -def as_cp_ExternalStream( - cs: bench.CudaStream, dev_id: int | None = -1 -) -> cp.cuda.ExternalStream: - h = cs.addressof() - return cp.cuda.ExternalStream(h, dev_id) +def as_cp_ExternalStream(cs: bench.CudaStream) -> cp.cuda.ExternalStream: + return cp.cuda.Stream.from_external(cs) def segmented_reduce(state: bench.State): @@ -56,13 +40,8 @@ def segmented_reduce(state: bench.State): n_rows = n_elems // n_cols state.add_summary("numRows", n_rows) - state.collect_cupti_metrics() - dev_id = state.get_device() - cp_stream = as_cp_ExternalStream(state.get_stream(), dev_id) - - def add_op(a, b): - return a + b + cp_stream = as_cp_ExternalStream(state.get_stream()) def make_scaler(step): def scale(row_id): @@ -85,15 +64,24 @@ def segmented_reduce(state: bench.State): d_input = mat d_output = cp.empty(n_rows, dtype=d_input.dtype) - alg = algorithms.segmented_reduce( + add_op = OpKind.PLUS + + alg = algorithms.make_segmented_reduce( d_input, d_output, start_offsets, end_offsets, add_op, h_init ) - cccl_stream = as_cccl_Stream(state.get_stream()) - + cccl_stream = state.get_stream() # query size of temporary storage and allocate temp_nbytes = alg( - None, d_input, d_output, n_rows, start_offsets, end_offsets, h_init, cccl_stream + None, + d_input, + d_output, + add_op, + n_rows, + start_offsets, + end_offsets, + h_init, + cccl_stream, ) h_init = np.zeros(tuple(), dtype=np.int32) @@ -101,11 +89,12 @@ def segmented_reduce(state: bench.State): temp_storage = cp.empty(temp_nbytes, dtype=cp.uint8) def launcher(launch: bench.Launch): - s = as_cccl_Stream(launch.get_stream()) + s = launch.get_stream() alg( temp_storage, d_input, d_output, + add_op, n_rows, start_offsets, end_offsets,