diff --git a/python/examples/cccl_parallel_segmented_reduce.py b/python/examples/cccl_parallel_segmented_reduce.py
new file mode 100644
index 0000000..b9c1b66
--- /dev/null
+++ b/python/examples/cccl_parallel_segmented_reduce.py
@@ -0,0 +1,79 @@
+import sys
+
+import cuda.cccl.parallel.experimental.algorithms as algorithms
+import cuda.cccl.parallel.experimental.iterators as iterators
+import cuda.core.experimental as core
+import cuda.nvbench as nvbench
+import cupy as cp
+import numpy as np
+
+
+def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream:
+    return core.Stream.from_handle(cs.addressof())
+
+
+def segmented_reduce(state: nvbench.State):
+    "Benchmark segmented_reduce example"
+    n_elems = state.getInt64("numElems")
+    n_cols = state.getInt64("numCols")
+    n_rows = n_elems // n_cols
+
+    state.add_summary("numRows", n_rows)
+    state.collectCUPTIMetrics()
+
+    rng = cp.random.default_rng()
+    mat = rng.integers(low=-31, high=32, dtype=np.int32, size=(n_rows, n_cols))
+
+    def add_op(a, b):
+        return a + b
+
+    def make_scaler(step):
+        def scale(row_id):
+            return row_id * step
+
+        return scale
+
+    zero = np.int32(0)
+    row_offset = make_scaler(np.int32(n_cols))
+    start_offsets = iterators.TransformIterator(
+        iterators.CountingIterator(zero), row_offset
+    )
+
+    end_offsets = start_offsets + 1
+
+    d_input = mat
+    h_init = np.zeros(tuple(), dtype=np.int32)
+    d_output = cp.empty(n_rows, dtype=d_input.dtype)
+
+    alg = algorithms.segmented_reduce(
+        d_input, d_output, start_offsets, end_offsets, add_op, h_init
+    )
+
+    # query size of temporary storage and allocate
+    temp_nbytes = alg(
+        None, d_input, d_output, n_rows, start_offsets, end_offsets, h_init
+    )
+    temp_storage = cp.empty(temp_nbytes, dtype=cp.uint8)
+
+    def launcher(launch: nvbench.Launch):
+        s = as_core_Stream(launch.getStream())
+        alg(
+            temp_storage,
+            d_input,
+            d_output,
+            n_rows,
+            start_offsets,
+            end_offsets,
+            h_init,
+            s,
+        )
+
+    state.exec(launcher)
+
+
+if __name__ == "__main__":
+    b = nvbench.register(segmented_reduce)
+    b.addInt64Axis("numElems", [2**20, 2**22, 2**24])
+    b.addInt64Axis("numCols", [1024, 2048, 4096, 8192])
+
+    nvbench.run_all_benchmarks(sys.argv)