remove cute dsl pdl example.

2026-04-20 14:59:01 +00:00 · 2025-11-09 21:47:00 -08:00
parent 2252254ce2
commit 06b6bd7d7b
2 changed files with 0 additions and 382 deletions
--- a/examples/python/CuTeDSL/blackwell/programmatic_dependent_launch.py
+++ b/examples/python/CuTeDSL/blackwell/programmatic_dependent_launch.py
@@ -1,381 +0,0 @@
-# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-
-# 1. Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-import argparse
-import cuda.bindings.driver as cuda
-
-import torch
-
-import cutlass
-import cutlass.cute as cute
-import cutlass.cute.testing as testing
-from cutlass.cute.runtime import from_dlpack
-
-
-def supports_pdl():
-    return torch.cuda.get_device_capability()[0] >= 9
-
-
-"""
-This example demonstrates the use of Programmatic Dependent Launch (PDL) using
-CuTe DSL.
-
-PDL is a mechanism which allows for overlapping execution of back-to-back kernels
-within the same stream.
-For example, consider the following two elementwise add operations, where the second
-operation's first operand is the result of the first operation. While performing
-``w = u + v`` we will load u and v, add them, and then store the result. Once we
-have finished loading data, we are no longer utilizing the read bandwidth.
-To effectively utilize the read bandwidth, we can start loading ``x``
-immediately upon finishing reading. This is what PDL enables us to do.
-
-.. code-block:: bash
-
-w = u + v
-y = w + x
-
-To enable PDL, we need to do two things:
-
-1. Insert the ``griddepcontrol.launch_dependents`` and ``griddepcontrol.wait`` instructions in the kernel.
-2. Set the PDL launch attribute when launching the kernel.
-
-The ``griddepcontrol.launch_dependents`` and ``griddepcontrol.wait``
-instructions enable fine-grained control over kernel execution in PDL.
-Once all thread blocks execute the ``griddepcontrol.launch_dependents``
-instruction, the dependent kernels can opportunistically be early-launched.
-``griddepcontrol.wait`` functions as a synchronization barrier - any warp
-executing this instruction will block until the previous kernel finishes
-execution. This allows precise control over data dependencies between kernels.
-
-The following diagram shows the overlapping execution of two dependent kernels.
-We call the instructions before ``griddepcontrol.wait`` as prologue (``P0``),
-which may include barrier initialization and loading of independent data, etc.
-We call the instructions after ``griddepcontrol.launch_dependents`` as epilogue
-(``P2``), which may include math operations, data stores, etc. PDL enables
-these prologue and epilogue phases to execute concurrently across dependent
-kernels, improving GPU resource utilization. This is particularly beneficial
-when prologue and epilogue are bound by different resources (e.g., memory
-bandwidth vs compute throughput).
-
-      # P0: Prologue, P1: Main compute, P2: Epilogue
-
-         P0    P1    P2
-  K1: |=====|+++++|-----|
-
-                  <-----> K2 can start early
-                         (K1's P2 overlaps with K2's P0)
-
-                     P0        P1    P2
-  K2:             |=====|   |+++++|-----|
-                          ^
-                          |
-                          wait for K1 to complete
-Time ------------------------------------------------------>
-
-We could run this example with and without PDL:
-
-.. code-block:: bash
-
-    python examples/blackwell/programmatic_dependent_launch.py --benchmark
-    python examples/blackwell/programmatic_dependent_launch.py --benchmark --use_pdl
-
-From the benchmark results, you can see some speedups for the PDL version in most cases, benefiting from
-the overlapping execution of consecutive kernels. Moreover, you can use nsys to observe the overlapping execution.
-
-.. code-block:: bash
-
-    nsys profile python examples/blackwell/programmatic_dependent_launch.py --benchmark --use_pdl
-
-Note, PDL feature is supported on Hopper and later GPUs.
-
-See [the programming guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#programmatic-dependent-launch-and-synchronization)
-and the [PTX documentation](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-griddepcontrol)
-for more details.
-"""
-
-
-@cute.kernel
-def elementwise_add_kernel(
-    gA: cute.Tensor,
-    gB: cute.Tensor,
-    gC: cute.Tensor,
-    cC: cute.Tensor,  # coordinate tensor
-    shape: cute.Shape,
-    thr_layout: cute.Layout,
-    val_layout: cute.Layout,
-    use_pdl: cutlass.Constexpr = True,
-    is_first_kernel: cutlass.Constexpr = True,
-):
-    tidx, _, _ = cute.arch.thread_idx()
-    bidx, _, _ = cute.arch.block_idx()
-
-    blk_coord = ((None, None), bidx)
-    blkA = gA[blk_coord]  # (TileM,TileN)
-    blkB = gB[blk_coord]  # (TileM,TileN)
-    blkC = gC[blk_coord]  # (TileM,TileN)
-    blkCrd = cC[blk_coord]  # (TileM, TileN)
-
-    copy_atom_load = cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), gA.element_type)
-    copy_atom_store = cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), gC.element_type)
-
-    tiled_copy_A = cute.make_tiled_copy_tv(copy_atom_load, thr_layout, val_layout)
-    tiled_copy_B = cute.make_tiled_copy_tv(copy_atom_load, thr_layout, val_layout)
-    tiled_copy_C = cute.make_tiled_copy_tv(copy_atom_store, thr_layout, val_layout)
-
-    thr_copy_A = tiled_copy_A.get_slice(tidx)
-    thr_copy_B = tiled_copy_B.get_slice(tidx)
-    thr_copy_C = tiled_copy_C.get_slice(tidx)
-
-    thrA = thr_copy_A.partition_S(blkA)
-    thrB = thr_copy_B.partition_S(blkB)
-    thrC = thr_copy_C.partition_S(blkC)
-
-    frgA = cute.make_fragment_like(thrA)
-    frgB = cute.make_fragment_like(thrB)
-    frgC = cute.make_fragment_like(thrC)
-
-    thrCrd = thr_copy_C.partition_S(blkCrd)
-    frgPred = cute.make_rmem_tensor(thrCrd.shape, cutlass.Boolean)
-
-    for i in range(cute.size(frgPred)):
-        val = cute.elem_less(thrCrd[i], shape)
-        frgPred[i] = val
-
-    # Note: when not using cuda-graph, the kernel execution may be blocked by the host overhead.
-    # In this case we won't see overlapping even when pdl is enabled.
-    # In this example, we add a loop (10 times) for all the copy and compute operations in the following code
-    # to make kernel running longer and make pdl benefits observable for both cuda-graph enabled and disabled cases.
-    if not use_pdl:
-        for _ in range(10):
-            cute.copy(copy_atom_load, thrA, frgA, pred=frgPred)
-            cute.copy(copy_atom_load, thrB, frgB, pred=frgPred)
-    else:
-        if is_first_kernel:
-            for _ in range(10):
-                cute.copy(copy_atom_load, thrA, frgA, pred=frgPred)
-                cute.copy(copy_atom_load, thrB, frgB, pred=frgPred)
-            # Here we add the launch dependents instruction for the first kernel as a hint to the runtime to early-launch
-            # the next kernel. If the next kernel becomes concurrent, we will have overlap where the second kernel
-            # can start reading x to ensure an E2E speedup. Note the placement of launch dependents has no implication
-            # on correctness, only performance.
-            cute.arch.griddepcontrol_launch_dependents()
-        else:
-            # In this example, the second kernel's second operand ``gB`` has no dependencies, its loading can overlap
-            # with the computation of ``gC`` from the first kernel.
-            for _ in range(10):
-                cute.copy(copy_atom_load, thrB, frgB, pred=frgPred)
-
-            # For the second kernel, its first operand ``gA`` is dependent on the previous kernel, we must call
-            # griddepcontrol.wait to assure correctness. This instruction will block until the prior kernels finishes
-            # and its memory operations are visible. Since gA is written by the prior kernel, this will block until gA
-            # is visible to our kernel. Without it, we would have undefined behavior due to a race condition.
-            cute.arch.griddepcontrol_wait()
-
-            for _ in range(10):
-                cute.copy(copy_atom_load, thrA, frgA, pred=frgPred)
-
-    for _ in range(10):
-        result = frgA.load() + frgB.load()
-        frgC.store(result)
-        cute.copy(copy_atom_store, frgC, thrC, pred=frgPred)
-
-
-@cute.jit
-def elementwise_add(
-    mA,
-    mB,
-    mC,
-    stream: cuda.CUstream,
-    use_pdl: cutlass.Constexpr = True,
-    is_first_kernel: cutlass.Constexpr = True,
-):
-    dtype = mA.element_type
-    # copy_bits for a thread is 128 bits, and we use 128 // dtype.width to get the vector size
-    vector_size = 128 // dtype.width
-
-    thr_layout = cute.make_ordered_layout((4, 32), order=(1, 0))
-    val_layout = cute.make_ordered_layout((4, vector_size), order=(1, 0))
-    tiler_mn, tv_layout = cute.make_layout_tv(thr_layout, val_layout)
-
-    gA = cute.zipped_divide(mA, tiler_mn)  # ((TileM,TileN),(RestM,RestN))
-    gB = cute.zipped_divide(mB, tiler_mn)  # ((TileM,TileN),(RestM,RestN))
-    gC = cute.zipped_divide(mC, tiler_mn)  # ((TileM,TileN),(RestM,RestN))
-
-    idC = cute.make_identity_tensor(mC.shape)
-    cC = cute.zipped_divide(idC, tiler=tiler_mn)
-
-    elementwise_add_kernel(
-        gA, gB, gC, cC, mC.shape, thr_layout, val_layout, use_pdl, is_first_kernel
-    ).launch(
-        grid=[cute.size(gC, mode=[1]), 1, 1],
-        block=[cute.size(tv_layout, mode=[0]), 1, 1],
-        # set cluster to enable cuLaunchKernelEx API for additional launch attributes setting
-        cluster=(1, 1, 1),
-        stream=stream,
-        # Currently, pdl launch attribute is set in compile phase,
-        # so we need to recompile the function if we change the value of use_pdl for multiple runs.
-        use_pdl=use_pdl,
-    )
-
-
-def run_pdl_example(
-    M,
-    N,
-    skip_ref_check=False,
-    benchmark=True,
-    warmup_iterations=5,
-    iterations=10,
-    use_pdl=True,
-):
-    if not torch.cuda.is_available():
-        raise RuntimeError("Blackwell/Hopper GPU is required to run this example!")
-
-    print("\nRunning Elementwise Add test with:")
-    print(f"Tensor dimensions: [{M}, {N}]")
-    print(f"Use PDL: {use_pdl}")
-
-    u = torch.randn(M, N, dtype=torch.float32, device="cuda")
-    v = torch.randn(M, N, dtype=torch.float32, device="cuda")
-    w = torch.randn(M, N, dtype=torch.float32, device="cuda")
-    x = torch.randn(M, N, dtype=torch.float32, device="cuda")
-    y = torch.empty(M, N, dtype=torch.float32, device="cuda")
-
-    u_tensor = from_dlpack(u).mark_layout_dynamic()
-    v_tensor = from_dlpack(v).mark_layout_dynamic()
-    w_tensor = from_dlpack(w).mark_layout_dynamic()
-    x_tensor = from_dlpack(x).mark_layout_dynamic()
-    y_tensor = from_dlpack(y).mark_layout_dynamic()
-
-    stream = torch.cuda.Stream()
-    current_stream = cuda.CUstream(stream.cuda_stream)
-    # Since use_pdl and is_first_kernel are cutlass.Constexpr, we need to compile for
-    # the first and second kernel separately.
-    compiled_func_first_kernel = cute.compile(
-        elementwise_add,
-        u_tensor,
-        v_tensor,
-        w_tensor,
-        current_stream,
-        use_pdl,
-        is_first_kernel=True,
-    )
-    compiled_func_second_kernel = cute.compile(
-        elementwise_add,
-        w_tensor,
-        x_tensor,
-        y_tensor,
-        current_stream,
-        use_pdl,
-        is_first_kernel=False,
-    )
-
-    # launch and run the two consecutive kernels in a same stream.
-    # Here, we simply use default stream.
-    def run_func(current_stream, u_tensor, v_tensor, w_tensor, x_tensor, y_tensor):
-        # Run first operation: w_tensor = u_tensor + v_tensor
-        compiled_func_first_kernel(
-            u_tensor,
-            v_tensor,
-            w_tensor,
-            current_stream,
-        )
-        # Run second operation: y_tensor = w_tensor + x_tensor
-        # its first operand ``w_tensor`` is the result of the first operation,
-        # they use the same memory space.
-        compiled_func_second_kernel(
-            w_tensor,
-            x_tensor,
-            y_tensor,
-            current_stream,
-        )
-
-    if not skip_ref_check:
-        run_func(current_stream, u_tensor, v_tensor, w_tensor, x_tensor, y_tensor)
-        print("Verifying results...")
-        torch.testing.assert_close(u.cpu() + v.cpu() + x.cpu(), y.cpu())
-        print("Results verified successfully!")
-
-    if not benchmark:
-        return
-
-    def generate_kernel_arguments():
-        u = torch.randn(M, N, dtype=torch.float32, device="cuda")
-        v = torch.randn(M, N, dtype=torch.float32, device="cuda")
-        w = torch.randn(M, N, dtype=torch.float32, device="cuda")
-        x = torch.randn(M, N, dtype=torch.float32, device="cuda")
-        y = torch.empty(M, N, dtype=torch.float32, device="cuda")
-
-        u_tensor = from_dlpack(u).mark_layout_dynamic()
-        v_tensor = from_dlpack(v).mark_layout_dynamic()
-        w_tensor = from_dlpack(w).mark_layout_dynamic()
-        x_tensor = from_dlpack(x).mark_layout_dynamic()
-        y_tensor = from_dlpack(y).mark_layout_dynamic()
-        return testing.JitArguments(
-            current_stream, u_tensor, v_tensor, w_tensor, x_tensor, y_tensor
-        )
-
-    avg_time_us = testing.benchmark(
-        run_func,
-        workspace_generator=generate_kernel_arguments,
-        workspace_count=10,
-        warmup_iterations=warmup_iterations,
-        iterations=iterations,
-        stream=current_stream,
-    )
-    print(f"Execution time: {avg_time_us:.4f} us")
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="example of Programmatic Dependent Launch (PDL) using CuTe DSL"
-    )
-    parser.add_argument("--M", default=512, type=int)
-    parser.add_argument("--N", default=512, type=int)
-    parser.add_argument("--warmup_iterations", default=3, type=int)
-    parser.add_argument("--iterations", default=10, type=int)
-    parser.add_argument("--skip_ref_check", action="store_true")
-    parser.add_argument("--benchmark", action="store_true")
-    parser.add_argument("--use_pdl", action="store_true")
-
-    args = parser.parse_args()
-    if supports_pdl():
-        run_pdl_example(
-            args.M,
-            args.N,
-            skip_ref_check=args.skip_ref_check,
-            benchmark=args.benchmark,
-            warmup_iterations=args.warmup_iterations,
-            iterations=args.iterations,
-            use_pdl=args.use_pdl,
-        )
-        print("\nPASS")
-    else:
-        print(
-            "PDL is not supported on this device, it requires Hopper or newer generations"
-        )