[rocm-libraries] ROCm/rocm-libraries#4451 (commit 091bf0f)

[CK_TILE] Blockscale Gemm Fix Multi-Arch Compilation ## Motivation This PR updates CK_TILE blockscale GEMM-quant kernels and launch helpers to compile across multiple GPU architectures by introducing compile-time availability gating and a new attribute tag mechanism for kernel symbol/attribute specialization. ## Technical Details - Add an architecture-guarded `kIsAvailable` flag to the gfx950 pipeline and propagate availability handling into `QuantGemmKernel`. - Extend `make_kernel`/`kentry` to accept an `Attr` tag enabling per-kernel compile-time attributes (e.g., `no-packed-fp32-ops`) and unique symbols. - Update the blockscale GEMM quant example to pass kernel attributes and adjust gfx950 gating. ## Test Plan - CI - Local test: `cmake .. --preset dev -DGPU_TARGETS='gfx942;gfx950' -GNinja && ninja tile_example_gemm_quant` - Local test with ROCm/aiter#1954 ## Test Result  ## Submission Checklist - [x] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.
2026-04-19 22:39:03 +00:00 · 2026-02-10 12:42:19 +00:00
parent 6a6cd05dbb
commit d5acfd8d52
6 changed files with 90 additions and 57 deletions
--- a/include/ck_tile/host/kernel_launch.hpp
+++ b/include/ck_tile/host/kernel_launch.hpp
@@ -15,37 +15,57 @@

 namespace ck_tile {

-template <int MinBlockPerCu, typename Kernel, typename... Args>
-#if CK_TILE_USE_LAUNCH_BOUNDS
-__launch_bounds__(Kernel::kBlockSize, MinBlockPerCu)
-#endif
-#if defined(__HIP_DEVICE_COMPILE__) && defined(CK_TILE_EIGHTWARP_SUP)
-    __attribute__((target("no-packed-fp32-ops")))
-#endif
-    __global__ void kentry(Args... args)
+template <typename T, typename = void>
+inline constexpr bool kattr_no_packed_fp32_ops_v = false;
+template <typename T>
+inline constexpr bool
+    kattr_no_packed_fp32_ops_v<T, std::void_t<decltype(T::kattr_no_packed_fp32_ops)>> =
+        T::kattr_no_packed_fp32_ops;
+
+template <bool no_packed_fp32_ops>
+struct kernel_attr
 {
-#if defined(__HIP_DEVICE_COMPILE__)
-    Kernel{}(args...);
+    // The kernel function attribute "no-packed-fp32-ops": Disable the use of packed FP32
+    // instructions so that they can be co-executed with matrix operations
+    static constexpr bool kattr_no_packed_fp32_ops = no_packed_fp32_ops;
+};
+
+#if CK_TILE_USE_LAUNCH_BOUNDS
+#define KENTRY_LAUNCH_BOUNDS __launch_bounds__(Kernel::kBlockSize, MinBlockPerCu)
 #else
-    (..., (ignore = args, 0));
+#define KENTRY_LAUNCH_BOUNDS
 #endif
+#if defined(__HIP_DEVICE_COMPILE__)
+#define KENTRY_BODY Kernel{}(args...)
+#define KENTRY_ATTR_NO_PACKED_FP32_OPS __attribute__((target("no-packed-fp32-ops")))
+#else
+#define KENTRY_BODY (..., (ignore = args, 0))
+#define KENTRY_ATTR_NO_PACKED_FP32_OPS
+#endif
+
+template <int MinBlockPerCu, typename Kernel, typename... Args>
+KENTRY_LAUNCH_BOUNDS __global__ void kentry(Args... args)
+{
+    KENTRY_BODY;
+}
+template <typename Attr, int MinBlockPerCu, typename Kernel, typename... Args>
+KENTRY_LAUNCH_BOUNDS __global__ //
+    std::enable_if_t<!kattr_no_packed_fp32_ops_v<Attr>>
+    kentry(Args... args)
+{
+    KENTRY_BODY;
+}
+template <typename Attr, int MinBlockPerCu, typename Kernel, typename... Args>
+KENTRY_LAUNCH_BOUNDS KENTRY_ATTR_NO_PACKED_FP32_OPS __global__ //
+    std::enable_if_t<kattr_no_packed_fp32_ops_v<Attr>>
+    kentry(Args... args)
+{
+    KENTRY_BODY;
 }

-template <typename Arch, int MinBlockPerCu, typename Kernel, typename... Args>
-#if CK_TILE_USE_LAUNCH_BOUNDS
-__launch_bounds__(Kernel::kBlockSize, MinBlockPerCu)
-#endif
-#if defined(__HIP_DEVICE_COMPILE__) && defined(CK_TILE_EIGHTWARP_SUP)
-    __attribute__((target("no-packed-fp32-ops")))
-#endif
-    __global__ void kentry(Args... args)
-{
-#if defined(__HIP_DEVICE_COMPILE__)
-    Kernel{}(args...);
-#else
-    (..., (ignore = args, 0));
-#endif
-}
+#undef KENTRY_LAUNCH_BOUNDS
+#undef KENTRY_BODY
+#undef KENTRY_ATTR_NO_PACKED_FP32_OPS

 //
 // return a anonymous functor(lambda) to be called later
@@ -54,26 +74,22 @@ __launch_bounds__(Kernel::kBlockSize, MinBlockPerCu)
 //
 // the "static __device__ operator()(some_arg)" is the entry point of KernelImpl
 //
-// Arch can be used to support linking multiple object files that have the same kernel compiled for
+// Attr can be used to support linking multiple object files that have the same kernel compiled for
 // different architectures. In this case each object file has to use a different tag (gfx9_t,
-// gfx12_t etc.), so the kernel will have different symbols for each architecture.
-//
+// gfx12_t etc.), so the kernel will have different symbols for each architecture. It can also be
+// used to pass some compile-time attributes to the kernel.
 template <int MinBlockPerCu = CK_TILE_MIN_BLOCK_PER_CU,
-          typename Arch     = void,
+          typename Attr     = void,
          typename KernelImpl,
          typename... Args>
 CK_TILE_HOST auto
 make_kernel(KernelImpl /*f*/, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
 {
    const auto kernel = []() {
-        if constexpr(std::is_void_v<Arch>)
-        {
+        if constexpr(std::is_void_v<Attr>)
            return kentry<MinBlockPerCu, KernelImpl, Args...>;
-        }
        else
-        {
-            return kentry<Arch, MinBlockPerCu, KernelImpl, Args...>;
-        }
+            return kentry<Attr, MinBlockPerCu, KernelImpl, Args...>;
    }();
    return [=](const stream_config& s) {
        kernel<<<grid_dim, block_dim, lds_byte, s.stream_id_>>>(args...);