[rocm-libraries] ROCm/rocm-libraries#5241 (commit 43daeac)

Changed the include order of the new WMMA/MFMA unification framework (#5241)

These changes fix the include order and make the header files independent of one another. The `remod.py` script was also run, which changed the `grouped_convolution.hpp` and `core.hpp` files.

## Motivation

Some headers appear to depend on include order. For example, moving `#include "wmma/wmma.hpp"` later in the include list of [amdgcn_mma.hpp](https://github.com/ROCm/rocm-libraries/blob/develop/projects/composablekernel/include/ck_tile/core/arch/mma/amdgcn_mma.hpp) causes compilation errors. In addition, the pre-commit script `remod.py` reorders includes alphabetically, which also causes compilation issues.

Expected behaviour: headers should be independent of one another; no header should require another to be included first. Each header should compile correctly on its own.

## Test Plan

The CI (which runs `remod.py`) should compile.

## Test Result

Existing CI should compile and be green.

## Submission Checklist

- [x] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.
2026-04-19 22:39:03 +00:00 · 2026-03-12 08:27:49 +00:00
parent 1a4aa7fd89
commit a1679e38ee
13 changed files with 140 additions and 115 deletions
--- a/include/ck_tile/core.hpp
+++ b/include/ck_tile/core.hpp
@@ -20,9 +20,18 @@
 #include "ck_tile/core/arch/mma/mfma/mfma_traits.hpp"
 #include "ck_tile/core/arch/mma/mfma/mfma_transforms.hpp"
 #include "ck_tile/core/arch/mma/mma.hpp"
+#include "ck_tile/core/arch/mma/mma_op_family.hpp"
 #include "ck_tile/core/arch/mma/mma_selector.hpp"
 #include "ck_tile/core/arch/mma/mma_traits.hpp"
 #include "ck_tile/core/arch/mma/mma_transforms.hpp"
+#include "ck_tile/core/arch/mma/sparse/mfma/selector.hpp"
+#include "ck_tile/core/arch/mma/sparse/mfma/sparse_gfx9.hpp"
+#include "ck_tile/core/arch/mma/sparse/sparse.hpp"
+#include "ck_tile/core/arch/mma/sparse/sparse_selector.hpp"
+#include "ck_tile/core/arch/mma/sparse/sparse_traits.hpp"
+#include "ck_tile/core/arch/mma/sparse/sparse_transforms.hpp"
+#include "ck_tile/core/arch/mma/sparse/wmma/selector.hpp"
+#include "ck_tile/core/arch/mma/sparse/wmma/sparse_gfx12.hpp"
 #include "ck_tile/core/arch/mma/utility/tile_distribution_encoding_register_mapper.hpp"
 #include "ck_tile/core/arch/mma/wmma/wmma.hpp"
 #include "ck_tile/core/arch/mma/wmma/wmma_gfx11.hpp"
--- a/include/ck_tile/core/arch/mma/amdgcn_mma.hpp
+++ b/include/ck_tile/core/arch/mma/amdgcn_mma.hpp
@@ -21,6 +21,7 @@ namespace ck_tile::core::arch::mma {
 struct Unsupported;

 #if CK_TILE_CONCEPTS && CK_TILE_CONCEPTS_HEADER
+
 #include <concepts>
 /**
 * @concept MmaOpI
--- a/include/ck_tile/core/arch/mma/mfma/mfma_gfx9.hpp
+++ b/include/ck_tile/core/arch/mma/mfma/mfma_gfx9.hpp
@@ -20,34 +20,6 @@ namespace ck_tile::core::arch::mma {
 // For flexibility, it is recommended that for each backend wrapper it supports at least
 // one packed register for each input to be able to process smaller K values by padding.

-/**
- * @struct DefaultMmaCtrlFlags
- * @brief Default MFMA flags, no broadcasting or rotation of inputs
- */
-struct DefaultMfmaCtrlFlags
-{
-    static constexpr uint32_t Cbsz = 0; // CBSZ flag, default 0
-    static constexpr uint32_t Abid = 0; // ABID flag, default 0
-    static constexpr uint32_t Blgp = 0; // BLGP flag, default 0
-};
-
-#if CK_TILE_CONCEPTS && CK_TILE_CONCEPTS_HEADER
-#include <concepts>
-
-/**
- * @concept CtrlFlagsGfx9I
- * @brief  Expresses the interface of required members for each CtrlFlags type on Gfx9
- */
-template <typename CtrlFlags>
-concept CtrlFlagsGfx9I = requires(CtrlFlags ctrlFlags) {
-    // Flag members for Gfx9 MFMA instructions
-    { CtrlFlags::Cbsz } -> std::convertible_to<int>;
-    { CtrlFlags::Abid } -> std::convertible_to<int>;
-    { CtrlFlags::Blgp } -> std::convertible_to<int>;
-};
-
-#endif // CK_TILE_CONCEPTS && CK_TILE_CONCEPTS_HEADER
-
 /**
 * @struct amdgcn_mma
 * @brief Specialization of amdgcn_mma for MFMA on GFX9 targets
--- a/include/ck_tile/core/arch/mma/mfma/mfma_traits.hpp
+++ b/include/ck_tile/core/arch/mma/mfma/mfma_traits.hpp
@@ -41,4 +41,32 @@ struct is_mma_op_mfma<MmaOp, std::enable_if_t<std::is_same_v<typename MmaOp::OpT
 template <typename MmaOp>
 static constexpr bool is_mma_op_mfma_v = is_mma_op_mfma<MmaOp>::value;

+/**
+ * @struct DefaultMfmaCtrlFlags
+ * @brief Default MFMA flags, no broadcasting or rotation of inputs
+ */
+struct DefaultMfmaCtrlFlags
+{
+    static constexpr uint32_t Cbsz = 0; // CBSZ flag, default 0
+    static constexpr uint32_t Abid = 0; // ABID flag, default 0
+    static constexpr uint32_t Blgp = 0; // BLGP flag, default 0
+};
+
+#if CK_TILE_CONCEPTS && CK_TILE_CONCEPTS_HEADER
+#include <concepts>
+
+/**
+ * @concept CtrlFlagsGfx9I
+ * @brief  Expresses the interface of required members for each CtrlFlags type on Gfx9
+ */
+template <typename CtrlFlags>
+concept CtrlFlagsGfx9I = requires(CtrlFlags ctrlFlags) {
+    // Flag members for Gfx9 MFMA instructions
+    { CtrlFlags::Cbsz } -> std::convertible_to<int>;
+    { CtrlFlags::Abid } -> std::convertible_to<int>;
+    { CtrlFlags::Blgp } -> std::convertible_to<int>;
+};
+
+#endif // CK_TILE_CONCEPTS && CK_TILE_CONCEPTS_HEADER
+
 } // namespace ck_tile::core::arch::mma
--- a/include/ck_tile/core/arch/mma/mma_traits.hpp
+++ b/include/ck_tile/core/arch/mma/mma_traits.hpp
@@ -7,6 +7,7 @@
 #include "ck_tile/core/arch/arch.hpp"
 #include "mfma/mfma_traits.hpp"
 #include "wmma/wmma_traits.hpp"
+#include "sparse/sparse_traits.hpp"

 namespace ck_tile::core::arch::mma {

--- a/include/ck_tile/core/arch/mma/sparse/mfma/selector.hpp
+++ b/include/ck_tile/core/arch/mma/sparse/mfma/selector.hpp
@@ -7,6 +7,7 @@
 #include "ck_tile/core/arch/mma/amdgcn_mma.hpp"
 #include "ck_tile/core/arch/mma/mma_selector.hpp"
 #include "ck_tile/core/arch/mma/mma_traits.hpp"
+#include "ck_tile/core/arch/mma/sparse/sparse_traits.hpp"

 namespace ck_tile::core::arch::mma {

--- a/include/ck_tile/core/arch/mma/sparse/mfma/sparse_gfx9.hpp
+++ b/include/ck_tile/core/arch/mma/sparse/mfma/sparse_gfx9.hpp
@@ -7,34 +7,10 @@
 #include "ck_tile/core/arch/arch.hpp"
 #include "ck_tile/core/numeric/vector_type.hpp"
 #include "ck_tile/ops/gemm/warp/warp_gemm_smfmac_impl.hpp"
+#include "ck_tile/core/arch/mma/sparse/sparse_traits.hpp"

 namespace ck_tile::core::arch::mma {

-/**
- * @struct DefaultSparseMfmaCtrlFlags
- * @brief Default MFMA sparse flags, select (VGPR[srcC][7..0]) if srcC is
- * 16-bit or (VGPR[srcC][15..0]) if srcC is 8-bit.
- */
-struct DefaultSparseMfmaCtrlFlags
-{
-    static constexpr SparseCompressionIndex CompressionIndex = SparseCompressionIndex::FIRST;
-};
-
-#if CK_TILE_CONCEPTS && CK_TILE_CONCEPTS_HEADER
-#include <concepts>
-
-/**
- * @concept SparseMfmaCtrlFlags
- * @brief Expresses the interface of required members for each CtrlFlags type
- */
-template <typename CtrlFlags>
-concept SparseMfmaCtrlFlags = requires(CtrlFlags ctrlFlags) {
-    // Flag members for sparse MFMA instructions
-    { CtrlFlags::CompressionIndex } -> std::convertible_to<SparseCompressionIndex>;
-};
-
-#endif // CK_TILE_CONCEPTS && CK_TILE_CONCEPTS_HEADER
-
 /**
 * @struct amdgcn_mma
 * @brief Specialization of amdgcn_mma for Sparse MFMA (SMFMA) on GFX942, GFX950 targets
--- a/include/ck_tile/core/arch/mma/sparse/sparse.hpp
+++ b/include/ck_tile/core/arch/mma/sparse/sparse.hpp
@@ -5,64 +5,11 @@

 namespace ck_tile::core::arch::mma {

-/**
- * @enum SparseCompressionIndex
- * @brief Indicates which set of sparse-indices within a VGPR starting at srcC
- * containing 8-bits (for 16-bit source data) or 16-bits (for 8-bit source data)
- * of index information for a lane. \see DefaultSparseMfmaCtrlFlags
- */
-enum struct SparseCompressionIndex : int
-{
-    FIRST  = 0, // Uses bits  [7:0] or [15..0], for 16 and 8 bit data respectively
-    SECOND = 1, // Uses bits [15:8] or [31:16], for 16 and 8 bit data respectively
-    THIRD  = 2, // Uses bits [23:16]
-    FOURTH = 3, // Uses bits [31:24]
-};
-
-namespace sparse::detail {
-
-/**
- * @struct BuiltinParams
- * @brief Translates the SparseCompressionIndex to the correct CBSZ and ABID pairs for sparse
- * builtins. The actual behavior of the builtin depends on the input data type: 16-bit source data:
- * If CBSZ=0, ABID selects one of four 8-bit sets of sparse-indices within a VGPR starting at srcC
- * containing 8-bits of index information for a lane. If CBSZ!=0 the very first is selected
- * (VGPR[srcC][7..0]).
- *
- * 8-bit source data:
- * If CBSZ=0, ABID selects one of two 16-bit sets of sparse-indices within a VGPR starting at srcC
- * containing 16-bits of index information for a lane. If CBSZ!=0; the very first is selected
- * (VGPR[srcC][15..0]).
- */
-struct BuiltinParams
-{
-    int UseFirstIndex;       // CBSZ
-    int ByteIndexToOverride; // ABID
-};
-
-template <SparseCompressionIndex Idx>
-static constexpr BuiltinParams getBuiltinParams()
-{
-    BuiltinParams params;
-    if constexpr(Idx == SparseCompressionIndex::FIRST)
-    {
-        params.UseFirstIndex       = 1;
-        params.ByteIndexToOverride = 0;
-    }
-    else
-    {
-        params.UseFirstIndex       = 0;
-        params.ByteIndexToOverride = static_cast<int>(Idx);
-    }
-    return params;
-}
-
-} // namespace sparse::detail
-
 } // namespace ck_tile::core::arch::mma

 // Include sparse MFMA traits and architecture-specific implementations
 #include "ck_tile/core/arch/mma/sparse/mfma/sparse_gfx9.hpp"
 #include "ck_tile/core/arch/mma/sparse/wmma/sparse_gfx12.hpp"
 #include "ck_tile/core/arch/mma/sparse/sparse_transforms.hpp"
+#include "ck_tile/core/arch/mma/sparse/sparse_traits.hpp"
 #include "ck_tile/core/arch/mma/sparse/sparse_selector.hpp"
--- a/include/ck_tile/core/arch/mma/sparse/sparse_traits.hpp
+++ b/include/ck_tile/core/arch/mma/sparse/sparse_traits.hpp
@@ -0,0 +1,89 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+namespace ck_tile::core::arch::mma {
+
+/**
+ * @enum SparseCompressionIndex
+ * @brief Indicates which set of sparse-indices within a VGPR starting at srcC
+ * containing 8-bits (for 16-bit source data) or 16-bits (for 8-bit source data)
+ * of index information for a lane. \see DefaultSparseMfmaCtrlFlags
+ */
+enum struct SparseCompressionIndex : int
+{
+    FIRST  = 0, // Uses bits  [7:0] or [15..0], for 16 and 8 bit data respectively
+    SECOND = 1, // Uses bits [15:8] or [31:16], for 16 and 8 bit data respectively
+    THIRD  = 2, // Uses bits [23:16]
+    FOURTH = 3, // Uses bits [31:24]
+};
+
+namespace sparse::detail {
+
+/**
+ * @struct BuiltinParams
+ * @brief Translates the SparseCompressionIndex to the correct CBSZ and ABID pairs for sparse
+ * builtins. The actual behavior of the builtin depends on the input data type: 16-bit source data:
+ * If CBSZ=0, ABID selects one of four 8-bit sets of sparse-indices within a VGPR starting at srcC
+ * containing 8-bits of index information for a lane. If CBSZ!=0 the very first is selected
+ * (VGPR[srcC][7..0]).
+ *
+ * 8-bit source data:
+ * If CBSZ=0, ABID selects one of two 16-bit sets of sparse-indices within a VGPR starting at srcC
+ * containing 16-bits of index information for a lane. If CBSZ!=0; the very first is selected
+ * (VGPR[srcC][15..0]).
+ */
+struct BuiltinParams
+{
+    int UseFirstIndex;       // CBSZ
+    int ByteIndexToOverride; // ABID
+};
+
+template <SparseCompressionIndex Idx>
+static constexpr BuiltinParams getBuiltinParams()
+{
+    BuiltinParams params;
+    if constexpr(Idx == SparseCompressionIndex::FIRST)
+    {
+        params.UseFirstIndex       = 1;
+        params.ByteIndexToOverride = 0;
+    }
+    else
+    {
+        params.UseFirstIndex       = 0;
+        params.ByteIndexToOverride = static_cast<int>(Idx);
+    }
+    return params;
+}
+
+} // namespace sparse::detail
+
+/**
+ * @struct DefaultSparseMfmaCtrlFlags
+ * @brief Default MFMA sparse flags, select (VGPR[srcC][7..0]) if srcC is
+ * 16-bit or (VGPR[srcC][15..0]) if srcC is 8-bit.
+ */
+struct DefaultSparseMfmaCtrlFlags
+{
+    static constexpr SparseCompressionIndex CompressionIndex = SparseCompressionIndex::FIRST;
+};
+
+#if CK_TILE_CONCEPTS && CK_TILE_CONCEPTS_HEADER
+#include <concepts>
+/**
+ * @concept SparseMfmaCtrlFlags
+ * @brief Expresses the interface of required members for each CtrlFlags type
+ */
+template <typename CtrlFlags>
+concept SparseMfmaCtrlFlags = requires(CtrlFlags ctrlFlags) {
+    // Flag members for sparse MFMA instructions
+    { CtrlFlags::CompressionIndex } -> std::convertible_to<SparseCompressionIndex>;
+};
+#endif // CK_TILE_CONCEPTS && CK_TILE_CONCEPTS_HEADER
+
+struct DefaultSparseWmmaCtrlFlags
+{
+};
+
+} // namespace ck_tile::core::arch::mma
--- a/include/ck_tile/core/arch/mma/sparse/wmma/sparse_gfx12.hpp
+++ b/include/ck_tile/core/arch/mma/sparse/wmma/sparse_gfx12.hpp
@@ -7,13 +7,11 @@
 #include "ck_tile/core/arch/arch.hpp"
 #include "ck_tile/core/arch/mma/amdgcn_mma.hpp"
 #include "ck_tile/core/numeric/vector_type.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm_smfmac_impl.hpp"
+#include "ck_tile/core/arch/mma/sparse/sparse_traits.hpp"

 namespace ck_tile::core::arch::mma {

-struct DefaultSparseWmmaCtrlFlags
-{
-};
-
 // TODO: c++20 template <CtrlFlagsSparseWmmaI CtrlFlags, amdgcn_target CompilerTarget>
 // TODO: c++20 requires
 template <typename CtrlFlags, typename CompilerTarget>
@@ -61,7 +59,7 @@ struct amdgcn_mma<fp16_t,
        // TODO: Compressing A on-the-fly should be OK for now, but we need to validate
        // and evaluate changing this to a transform at a higher level.
        // aVec not being const can cause problems when running multiple intrinsics.
-        const int32_t idx = ck_tile::compress_a_impl<fp16_t, CompressedSize>(aVec);
+        const int32_t idx = ::ck_tile::compress_a_impl<fp16_t, CompressedSize>(aVec);

        const AVecCompressed a_vec_pruned = {
            aVec[0], aVec[1], aVec[2], aVec[3], aVec[4], aVec[5], aVec[6], aVec[7]};
--- a/include/ck_tile/ops/gemm/warp/warp_gemm_smfmac_impl.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_smfmac_impl.hpp
@@ -3,7 +3,6 @@

 #pragma once

-#include "ck_tile/core.hpp"
 #include "ck_tile/core/numeric/integer.hpp"
 #include "ck_tile/core/tensor/static_distributed_tensor.hpp"
 namespace ck_tile {
--- a/include/ck_tile/ops/gemm_mx.hpp
+++ b/include/ck_tile/ops/gemm_mx.hpp
@@ -1,9 +1,13 @@
 // Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-
 #pragma once

 #include "ck_tile/ops/gemm_mx/kernel/gemm_mx_kernel.hpp"
 #include "ck_tile/ops/gemm_mx/kernel/scale_pointer.hpp"
 #include "ck_tile/ops/gemm_mx/pipeline/gemm_pipeline_ag_bg_cr_comp_async.hpp"
 #include "ck_tile/ops/gemm_mx/pipeline/gemm_pipeline_ag_bg_cr_comp_async_default_policy.hpp"
+#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
+#include "ck_tile/ops/common/load_and_convert_tile.hpp"
+#include "ck_tile/ops/common/streamk_common.hpp"
+#include "ck_tile/ops/common/tensor_layout.hpp"
+#include "ck_tile/ops/common/utils.hpp"
--- a/include/ck_tile/ops/grouped_convolution.hpp
+++ b/include/ck_tile/ops/grouped_convolution.hpp
@@ -2,10 +2,10 @@
 // SPDX-License-Identifier: MIT
 #pragma once

-#include "ck_tile/ops/grouped_convolution/pipeline/grouped_conv_universal_pipeline_ag_bg_cr_policy.hpp"
 #include "ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp"
 #include "ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp"
 #include "ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp"
+#include "ck_tile/ops/grouped_convolution/pipeline/grouped_conv_universal_pipeline_ag_bg_cr_policy.hpp"
 #include "ck_tile/ops/grouped_convolution/utils/convolution_specialization.hpp"
 #include "ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp"
 #include "ck_tile/ops/grouped_convolution/utils/split_k_utils.hpp"