[rocm-libraries] ROCm/rocm-libraries#5241 (commit 43daeac)

Changed the include order of the new WMMA/MFMA unification
 framework (#5241)

Those changes are to fix the include order and make header files
independent of one another. Also the `remod.py` sript has run and
changed the `grouped_convolution.hpp` and `core.hpp` files.

## Motivation

Some headers appear to depend on include order.
For example, when moving `#include "wmma/wmma.hpp"` in
[amdgcn_mma.hpp](https://github.com/ROCm/rocm-libraries/blob/develop/projects/composablekernel/include/ck_tile/core/arch/mma/amdgcn_mma.hpp)
later in the include list, it is causing compilation errors. Also the
pre-commit script `remod.py` is shuffling includes to be in alphabetical
order and is causing compilation issues.

Expected behaviour:
Headers should be independent of one another: no header should require
another to be included first. Each header should compile correctly on
its own.

## Test Plan

The CI (that runs `remod.py`) should compile.

## Test Result

Existing CI should compile and be green.

## Submission Checklist

- [x] Look over the contributing guidelines at
https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.
This commit is contained in:
chris-tsiaousis-hpc
2026-03-12 08:27:49 +00:00
committed by assistant-librarian[bot]
parent 1a4aa7fd89
commit a1679e38ee
13 changed files with 140 additions and 115 deletions

View File

@@ -20,9 +20,18 @@
#include "ck_tile/core/arch/mma/mfma/mfma_traits.hpp"
#include "ck_tile/core/arch/mma/mfma/mfma_transforms.hpp"
#include "ck_tile/core/arch/mma/mma.hpp"
#include "ck_tile/core/arch/mma/mma_op_family.hpp"
#include "ck_tile/core/arch/mma/mma_selector.hpp"
#include "ck_tile/core/arch/mma/mma_traits.hpp"
#include "ck_tile/core/arch/mma/mma_transforms.hpp"
#include "ck_tile/core/arch/mma/sparse/mfma/selector.hpp"
#include "ck_tile/core/arch/mma/sparse/mfma/sparse_gfx9.hpp"
#include "ck_tile/core/arch/mma/sparse/sparse.hpp"
#include "ck_tile/core/arch/mma/sparse/sparse_selector.hpp"
#include "ck_tile/core/arch/mma/sparse/sparse_traits.hpp"
#include "ck_tile/core/arch/mma/sparse/sparse_transforms.hpp"
#include "ck_tile/core/arch/mma/sparse/wmma/selector.hpp"
#include "ck_tile/core/arch/mma/sparse/wmma/sparse_gfx12.hpp"
#include "ck_tile/core/arch/mma/utility/tile_distribution_encoding_register_mapper.hpp"
#include "ck_tile/core/arch/mma/wmma/wmma.hpp"
#include "ck_tile/core/arch/mma/wmma/wmma_gfx11.hpp"

View File

@@ -21,6 +21,7 @@ namespace ck_tile::core::arch::mma {
struct Unsupported;
#if CK_TILE_CONCEPTS && CK_TILE_CONCEPTS_HEADER
#include <concepts>
/**
* @concept MmaOpI

View File

@@ -20,34 +20,6 @@ namespace ck_tile::core::arch::mma {
// For flexibility, it is recommended that for each backend wrapper it supports at least
// one packed register for each input to be able to process smaller K values by padding.
/**
* @struct DefaultMmaCtrlFlags
* @brief Default MFMA flags, no broadcasting or rotation of inputs
*/
struct DefaultMfmaCtrlFlags
{
static constexpr uint32_t Cbsz = 0; // CBSZ flag, default 0
static constexpr uint32_t Abid = 0; // ABID flag, default 0
static constexpr uint32_t Blgp = 0; // BLGP flag, default 0
};
#if CK_TILE_CONCEPTS && CK_TILE_CONCEPTS_HEADER
#include <concepts>
/**
* @concept CtrlFlagsGfx9I
* @brief Expresses the interface of required members for each CtrlFlags type on Gfx9
*/
template <typename CtrlFlags>
concept CtrlFlagsGfx9I = requires(CtrlFlags ctrlFlags) {
// Flag members for Gfx9 MFMA instructions
{ CtrlFlags::Cbsz } -> std::convertible_to<int>;
{ CtrlFlags::Abid } -> std::convertible_to<int>;
{ CtrlFlags::Blgp } -> std::convertible_to<int>;
};
#endif // CK_TILE_CONCEPTS && CK_TILE_CONCEPTS_HEADER
/**
* @struct amdgcn_mma
* @brief Specialization of amdgcn_mma for MFMA on GFX9 targets

View File

@@ -41,4 +41,32 @@ struct is_mma_op_mfma<MmaOp, std::enable_if_t<std::is_same_v<typename MmaOp::OpT
template <typename MmaOp>
static constexpr bool is_mma_op_mfma_v = is_mma_op_mfma<MmaOp>::value;
/**
* @struct DefaultMfmaCtrlFlags
* @brief Default MFMA flags, no broadcasting or rotation of inputs
*/
struct DefaultMfmaCtrlFlags
{
static constexpr uint32_t Cbsz = 0; // CBSZ flag, default 0
static constexpr uint32_t Abid = 0; // ABID flag, default 0
static constexpr uint32_t Blgp = 0; // BLGP flag, default 0
};
#if CK_TILE_CONCEPTS && CK_TILE_CONCEPTS_HEADER
#include <concepts>
/**
* @concept CtrlFlagsGfx9I
* @brief Expresses the interface of required members for each CtrlFlags type on Gfx9
*/
template <typename CtrlFlags>
concept CtrlFlagsGfx9I = requires(CtrlFlags ctrlFlags) {
// Flag members for Gfx9 MFMA instructions
{ CtrlFlags::Cbsz } -> std::convertible_to<int>;
{ CtrlFlags::Abid } -> std::convertible_to<int>;
{ CtrlFlags::Blgp } -> std::convertible_to<int>;
};
#endif // CK_TILE_CONCEPTS && CK_TILE_CONCEPTS_HEADER
} // namespace ck_tile::core::arch::mma

View File

@@ -7,6 +7,7 @@
#include "ck_tile/core/arch/arch.hpp"
#include "mfma/mfma_traits.hpp"
#include "wmma/wmma_traits.hpp"
#include "sparse/sparse_traits.hpp"
namespace ck_tile::core::arch::mma {

View File

@@ -7,6 +7,7 @@
#include "ck_tile/core/arch/mma/amdgcn_mma.hpp"
#include "ck_tile/core/arch/mma/mma_selector.hpp"
#include "ck_tile/core/arch/mma/mma_traits.hpp"
#include "ck_tile/core/arch/mma/sparse/sparse_traits.hpp"
namespace ck_tile::core::arch::mma {

View File

@@ -7,34 +7,10 @@
#include "ck_tile/core/arch/arch.hpp"
#include "ck_tile/core/numeric/vector_type.hpp"
#include "ck_tile/ops/gemm/warp/warp_gemm_smfmac_impl.hpp"
#include "ck_tile/core/arch/mma/sparse/sparse_traits.hpp"
namespace ck_tile::core::arch::mma {
/**
* @struct DefaultSparseMfmaCtrlFlags
* @brief Default MFMA sparse flags, select (VGPR[srcC][7..0]) if srcC is
* 16-bit or (VGPR[srcC][15..0]) if srcC is 8-bit.
*/
struct DefaultSparseMfmaCtrlFlags
{
static constexpr SparseCompressionIndex CompressionIndex = SparseCompressionIndex::FIRST;
};
#if CK_TILE_CONCEPTS && CK_TILE_CONCEPTS_HEADER
#include <concepts>
/**
* @concept SparseMfmaCtrlFlags
* @brief Expresses the interface of required members for each CtrlFlags type
*/
template <typename CtrlFlags>
concept SparseMfmaCtrlFlags = requires(CtrlFlags ctrlFlags) {
// Flag members for sparse MFMA instructions
{ CtrlFlags::CompressionIndex } -> std::convertible_to<SparseCompressionIndex>;
};
#endif // CK_TILE_CONCEPTS && CK_TILE_CONCEPTS_HEADER
/**
* @struct amdgcn_mma
* @brief Specialization of amdgcn_mma for Sparse MFMA (SMFMA) on GFX942, GFX950 targets

View File

@@ -5,64 +5,11 @@
namespace ck_tile::core::arch::mma {
/**
* @enum SparseCompressionIndex
* @brief Indicates which set of sparse-indices within a VGPR starting at srcC
* containing 8-bits (for 16-bit source data) or 16-bits (for 8-bit source data)
* of index information for a lane. \see DefaultSparseMfmaCtrlFlags
*/
enum struct SparseCompressionIndex : int
{
FIRST = 0, // Uses bits [7:0] or [15..0], for 16 and 8 bit data respectively
SECOND = 1, // Uses bits [15:8] or [31:16], for 16 and 8 bit data respectively
THIRD = 2, // Uses bits [23:16]
FOURTH = 3, // Uses bits [31:24]
};
namespace sparse::detail {
/**
* @struct BuiltinParams
* @brief Translates the SparseCompressionIndex to the correct CBSZ and ABID pairs for sparse
* builtins. The actual behavior of the builtin depends on the input data type: 16-bit source data:
* If CBSZ=0, ABID selects one of four 8-bit sets of sparse-indices within a VGPR starting at srcC
* containing 8-bits of index information for a lane. If CBSZ!=0 the very first is selected
* (VGPR[srcC][7..0]).
*
* 8-bit source data:
* If CBSZ=0, ABID selects one of two 16-bit sets of sparse-indices within a VGPR starting at srcC
* containing 16-bits of index information for a lane. If CBSZ!=0; the very first is selected
* (VGPR[srcC][15..0]).
*/
struct BuiltinParams
{
int UseFirstIndex; // CBSZ
int ByteIndexToOverride; // ABID
};
template <SparseCompressionIndex Idx>
static constexpr BuiltinParams getBuiltinParams()
{
BuiltinParams params;
if constexpr(Idx == SparseCompressionIndex::FIRST)
{
params.UseFirstIndex = 1;
params.ByteIndexToOverride = 0;
}
else
{
params.UseFirstIndex = 0;
params.ByteIndexToOverride = static_cast<int>(Idx);
}
return params;
}
} // namespace sparse::detail
} // namespace ck_tile::core::arch::mma
// Include sparse MFMA traits and architecture-specific implementations
#include "ck_tile/core/arch/mma/sparse/mfma/sparse_gfx9.hpp"
#include "ck_tile/core/arch/mma/sparse/wmma/sparse_gfx12.hpp"
#include "ck_tile/core/arch/mma/sparse/sparse_transforms.hpp"
#include "ck_tile/core/arch/mma/sparse/sparse_traits.hpp"
#include "ck_tile/core/arch/mma/sparse/sparse_selector.hpp"

View File

@@ -0,0 +1,89 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
namespace ck_tile::core::arch::mma {
/**
* @enum SparseCompressionIndex
* @brief Indicates which set of sparse-indices within a VGPR starting at srcC
* containing 8-bits (for 16-bit source data) or 16-bits (for 8-bit source data)
* of index information for a lane. \see DefaultSparseMfmaCtrlFlags
*/
enum struct SparseCompressionIndex : int
{
FIRST = 0, // Uses bits [7:0] or [15..0], for 16 and 8 bit data respectively
SECOND = 1, // Uses bits [15:8] or [31:16], for 16 and 8 bit data respectively
THIRD = 2, // Uses bits [23:16]
FOURTH = 3, // Uses bits [31:24]
};
namespace sparse::detail {
/**
* @struct BuiltinParams
* @brief Translates the SparseCompressionIndex to the correct CBSZ and ABID pairs for sparse
* builtins. The actual behavior of the builtin depends on the input data type: 16-bit source data:
* If CBSZ=0, ABID selects one of four 8-bit sets of sparse-indices within a VGPR starting at srcC
* containing 8-bits of index information for a lane. If CBSZ!=0 the very first is selected
* (VGPR[srcC][7..0]).
*
* 8-bit source data:
* If CBSZ=0, ABID selects one of two 16-bit sets of sparse-indices within a VGPR starting at srcC
* containing 16-bits of index information for a lane. If CBSZ!=0; the very first is selected
* (VGPR[srcC][15..0]).
*/
struct BuiltinParams
{
int UseFirstIndex; // CBSZ
int ByteIndexToOverride; // ABID
};
template <SparseCompressionIndex Idx>
static constexpr BuiltinParams getBuiltinParams()
{
BuiltinParams params;
if constexpr(Idx == SparseCompressionIndex::FIRST)
{
params.UseFirstIndex = 1;
params.ByteIndexToOverride = 0;
}
else
{
params.UseFirstIndex = 0;
params.ByteIndexToOverride = static_cast<int>(Idx);
}
return params;
}
} // namespace sparse::detail
/**
* @struct DefaultSparseMfmaCtrlFlags
* @brief Default MFMA sparse flags, select (VGPR[srcC][7..0]) if srcC is
* 16-bit or (VGPR[srcC][15..0]) if srcC is 8-bit.
*/
struct DefaultSparseMfmaCtrlFlags
{
static constexpr SparseCompressionIndex CompressionIndex = SparseCompressionIndex::FIRST;
};
#if CK_TILE_CONCEPTS && CK_TILE_CONCEPTS_HEADER
#include <concepts>
/**
* @concept SparseMfmaCtrlFlags
* @brief Expresses the interface of required members for each CtrlFlags type
*/
template <typename CtrlFlags>
concept SparseMfmaCtrlFlags = requires(CtrlFlags ctrlFlags) {
// Flag members for sparse MFMA instructions
{ CtrlFlags::CompressionIndex } -> std::convertible_to<SparseCompressionIndex>;
};
#endif // CK_TILE_CONCEPTS && CK_TILE_CONCEPTS_HEADER
struct DefaultSparseWmmaCtrlFlags
{
};
} // namespace ck_tile::core::arch::mma

View File

@@ -7,13 +7,11 @@
#include "ck_tile/core/arch/arch.hpp"
#include "ck_tile/core/arch/mma/amdgcn_mma.hpp"
#include "ck_tile/core/numeric/vector_type.hpp"
#include "ck_tile/ops/gemm/warp/warp_gemm_smfmac_impl.hpp"
#include "ck_tile/core/arch/mma/sparse/sparse_traits.hpp"
namespace ck_tile::core::arch::mma {
struct DefaultSparseWmmaCtrlFlags
{
};
// TODO: c++20 template <CtrlFlagsSparseWmmaI CtrlFlags, amdgcn_target CompilerTarget>
// TODO: c++20 requires
template <typename CtrlFlags, typename CompilerTarget>
@@ -61,7 +59,7 @@ struct amdgcn_mma<fp16_t,
// TODO: Compressing A on-the-fly should be OK for now, but we need to validate
// and evaluate changing this to a transform at a higher level.
// aVec not being const can cause problems when running multiple intrinsics.
const int32_t idx = ck_tile::compress_a_impl<fp16_t, CompressedSize>(aVec);
const int32_t idx = ::ck_tile::compress_a_impl<fp16_t, CompressedSize>(aVec);
const AVecCompressed a_vec_pruned = {
aVec[0], aVec[1], aVec[2], aVec[3], aVec[4], aVec[5], aVec[6], aVec[7]};

View File

@@ -3,7 +3,6 @@
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/core/numeric/integer.hpp"
#include "ck_tile/core/tensor/static_distributed_tensor.hpp"
namespace ck_tile {

View File

@@ -1,9 +1,13 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/ops/gemm_mx/kernel/gemm_mx_kernel.hpp"
#include "ck_tile/ops/gemm_mx/kernel/scale_pointer.hpp"
#include "ck_tile/ops/gemm_mx/pipeline/gemm_pipeline_ag_bg_cr_comp_async.hpp"
#include "ck_tile/ops/gemm_mx/pipeline/gemm_pipeline_ag_bg_cr_comp_async_default_policy.hpp"
#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
#include "ck_tile/ops/common/load_and_convert_tile.hpp"
#include "ck_tile/ops/common/streamk_common.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
#include "ck_tile/ops/common/utils.hpp"

View File

@@ -2,10 +2,10 @@
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/ops/grouped_convolution/pipeline/grouped_conv_universal_pipeline_ag_bg_cr_policy.hpp"
#include "ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp"
#include "ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp"
#include "ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp"
#include "ck_tile/ops/grouped_convolution/pipeline/grouped_conv_universal_pipeline_ag_bg_cr_policy.hpp"
#include "ck_tile/ops/grouped_convolution/utils/convolution_specialization.hpp"
#include "ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp"
#include "ck_tile/ops/grouped_convolution/utils/split_k_utils.hpp"