diff --git a/example/ck_tile/51_tile_distr_enc_reg_map/CMakeLists.txt b/example/ck_tile/51_tile_distr_enc_reg_map/CMakeLists.txt index 59352336ce..88cf9e4eb5 100644 --- a/example/ck_tile/51_tile_distr_enc_reg_map/CMakeLists.txt +++ b/example/ck_tile/51_tile_distr_enc_reg_map/CMakeLists.txt @@ -2,3 +2,4 @@ # SPDX-License-Identifier: MIT add_executable(tile_example_tile_distr_enc_reg_map example_tile_distr_enc_reg_map.cpp) +add_executable(tile_example_tile_distr_enc_calc example_tile_distr_enc_calc.cpp) diff --git a/example/ck_tile/51_tile_distr_enc_reg_map/example_tile_distr_enc_calc.cpp b/example/ck_tile/51_tile_distr_enc_reg_map/example_tile_distr_enc_calc.cpp new file mode 100644 index 0000000000..6de7af2cbd --- /dev/null +++ b/example/ck_tile/51_tile_distr_enc_reg_map/example_tile_distr_enc_calc.cpp @@ -0,0 +1,93 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include +#include +#include +#include "ck_tile/core/arch/arch.hpp" +#include "ck_tile/core/arch/mma/amdgcn_mma.hpp" +#include "ck_tile/core/arch/mma/utility/tile_distribution_encoding_register_mapper.hpp" +#include "ck_tile/core/arch/mma/utility/tile_distribution_encoding_calculator.hpp" +#include "ck_tile/core/container/tuple.hpp" + +using namespace ck_tile; +using namespace ck_tile::core::arch; +using namespace mma; +using F16 = fp16_t; +using F32 = fp32_t; +using Target908 = decltype(make_amdgcn_gfx9_target()); +using Target950 = decltype(make_amdgcn_gfx9_target()); +using Target11 = decltype(make_amdgcn_gfx11_target()); +using Target12 = decltype(make_amdgcn_gfx12_target()); + +template +int check_tile_distr_enc() +{ + using AEnc = typename TileDistrEncCalc::AWarpDstrEncoding; + using BEnc = typename TileDistrEncCalc::BWarpDstrEncoding; + using CEnc = typename TileDistrEncCalc::CWarpDstrEncoding; + + TileDistrEncRegMap::print(); + TileDistrEncRegMap::print(); + TileDistrEncRegMap::print(); + + // The only thing we check here is that CTranspose works as expected. + using AEncTransp = typename TileDistrEncCalc::AWarpDstrEncoding; + using BEncTransp = typename TileDistrEncCalc::BWarpDstrEncoding; + using CEncTransp = typename TileDistrEncCalc::CWarpDstrEncoding; + + // When using TransposeC, the A and B matrix layouts should be swapped. + static_assert(std::is_same()); + static_assert(std::is_same()); + + // Make sure the C matrix layout is transposed in the CTranspose case. + int err = 0; + for(index_t lane = 0; lane < TileDistrEncRegMap::num_lanes; lane++) + { + for(index_t vec = 0; vec < TileDistrEncRegMap::num_vector_items; vec++) + { + auto coords = TileDistrEncRegMap::calc_matrix_indices_from_lane_vector(lane, vec); + auto coords_transp = + TileDistrEncRegMap::calc_matrix_indices_from_lane_vector(lane, vec); + + if(coords[0] != coords_transp[1] || coords[1] != coords_transp[0]) + { + err = 1; + printf("\033[31mLane %2d vec %2d maps to C matrix coords %2d %2d and transposed C " + "matrix coords %2d %2d, inconsistent!\033[0m\n", + lane, + vec, + coords[0], + coords[1], + coords_transp[0], + coords_transp[1]); + } + } + } + + return err; +} + +// List of intrinsics to test. +// clang-format off +using Intrinsics = ck_tile::tuple< + amdgcn_mma, // mfma_f32_16x16x16f16 + amdgcn_mma, // mfma_f32_32x32x4f16 + amdgcn_mma, // mfma_f32_32x32x4f16 + amdgcn_mma, // mfma_f32_4x4x4f16 + amdgcn_mma, // mfma_f32_4x4x4f16 + amdgcn_mma, // mfma_f32_16x16x32_f16 + amdgcn_mma, Target11, MmaOpFamily::DENSE>, // wmma_f32_16x16x16_f16_w32 + amdgcn_mma, Target12, MmaOpFamily::DENSE> // wmma_f32_16x16x16_f16_w32_gfx12 +>; +// clang-format on + +int main() +{ + int err = 0; + static_for<0, Intrinsics::size(), 1>{}([&](auto i) { + using MmaOp = std::tuple_element_t; + err |= check_tile_distr_enc(); + }); + return err; +} diff --git a/include/ck_tile/core.hpp b/include/ck_tile/core.hpp index e558502563..45c0e302e5 100644 --- a/include/ck_tile/core.hpp +++ b/include/ck_tile/core.hpp @@ -32,6 +32,7 @@ #include "ck_tile/core/arch/mma/sparse/sparse_transforms.hpp" #include "ck_tile/core/arch/mma/sparse/wmma/selector.hpp" #include "ck_tile/core/arch/mma/sparse/wmma/sparse_gfx12.hpp" +#include "ck_tile/core/arch/mma/utility/tile_distribution_encoding_calculator.hpp" #include "ck_tile/core/arch/mma/utility/tile_distribution_encoding_register_mapper.hpp" #include "ck_tile/core/arch/mma/wmma/wmma.hpp" #include "ck_tile/core/arch/mma/wmma/wmma_gfx11.hpp" diff --git a/include/ck_tile/core/arch/mma/amdgcn_mma.hpp b/include/ck_tile/core/arch/mma/amdgcn_mma.hpp index 63148faf99..bbf1217919 100644 --- a/include/ck_tile/core/arch/mma/amdgcn_mma.hpp +++ b/include/ck_tile/core/arch/mma/amdgcn_mma.hpp @@ -4,6 +4,8 @@ #pragma once #include "ck_tile/core/arch/arch.hpp" +#include "ck_tile/core/arch/mma/wmma/wmma_traits.hpp" +#include "ck_tile/core/arch/mma/mfma/mfma_traits.hpp" #include "ck_tile/core/arch/mma/mma_op_family.hpp" #include "ck_tile/core/config.hpp" #include "ck_tile/core/numeric/vector_type.hpp" @@ -87,7 +89,7 @@ namespace ck_tile::core::arch::mma { * * (logical correctness). Applies to scale MFMA fp8, which due to the index matrix layout does not * allow arbitrary K perms to simplify layouts. This means the layout can only properly be described - * with a Num Access value of at least 2. + * with a Num Access value which is a multiple of 2. * * (load / store manipulation). It seems like the load and store tile functions end up looking for * the size of the smallest unmerged K dimension (K0) to determine how many elements should be @@ -102,13 +104,16 @@ namespace ck_tile::core::arch::mma { * * -- CMPerLane -- * The number of M dim elements in each lane. In terms of unmerge sizes, it's equal to M0 * M2, i.e - * the product of the sizes of the outermost and innermost dimensions after a double M unmerge. + * the product of the sizes of the outermost and innermost dimensions after a double M unmerge. This + * does not count a potential increased M dimension size from block hiding. In this case, we have M + * = kCMBlock * M2 * M1 * M0 instead. * * -- CNumAccess -- * Same as A / B NumAccess but for the M dim (so M2), but the mid-level code doesn't care about this * and will not try to request a specific value. Absolutely needed for logical correctness of * register mappings since we can not perform arbitrary M permutations without messing up the A - * layout. + * layout. This does not count a potential increased M dimension size from block hiding. In this + * case, we have M = kCMBlock * M2 * M1 * M0 instead. */ /** @@ -144,7 +149,7 @@ struct amdgcn_mma_base using CDataType = CDataType_; // Fragment (MmaTile) sizes, check description above. - static constexpr index_t kM = FragM; // M = M2 * M1 * M0 + static constexpr index_t kM = FragM; // M = M2 * M1 * M0 (* kCMBlocks when block-hiding) static constexpr index_t kN = FragN; static constexpr index_t kK = FragK; // K = K2 * K1 * K0 @@ -157,15 +162,37 @@ struct amdgcn_mma_base static constexpr index_t kCMPerLane = kCMPerLane_; // M2 * M0 static constexpr index_t kCMNumAccess = kCMNumAccess_; // M2 + // K-dimension compression ratio for A matrix, always 2 for sparse intrinsics. + static constexpr index_t kCompressionRatio = (OpFamily == MmaOpFamily::SPARSE) ? 2 : 1; + + // Layout checks + static_assert(kK % kABKPerLane == 0); + static_assert(kABKPerLane % kAKNumAccess == 0); + static_assert(kABKPerLane % kBKNumAccess == 0); + static_assert(kCMPerLane % kCMNumAccess == 0); + // Register types (derived) static constexpr index_t WaveSize = WaveSize_; - static_assert((kM * kK * kARepeat) % WaveSize == 0); + static_assert((kM * kK * kARepeat) % (WaveSize * kCompressionRatio) == 0); static_assert((kN * kK * kBRepeat) % WaveSize == 0); static_assert((kM * kN) % WaveSize == 0); - using AVecType = ext_vector_t; + using AVecType = ext_vector_t; using BVecType = ext_vector_t; using CVecType = ext_vector_t; + + // Block-hiding / repeat related traits (derived) + static_assert(kARepeat == kBRepeat || !std::is_same_v); + static_assert(kARepeat == 1 || kBRepeat == 1 || !std::is_same_v); + static constexpr index_t kCMBlocks = std::is_same_v ? kBRepeat : 1; + static constexpr index_t kCNBlocks = std::is_same_v ? kARepeat : 1; + static_assert(kM % (kCMBlocks * kCMPerLane) == 0); + static_assert(kN % kCNBlocks == 0); + + // For the C matrix, the block dimension B is either put in the Vector dimension or the Lane + // dimension. We can tell which by checking if we get the right Vector size. + static constexpr bool CBlockDimInVecDim = + kCMBlocks * kCNBlocks * kCMPerLane == vector_traits::vector_size; }; /** @@ -181,6 +208,7 @@ struct Unsupported; * @concept MmaOpI * @brief Expresses the meta-data interface required for each MmaOp policy. */ +// TODO: Make sure this actually matches amdgcn_mma. template concept MmaOpI = requires(MmaOp op) { // Requires an op context @@ -194,7 +222,6 @@ concept MmaOpI = requires(MmaOp op) { typename MmaOp::AVecType; typename MmaOp::BVecType; typename MmaOp::CVecType; - // Captures CK-specific layout properties { MmaOp::kABKPerLane } -> std::convertible_to; { MmaOp::kAKNumAccess } -> std::convertible_to; diff --git a/include/ck_tile/core/arch/mma/mfma/mfma_gfx9.hpp b/include/ck_tile/core/arch/mma/mfma/mfma_gfx9.hpp index 4955e2bf7f..f48edc8688 100644 --- a/include/ck_tile/core/arch/mma/mfma/mfma_gfx9.hpp +++ b/include/ck_tile/core/arch/mma/mfma/mfma_gfx9.hpp @@ -51,6 +51,82 @@ struct amdgcn_mma +// clang-format off +// | A B C DataTypes | MNK + WaveSize |AParams |BPar |CPar | +struct amdgcn_mma> +: amdgcn_mma_base +// clang-format on +{ + CK_TILE_DEVICE static auto + exec(AVecType const& aVec, BVecType const& bVec, CVecType const& cVec) -> CVecType + { + return {__builtin_amdgcn_mfma_f32_32x32x4f16(aVec, + bVec, + cVec, + static_cast(CtrlFlags::Cbsz), + static_cast(CtrlFlags::Abid), + static_cast(CtrlFlags::Blgp))}; + } +}; + +template +// clang-format off +// | A B C DataTypes | MNK + WaveSize |AParams |BPar |CPar | +struct amdgcn_mma> +: amdgcn_mma_base +// clang-format on +{ + CK_TILE_DEVICE static auto + exec(AVecType const& aVec, BVecType const& bVec, CVecType const& cVec) -> CVecType + { + return {__builtin_amdgcn_mfma_f32_32x32x4f16(aVec, + bVec, + cVec, + static_cast(CtrlFlags::Cbsz), + static_cast(CtrlFlags::Abid), + static_cast(CtrlFlags::Blgp))}; + } +}; + +template +// clang-format off +// | A B C DataTypes | MNK + WaveSize |AParams |BPar |CPar | +struct amdgcn_mma> +: amdgcn_mma_base +// clang-format on +{ + CK_TILE_DEVICE static auto + exec(AVecType const& aVec, BVecType const& bVec, CVecType const& cVec) -> CVecType + { + return {__builtin_amdgcn_mfma_f32_4x4x4f16(aVec, + bVec, + cVec, + static_cast(CtrlFlags::Cbsz), + static_cast(CtrlFlags::Abid), + static_cast(CtrlFlags::Blgp))}; + } +}; + +template +// clang-format off +// | A B C DataTypes | MNK + WaveSize |AParams |BPar |CPar | +struct amdgcn_mma> +: amdgcn_mma_base +// clang-format on +{ + CK_TILE_DEVICE static auto + exec(AVecType const& aVec, BVecType const& bVec, CVecType const& cVec) -> CVecType + { + return {__builtin_amdgcn_mfma_f32_4x4x4f16(aVec, + bVec, + cVec, + static_cast(CtrlFlags::Cbsz), + static_cast(CtrlFlags::Abid), + static_cast(CtrlFlags::Blgp))}; + } +}; + /** * @struct amdgcn_mma * @brief Specialization of amdgcn_mma for MFMA on GFX950 targets diff --git a/include/ck_tile/core/arch/mma/sparse/mfma/sparse_gfx9.hpp b/include/ck_tile/core/arch/mma/sparse/mfma/sparse_gfx9.hpp index 0941f5cbec..781d496e5a 100644 --- a/include/ck_tile/core/arch/mma/sparse/mfma/sparse_gfx9.hpp +++ b/include/ck_tile/core/arch/mma/sparse/mfma/sparse_gfx9.hpp @@ -6,7 +6,6 @@ #include "ck_tile/core/config.hpp" #include "ck_tile/core/arch/arch.hpp" #include "ck_tile/core/numeric/vector_type.hpp" -#include "ck_tile/ops/gemm/warp/warp_gemm_smfmac_impl.hpp" #include "ck_tile/core/arch/mma/sparse/sparse_traits.hpp" namespace ck_tile::core::arch::mma { @@ -31,25 +30,12 @@ struct amdgcn_mma CVecType + exec(AVecType const& aVec, BVecType const& bVec, CVecType const& cVec, int32_t idx) -> CVecType { - static constexpr index_t ABVecN = vector_traits::vector_size; - static constexpr index_t kCompressionRatio = 2; - static constexpr index_t CompressedSize = ABVecN / kCompressionRatio; - using AVecCompressed = ext_vector_t; - - static_assert(CompressedSize == 4); - // TODO: Compressing A on-the-fly should be OK for now, but we need to validate - // and evaluate changing this to a transform at a higher level. - // aVec not being const can cause problems when running multiple intrinsics. - const uint32_t idx = ck_tile::compress_a_impl(aVec); - - const AVecCompressed a_vec_pruned = {aVec[0], aVec[1], aVec[2], aVec[3]}; - using namespace sparse::detail; static constexpr BuiltinParams PARAMS = getBuiltinParams(); return {__builtin_amdgcn_smfmac_f32_16x16x32_f16( - a_vec_pruned, bVec, cVec, idx, PARAMS.UseFirstIndex, PARAMS.ByteIndexToOverride)}; + aVec, bVec, cVec, idx, PARAMS.UseFirstIndex, PARAMS.ByteIndexToOverride)}; } }; diff --git a/include/ck_tile/core/arch/mma/sparse/sparse_traits.hpp b/include/ck_tile/core/arch/mma/sparse/sparse_traits.hpp index 946a44c221..a551d9b08c 100644 --- a/include/ck_tile/core/arch/mma/sparse/sparse_traits.hpp +++ b/include/ck_tile/core/arch/mma/sparse/sparse_traits.hpp @@ -43,18 +43,15 @@ struct BuiltinParams template static constexpr BuiltinParams getBuiltinParams() { - BuiltinParams params; + // TODO c++20: designated initializers if constexpr(Idx == SparseCompressionIndex::FIRST) { - params.UseFirstIndex = 1; - params.ByteIndexToOverride = 0; + return BuiltinParams{1, 0}; } else { - params.UseFirstIndex = 0; - params.ByteIndexToOverride = static_cast(Idx); + return BuiltinParams{0, static_cast(Idx)}; } - return params; } } // namespace sparse::detail diff --git a/include/ck_tile/core/arch/mma/sparse/wmma/sparse_gfx12.hpp b/include/ck_tile/core/arch/mma/sparse/wmma/sparse_gfx12.hpp index 7981fd91aa..0648a45b29 100644 --- a/include/ck_tile/core/arch/mma/sparse/wmma/sparse_gfx12.hpp +++ b/include/ck_tile/core/arch/mma/sparse/wmma/sparse_gfx12.hpp @@ -7,7 +7,6 @@ #include "ck_tile/core/arch/arch.hpp" #include "ck_tile/core/arch/mma/amdgcn_mma.hpp" #include "ck_tile/core/numeric/vector_type.hpp" -#include "ck_tile/ops/gemm/warp/warp_gemm_smfmac_impl.hpp" #include "ck_tile/core/arch/mma/sparse/sparse_traits.hpp" namespace ck_tile::core::arch::mma { @@ -21,23 +20,9 @@ struct amdgcn_mma CVecType + exec(AVecType const& aVec, BVecType const& bVec, CVecType const& cVec, int32_t idx) -> CVecType { - static constexpr index_t ABVecN = vector_traits::vector_size; - static constexpr index_t kCompressionRatio = 2; - static constexpr index_t CompressedSize = ABVecN / kCompressionRatio; - using AVecCompressed = ext_vector_t; - - static_assert(CompressedSize == 8); - // TODO: Compressing A on-the-fly should be OK for now, but we need to validate - // and evaluate changing this to a transform at a higher level. - // aVec not being const can cause problems when running multiple intrinsics. - const uint32_t idx = ck_tile::compress_a_impl(aVec); - - const AVecCompressed a_vec_pruned = { - aVec[0], aVec[1], aVec[2], aVec[3], aVec[4], aVec[5], aVec[6], aVec[7]}; - - return {__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32(a_vec_pruned, bVec, cVec, idx)}; + return {__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32(aVec, bVec, cVec, idx)}; } }; diff --git a/include/ck_tile/core/arch/mma/utility/tile_distribution_encoding_calculator.hpp b/include/ck_tile/core/arch/mma/utility/tile_distribution_encoding_calculator.hpp new file mode 100644 index 0000000000..948e302fce --- /dev/null +++ b/include/ck_tile/core/arch/mma/utility/tile_distribution_encoding_calculator.hpp @@ -0,0 +1,114 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include "ck_tile/core/tensor/tile_distribution.hpp" +#include "ck_tile/core/arch/mma/utility/tile_distribution_encoding_register_mapper.hpp" + +namespace ck_tile::core::arch::mma { +/** + * @class TileDistrEncCalc + * @brief Given an MmaOp and modifiers, provides warp-level tile distribution encodings for mapping + * ABC matrix fragment coordinates to register coordinates (lane, vector item) and vice versa. + * @tparam MmaOp Intrinsic (amdgcn_mma). + * @tparam CTranspose Whether we are using CTranspose. + * @tparam SFactor Swizzle factor. Not implemented. + * @tparam AttrNumAccessA Requested NumAccess for the A matrix. Must be multiple of "fundamental" + * NumAccess for intrinsic. See details in amdgcn_mma.hpp. + * @tparam AttrNumAccessB Requested NumAccess for the B matrix. + */ +template +struct TileDistrEncCalc +{ + private: + static constexpr index_t NumAccessA = std::max(MmaOp::kAKNumAccess, AttrNumAccessA); + static constexpr index_t NumAccessB = std::max(MmaOp::kBKNumAccess, AttrNumAccessB); + + // We are free to choose any NumAccess value to manipulate the load / store behavior, unless the + // intrinsic fundamentally requires a base NumAccess factor for the layout to be correct. + static_assert(AttrNumAccessA % MmaOp::kAKNumAccess == 0, + "Requesting NumAccessA incompatible with builtin."); + static_assert(AttrNumAccessB % MmaOp::kBKNumAccess == 0, + "Requesting NumAccessB incompatible with builtin."); + + static_assert(MmaOp::kABKPerLane % NumAccessA == 0); + static_assert(MmaOp::kABKPerLane % NumAccessB == 0); + static_assert(SFactor == 1, "Swizzle not implemented yet."); // TODO: Implement Swizzle. + + template + using ABWarpDstrEnc = tile_distribution_encoding< + sequence, + tuple, + sequence>, + tuple>, + tuple>, + sequence<2, 2>, + sequence<0, 2>>; + + static constexpr auto get_cwarp_dstr_encoding() + { + // We unmerge the M and N dimensions in the same way every time. + using MSubDims = sequence; + using NSubDims = sequence; + + // In case of CTranspose, all we do is swap the M and N dimension. + using MatDims = + std::conditional_t, tuple>; + constexpr int MInx = CTranspose ? 2 : 1; + constexpr int NInx = CTranspose ? 1 : 2; + + // For MFMA intrinsics with blocks, the block dimensions might be in the Lane dim or in the + // Vec dim, so we get different merge orderings. + if constexpr(MmaOp::CBlockDimInVecDim) + { + return tile_distribution_encoding, + MatDims, + tuple>, + tuple>, + sequence, + sequence<0, 0, 1, 3>>{}; + } + else + { + return tile_distribution_encoding, + MatDims, + tuple>, + tuple>, + sequence, + sequence<1, 3>>{}; + } + } + + using AEnc_ = ABWarpDstrEnc; + using BEnc_ = ABWarpDstrEnc; + + public: + // When using CTranspose, the A and B matrices are swapped. + using AWarpDstrEncoding = std::conditional_t; + using BWarpDstrEncoding = std::conditional_t; + using CWarpDstrEncoding = decltype(get_cwarp_dstr_encoding()); + + // Some additional consistency checks + static_assert(TileDistrEncRegMap::num_lanes == MmaOp::WaveSize); + static_assert(TileDistrEncRegMap::num_lanes == MmaOp::WaveSize); + static_assert(TileDistrEncRegMap::num_lanes == MmaOp::WaveSize); + + static_assert(TileDistrEncRegMap::num_vector_items == + vector_traits::vector_size); + static_assert(TileDistrEncRegMap::num_vector_items == + vector_traits::vector_size); + static_assert(TileDistrEncRegMap::num_vector_items == + vector_traits::vector_size); +}; +} // namespace ck_tile::core::arch::mma diff --git a/test/ck_tile/core/arch/mma/CMakeLists.txt b/test/ck_tile/core/arch/mma/CMakeLists.txt index 964acfb02a..99ebd6ece3 100644 --- a/test/ck_tile/core/arch/mma/CMakeLists.txt +++ b/test/ck_tile/core/arch/mma/CMakeLists.txt @@ -7,10 +7,11 @@ if(CK_USE_OCP_FP8) list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8) endif() -if(GPU_TARGETS MATCHES "gfx9|gfx12") - add_gtest_executable(test_amdgcn_sparse_mma test_amdgcn_sparse_mma.cpp) - target_compile_options(test_amdgcn_sparse_mma PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS}) -endif() +# TODO: This test is temporarily disabled for cooperation / work planning reasons. Re-enable after merging related work. +# if(GPU_TARGETS MATCHES "gfx9|gfx12") +# add_gtest_executable(test_amdgcn_sparse_mma test_amdgcn_sparse_mma.cpp) +# target_compile_options(test_amdgcn_sparse_mma PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS}) +# endif() if(GPU_TARGETS MATCHES "gfx9") add_gtest_executable(test_amdgcn_mma test_amdgcn_mma.cpp) target_compile_options(test_amdgcn_mma PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS}) @@ -18,10 +19,28 @@ else() message(DEBUG "Skipping ck_tile_gemm tests for current target") endif() -if(GPU_TARGETS MATCHES "gfx9|gfx11|gfx12") - add_gtest_executable(test_amdgcn_mma_layout test_amdgcn_mma_layout.cpp) - target_compile_options(test_amdgcn_mma_layout PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS}) -else() - message(DEBUG "Skipping gfx9|gfx11|gfx12 mma layout validation tests for current target") +if(GPU_TARGETS MATCHES "gfx9") + add_gtest_executable(test_amdgcn_mma_layout_gfx9 test_amdgcn_mma_layout_gfx9.cpp) + target_compile_options(test_amdgcn_mma_layout_gfx9 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS}) +endif() + +if(GPU_TARGETS MATCHES "gfx942|gfx950") + add_gtest_executable(test_amdgcn_mma_layout_gfx942 test_amdgcn_mma_layout_gfx942.cpp) + target_compile_options(test_amdgcn_mma_layout_gfx942 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS}) +endif() + +if(GPU_TARGETS MATCHES "gfx950") + add_gtest_executable(test_amdgcn_mma_layout_gfx950 test_amdgcn_mma_layout_gfx950.cpp) + target_compile_options(test_amdgcn_mma_layout_gfx950 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS}) +endif() + +if(GPU_TARGETS MATCHES "gfx11") + add_gtest_executable(test_amdgcn_mma_layout_gfx11 test_amdgcn_mma_layout_gfx11.cpp) + target_compile_options(test_amdgcn_mma_layout_gfx11 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS}) +endif() + +if(GPU_TARGETS MATCHES "gfx12") + add_gtest_executable(test_amdgcn_mma_layout_gfx12 test_amdgcn_mma_layout_gfx12.cpp) + target_compile_options(test_amdgcn_mma_layout_gfx12 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS}) endif() diff --git a/test/ck_tile/core/arch/mma/test_amdgcn_mma_layout.cpp b/test/ck_tile/core/arch/mma/test_amdgcn_mma_layout.cpp deleted file mode 100644 index b25d7191e2..0000000000 --- a/test/ck_tile/core/arch/mma/test_amdgcn_mma_layout.cpp +++ /dev/null @@ -1,304 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT - -#include -#include - -#include "ck_tile/host/hip_check_error.hpp" -#include "ck_tile/host/stream_config.hpp" -#include "ck_tile/host/device_memory.hpp" -#include "ck_tile/host/kernel_launch.hpp" -#include "ck_tile/core/arch/arch.hpp" -#include "ck_tile/core/utility/env.hpp" - -#include "test/ck_tile/core/arch/mma/test_amdgcn_mma_layout_util.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace ck = ck_tile; -namespace mma = ck_tile::core::arch::mma; - -// MMA register layout validation test for amdgcn_mma structs. -// -// Strategy: for every (m, k, n) triple in the tile, the test constructs a pair of input tensors -// A and B that contain exactly one non-zero element each, placed so that their product -// contributes to a single output element C(m, n): -// -// A (M x K) B (K x N) C = A * B (M x N) -// . . . . . . . . . . . . . . . . . . . . . . . . -// . . . . . . . . . . . . . . . . . . . . . . . . -// . . . 1 . . . . . . . . . . . . . . . . . . . . -// . . . . . . . . . . . 1 . . . . . . . . . 1 . . -// . . . . . . . . . . . . . . . . . . . . . . . . -// A(m,k) = 1 B(k,n) = 1 C(m,n) = 1 -// -// The kernel uses RegisterMap to scatter A and B into the correct (lane, vecIdx) positions -// of the MMA fragment registers, executes the intrinsic, then uses RegisterMap again to -// gather back into C matrix. The position of "1" in C is checked against the expected (m, n) -// location. - -namespace { - -/** - * @class MmaLayoutTestKernel - * @brief Device kernel that performs C = AB using a given Mma op - * - * @tparam ADataType Data type of tensor A elements - * @tparam BDataType Data type of tensor B elements - * @tparam CDataType Data type of tensor C elements - * @tparam FragM M-dimension of the MMA tile - * @tparam FragN N-dimension of the MMA tile - * @tparam FragK K-dimension of the MMA tile - * @tparam BlockSize HIP block size - */ -template -struct MmaLayoutTestKernel -{ - static constexpr int kBlockSize = BlockSize; - - __device__ void operator()(uint32_t* error_flags) const - { - using Selector = - mma::MmaDefaultSelector; - using MmaOp = typename Selector::SelectedOp; - - if constexpr(mma::MmaOpTraits::IsSupported) - { - using AVecType = typename MmaOp::AVecType; - using BVecType = typename MmaOp::BVecType; - using CVecType = typename MmaOp::CVecType; - constexpr uint32_t a_vec_size = vector_traits::vector_size; - constexpr uint32_t b_vec_size = vector_traits::vector_size; - constexpr uint32_t c_vec_size = vector_traits::vector_size; - - const uint32_t lane = threadIdx.x; - - AVecType a_frag{}; - BVecType b_frag{}; - CVecType c_frag{}; - - // get (m, k, n), where "1" should be placed for this block - const uint32_t case_idx = static_cast(blockIdx.x); - const uint32_t m = case_idx / (MmaOp::kK * MmaOp::kN); - const uint32_t k = (case_idx / MmaOp::kN) % MmaOp::kK; - const uint32_t n = case_idx % MmaOp::kN; - - // place a single "1" in A/B fragments using (lane, vecIdx) -> (row, col) mapping - for(uint32_t v = 0; v < a_vec_size; ++v) - { - auto a_coords = RegisterMap::Register2AMap(lane, v); - if(static_cast(a_coords[0]) == m && - static_cast(a_coords[1]) == k) - { - a_frag[v] = static_cast(1); - } - } - - for(uint32_t v = 0; v < b_vec_size; ++v) - { - auto b_coords = RegisterMap::Register2BMap(lane, v); - if(static_cast(b_coords[0]) == n && - static_cast(b_coords[1]) == k) - { - b_frag[v] = static_cast(1); - } - } - - c_frag = MmaOp::exec(a_frag, b_frag, c_frag); - - uint32_t err = 0; - const CDataType tol = static_cast( - 1.0e-1f); // TODO: this tolerance might not be suitable for all data types and - // should be revisited if we add more configurations - for(uint32_t v = 0; v < c_vec_size; ++v) - { - auto c_coords = RegisterMap::Register2CMap(lane, v); - const uint32_t i = static_cast(c_coords[0]); - const uint32_t j = static_cast(c_coords[1]); - - const CDataType expected = - (i == m && j == n) ? static_cast(1) : static_cast(0); - const CDataType value = static_cast(c_frag[v]); - if(fabsf(static_cast(value - expected)) > static_cast(tol)) - { - err = 1; - } - } - - const uint32_t any_err = __any(err); - if(threadIdx.x == 0) - { - error_flags[case_idx] = any_err; - } - } - } -}; - -/** - * @brief Test driver: runs the test for a given MMA configuration. - * - * The testlaunches (mkn) test cases (one per block) to check all possible positions of the "1" in - * the A/B tensors. - * 1. Constructs A and B tensors with a single 1 at A(m,k) and B(k,n). - * 2. Executes MMA intrinsic to compute C tensor. - * 3. Checks if C has the 1 in the expected position. - * - * @tparam Selector Selector for the Mma operation - * @return true if the test ran on hardware; false if skipped (no device or unsupported) - */ -template -bool run_mma_layout_test() -{ - using MmaOp = typename Selector::SelectedOp; - using MmaTraits = mma::MmaOpTraits; - using ADataType = typename MmaOp::ADataType; - using BDataType = typename MmaOp::BDataType; - using CDataType = typename MmaOp::CDataType; - constexpr uint32_t FragM = MmaOp::kM; - constexpr uint32_t FragN = MmaOp::kN; - constexpr uint32_t FragK = MmaOp::kK; - constexpr auto selector_target_id = MmaTraits::CompilerTarget::TARGET_ID; - constexpr auto selector_wave_size = MmaTraits::CompilerTarget::WAVE_SIZE_ID; - - int device_count = 0; - hipDevice_t device{}; - HIP_CHECK_ERROR(hipGetDevice(&device)); - HIP_CHECK_ERROR(hipGetDeviceCount(&device_count)); - - hipDeviceProp_t props{}; - HIP_CHECK_ERROR(hipGetDeviceProperties(&props, device)); - - const auto runtime_target = - ck_tile::core::arch::hip_device_prop_gcn_arch_name_to_amdgcn_target_id(props.gcnArchName); - const bool has_device = device_count > 0; - - if(!has_device || runtime_target == ck_tile::core::arch::amdgcn_target_id::HOST || - runtime_target != selector_target_id || - props.warpSize != static_cast(selector_wave_size)) - { - return false; - } - - constexpr uint32_t total_cases = FragM * FragK * FragN; - ck_tile::DeviceMem d_errors(total_cases * sizeof(uint32_t)); - std::vector h_errors(total_cases, 0u); - - auto* d_error_ptr = static_cast(d_errors.GetDeviceBuffer()); - - std::ignore = hipGetLastError(); - - using Kernel = MmaLayoutTestKernel(selector_wave_size)>; - - std::ignore = - ck_tile::launch_kernel(ck_tile::stream_config{nullptr, false, 0, 0, 1}, - ck_tile::make_kernel(Kernel{}, - dim3(total_cases), - dim3(static_cast(selector_wave_size)), - 0, - d_error_ptr)); - - HIP_CHECK_ERROR(hipMemcpyAsync( - h_errors.data(), d_error_ptr, d_errors.GetBufferSize(), hipMemcpyDeviceToHost)); - HIP_CHECK_ERROR(hipStreamSynchronize(nullptr)); - - for(uint32_t case_idx = 0; case_idx < total_cases; ++case_idx) - { - const uint32_t m = case_idx / (FragK * FragN); - const uint32_t k = (case_idx / FragN) % FragK; - const uint32_t n = case_idx % FragN; - - EXPECT_EQ(h_errors[case_idx], 0u) << "Mismatch for m=" << m << " k=" << k << " n=" << n; - } - - return true; -} - -} // namespace - -// ==================== Test configurations per target ==================== -// TODO: currently we have only 1 specific target per test. This should be revisited to enable all -// the targets within the family (gfx12, gfx11, gfx9) -using MmaGfx1201CompilerTarget = decltype(ck_tile::core::arch::make_amdgcn_gfx12_target< - ck_tile::core::arch::amdgcn_target_id::GFX1201>()); -using MmaGfx90aCompilerTarget = decltype(ck_tile::core::arch::make_amdgcn_gfx9_target< - ck_tile::core::arch::amdgcn_target_id::GFX90A>()); -using MmaGfx1100CompilerTarget = decltype(ck_tile::core::arch::make_amdgcn_gfx11_target< - ck_tile::core::arch::amdgcn_target_id::GFX1100>()); - -using MmaGfx1201Selector = mma::MmaDefaultSelector; -using MmaGfx90aSelector = mma::MmaDefaultSelector; -using MmaGfx1100Selector = mma::MmaDefaultSelector; - -// clang-format off -using KernelTypes = ::testing::Types< - MmaGfx1201Selector, - MmaGfx90aSelector, - MmaGfx1100Selector - >; -// clang-format on - -template -class TestMmaLayout : public ::testing::Test -{ -}; - -TYPED_TEST_SUITE(TestMmaLayout, KernelTypes); - -TYPED_TEST(TestMmaLayout, Mma_16x16x16_F16_F16_F32) -{ - bool executed = run_mma_layout_test(); - - if(!executed) - { - GTEST_SKIP() << "No supported HIP device found. Skipping test."; - } -} diff --git a/test/ck_tile/core/arch/mma/test_amdgcn_mma_layout.inc b/test/ck_tile/core/arch/mma/test_amdgcn_mma_layout.inc new file mode 100644 index 0000000000..ec8ea2a830 --- /dev/null +++ b/test/ck_tile/core/arch/mma/test_amdgcn_mma_layout.inc @@ -0,0 +1,239 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include + +#include "ck_tile/host/hip_check_error.hpp" +#include "ck_tile/host/stream_config.hpp" +#include "ck_tile/host/device_memory.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include "ck_tile/core/arch/arch.hpp" + +#include +#include +#include + +namespace { + +using namespace ck_tile; +using namespace ck_tile::core::arch; +using namespace mma; + +using F16 = fp16_t; +using F32 = fp32_t; +using Target908 = decltype(make_amdgcn_gfx9_target()); +using Target942 = decltype(make_amdgcn_gfx9_target()); +using Target950 = decltype(make_amdgcn_gfx9_target()); +using Target11 = decltype(make_amdgcn_gfx11_target()); +using Target12 = decltype(make_amdgcn_gfx12_target()); + +// MMA register layout validation test for amdgcn_mma structs. +// +// Strategy: for every (m, k, n) triple in the tile, the test constructs a pair of input tensors A +// and B that contain exactly one non-zero element each, placed so that their product contributes to +// a single output element C(m, n): +// +// A (M x K) B (K x N) C = A * B (M x N) +// . . . . . . . . . . . . . . . . . . . . . . . . +// . . . . . . . . . . . . . . . . . . . . . . . . +// . . . 1 . . . . . . . . . . . . . . . . . . . . +// . . . . . . . . . . . 1 . . . . . . . . . 1 . . +// . . . . . . . . . . . . . . . . . . . . . . . . +// A(m,k) = 1 B(k,n) = 1 C(m,n) = 1 +// +// The kernel uses TileDistrEncRegMap to scatter A and B into the correct (lane, vecIdx) positions +// of the MMA fragment registers, executes the intrinsic, then uses TileDistrEncRegMap again to +// gather back into C matrix. The position of "1" in C is checked against the expected (m, n) +// location. + +/** + * @class MmaLayoutTestKernel + * @brief Device kernel that performs C = AB using a given Mma op + * @tparam MmaOp Intrinsic (amdgcn_mma) to be tested + */ +template // TODO: C++20 concept for MmaOp +struct MmaLayoutTestKernel +{ + static constexpr int kBlockSize = MmaOp::WaveSize; + + __device__ void operator()(uint32_t* error_flags) const + { + using ARegMap = TileDistrEncRegMap::AWarpDstrEncoding>; + using BRegMap = TileDistrEncRegMap::BWarpDstrEncoding>; + using CRegMap = TileDistrEncRegMap::CWarpDstrEncoding>; + + if constexpr(MmaOpTraits::IsSupported) + { + using AVecType = typename MmaOp::AVecType; + using BVecType = typename MmaOp::BVecType; + using CVecType = typename MmaOp::CVecType; + constexpr index_t a_vec_size = vector_traits::vector_size; + constexpr index_t b_vec_size = vector_traits::vector_size; + constexpr index_t c_vec_size = vector_traits::vector_size; + + const index_t lane = threadIdx.x; + + AVecType a_frag{}; + BVecType b_frag{}; + CVecType c_frag{}; + uint32_t sparse_idx{}; + static_assert(MmaOp::kCompressionRatio <= 2); // Allow only 4:2 compression (or no). + + // get (m, k, n), where "1" should be placed for this block + const index_t case_idx = blockIdx.x; + const index_t m = case_idx / (MmaOp::kK * MmaOp::kN); + const index_t k = (case_idx / MmaOp::kN) % MmaOp::kK; + const index_t n = case_idx % MmaOp::kN; + + // place a single "1" in A/B fragments using (lane, vecIdx) -> (row, col) mapping + for(index_t v = 0; v < a_vec_size; ++v) + { + auto a_coords = ARegMap::calc_matrix_indices_from_lane_vector(lane, v); + + // When dealing with sparse intrinsics, the A matrix is compressed in the K + // direction and we just put our "1" in the k / 2 position (rounded down). + if(a_coords[0] == m && a_coords[1] == (k / MmaOp::kCompressionRatio)) + { + a_frag[v] = 1; + + // Calc an appropriate sparse idx value for a single 1 in position k. We use a + // baseline index of 0x88888888. This sends each compressed index i to + // uncompressed index i * 2. If k is odd, we should send it to i * 2 + 1 + // instead. We update only the absolutely necessary pair of bits for this + // (idx[v*2:v*2+1]). Note that this simple calculation works for any 4:2 sparse + // intrinsic with up to 16 packed k elements per lane. + sparse_idx = 0x88888888 | ((k % 2) << (v * 2)); + } + } + + for(index_t v = 0; v < b_vec_size; ++v) + { + auto b_coords = BRegMap::calc_matrix_indices_from_lane_vector(lane, v); + if(b_coords[0] == n && b_coords[1] == k) + { + b_frag[v] = 1; + } + } + + if constexpr(MmaOpTraits::IsSparse) + { + c_frag = MmaOp::exec(a_frag, b_frag, c_frag, sparse_idx); + } + else + { + c_frag = MmaOp::exec(a_frag, b_frag, c_frag); + } + + // TODO: this tolerance might not be suitable for all data types and + // should be revisited if we add more configurations + const float tolerance = 1.0e-1f; + index_t err = 0; + + for(index_t v = 0; v < c_vec_size; ++v) + { + auto c_coords = CRegMap::calc_matrix_indices_from_lane_vector(lane, v); + + const float expected = (c_coords[0] == m && c_coords[1] == n) ? 1 : 0; + const float value = static_cast(c_frag[v]); + if(std::fabs(value - expected) > tolerance) + { + err = 1; + } + } + + const uint32_t any_err = __any(err); + if(threadIdx.x == 0) + { + error_flags[case_idx] = any_err; + } + } + } +}; + +/** + * @brief Test driver: runs the test for a given MMA configuration. + * + * The testlaunches (mkn) test cases (one per block) to check all possible positions of the "1" in + * the A/B tensors. + * 1. Constructs A and B tensors with a single 1 at A(m,k) and B(k,n). + * 2. Executes MMA intrinsic to compute C tensor. + * 3. Checks if C has the 1 in the expected position. + * + * @tparam MmaOp Intrinsic (amdgcn_mma) to be tested + */ +template // TODO: C++20 concept for MmaOp +void run_mma_layout_test() +{ + EXPECT_TRUE(MmaOpTraits::IsSupported) << "Unsupported MmaOp! Bad MmaOp in list!\n"; + + int device_count = 0; + hipDevice_t device{}; + HIP_CHECK_ERROR(hipGetDevice(&device)); + HIP_CHECK_ERROR(hipGetDeviceCount(&device_count)); + EXPECT_TRUE(device_count > 0) << "No device found!"; + + hipDeviceProp_t props{}; + HIP_CHECK_ERROR(hipGetDeviceProperties(&props, device)); + EXPECT_EQ(props.warpSize, static_cast(MmaOp::WaveSize)) + << "Device wavesize " << props.warpSize << " != Mma wavesize " << MmaOp::WaveSize; + + constexpr uint32_t total_cases = MmaOp::kM * MmaOp::kN * MmaOp::kK; + ck_tile::DeviceMem d_errors(total_cases * sizeof(uint32_t)); + std::vector h_errors(total_cases, 0u); + + auto* d_error_ptr = static_cast(d_errors.GetDeviceBuffer()); + + (void)hipGetLastError(); + + using Kernel = MmaLayoutTestKernel; + + ck_tile::launch_kernel( + ck_tile::stream_config{nullptr, false, 0, 0, 1}, + ck_tile::make_kernel(Kernel{}, dim3(total_cases), dim3(MmaOp::WaveSize), 0, d_error_ptr)); + + HIP_CHECK_ERROR(hipMemcpyAsync( + h_errors.data(), d_error_ptr, d_errors.GetBufferSize(), hipMemcpyDeviceToHost)); + HIP_CHECK_ERROR(hipStreamSynchronize(nullptr)); + + for(uint32_t case_idx = 0; case_idx < total_cases; ++case_idx) + { + const uint32_t m = case_idx / (MmaOp::kK * MmaOp::kN); + const uint32_t k = (case_idx / MmaOp::kN) % MmaOp::kK; + const uint32_t n = case_idx % MmaOp::kN; + + EXPECT_EQ(h_errors[case_idx], 0u) << "Mismatch for m=" << m << " k=" << k << " n=" << n; + } +} + +// Lists of intrinsics to test. +// clang-format off +using Gfx9Intrinsics = ::testing::Types< + amdgcn_mma, // mfma_f32_16x16x16f16 + amdgcn_mma, // mfma_f32_32x32x4f16 + amdgcn_mma, // mfma_f32_32x32x4f16 + amdgcn_mma, // mfma_f32_4x4x4f16 + amdgcn_mma // mfma_f32_4x4x4f16 + >; +using Gfx942Intrinsics = ::testing::Types< + amdgcn_mma // smfmac_f32_16x16x32_f16 +>; +using Gfx950Intrinsics = ::testing::Types< + amdgcn_mma // mfma_f32_16x16x32_f16 +>; +using Gfx11Intrinsics = ::testing::Types< + amdgcn_mma, Target11, MmaOpFamily::DENSE> // wmma_f32_16x16x16_f16_w32 +>; +using Gfx12Intrinsics = ::testing::Types< + amdgcn_mma, Target12, MmaOpFamily::DENSE>, // wmma_f32_16x16x16_f16_w32_gfx12 + amdgcn_mma // swmmac_f32_16x16x32_f16_w32 +>; +// clang-format on + +template +class TestMmaLayout : public ::testing::Test +{ +}; +} // namespace diff --git a/test/ck_tile/core/arch/mma/test_amdgcn_mma_layout_gfx11.cpp b/test/ck_tile/core/arch/mma/test_amdgcn_mma_layout_gfx11.cpp new file mode 100644 index 0000000000..618f0bfee4 --- /dev/null +++ b/test/ck_tile/core/arch/mma/test_amdgcn_mma_layout_gfx11.cpp @@ -0,0 +1,6 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "test_amdgcn_mma_layout.inc" +TYPED_TEST_SUITE(TestMmaLayout, Gfx11Intrinsics); +TYPED_TEST(TestMmaLayout, Gfx11Intrinsics) { run_mma_layout_test(); } diff --git a/test/ck_tile/core/arch/mma/test_amdgcn_mma_layout_gfx12.cpp b/test/ck_tile/core/arch/mma/test_amdgcn_mma_layout_gfx12.cpp new file mode 100644 index 0000000000..74b294b74c --- /dev/null +++ b/test/ck_tile/core/arch/mma/test_amdgcn_mma_layout_gfx12.cpp @@ -0,0 +1,6 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "test_amdgcn_mma_layout.inc" +TYPED_TEST_SUITE(TestMmaLayout, Gfx12Intrinsics); +TYPED_TEST(TestMmaLayout, Gfx12Intrinsics) { run_mma_layout_test(); } diff --git a/test/ck_tile/core/arch/mma/test_amdgcn_mma_layout_gfx9.cpp b/test/ck_tile/core/arch/mma/test_amdgcn_mma_layout_gfx9.cpp new file mode 100644 index 0000000000..91e219d1fb --- /dev/null +++ b/test/ck_tile/core/arch/mma/test_amdgcn_mma_layout_gfx9.cpp @@ -0,0 +1,6 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "test_amdgcn_mma_layout.inc" +TYPED_TEST_SUITE(TestMmaLayout, Gfx9Intrinsics); +TYPED_TEST(TestMmaLayout, Gfx9Intrinsics) { run_mma_layout_test(); } diff --git a/test/ck_tile/core/arch/mma/test_amdgcn_mma_layout_gfx942.cpp b/test/ck_tile/core/arch/mma/test_amdgcn_mma_layout_gfx942.cpp new file mode 100644 index 0000000000..f7b2a8a0f7 --- /dev/null +++ b/test/ck_tile/core/arch/mma/test_amdgcn_mma_layout_gfx942.cpp @@ -0,0 +1,6 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "test_amdgcn_mma_layout.inc" +TYPED_TEST_SUITE(TestMmaLayout, Gfx942Intrinsics); +TYPED_TEST(TestMmaLayout, Gfx942Intrinsics) { run_mma_layout_test(); } diff --git a/test/ck_tile/core/arch/mma/test_amdgcn_mma_layout_gfx950.cpp b/test/ck_tile/core/arch/mma/test_amdgcn_mma_layout_gfx950.cpp new file mode 100644 index 0000000000..3a78f88621 --- /dev/null +++ b/test/ck_tile/core/arch/mma/test_amdgcn_mma_layout_gfx950.cpp @@ -0,0 +1,6 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "test_amdgcn_mma_layout.inc" +TYPED_TEST_SUITE(TestMmaLayout, Gfx950Intrinsics); +TYPED_TEST(TestMmaLayout, Gfx950Intrinsics) { run_mma_layout_test(); } diff --git a/test/ck_tile/core/arch/mma/test_amdgcn_mma_layout_util.hpp b/test/ck_tile/core/arch/mma/test_amdgcn_mma_layout_util.hpp deleted file mode 100644 index 3b33fa56a6..0000000000 --- a/test/ck_tile/core/arch/mma/test_amdgcn_mma_layout_util.hpp +++ /dev/null @@ -1,306 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT - -#pragma once - -#include "ck_tile/core.hpp" -#include "ck_tile/core/arch/arch.hpp" -#include "ck_tile/core/arch/mma/amdgcn_mma.hpp" -#include "ck_tile/core/arch/mma/mma_selector.hpp" -#include "ck_tile/core/numeric/half.hpp" -#include "ck_tile/core/numeric/vector_type.hpp" -#include "ck_tile/core/arch/mma/utility/tile_distribution_encoding_register_mapper.hpp" - -#include -#include - -namespace { - -using namespace ck_tile; - -/** - * @class RegisterMapTraits - * @brief Traits class that defines tile_distribution_encoding for each MmaOp - * @tparam MmaOp amdgcn_mma specialization - */ -template -struct RegisterMapTraits -{ - static_assert(sizeof(MmaOp) == 0, "RegisterMapTraits requires a specialization"); -}; - -/** - * @class RegisterMap - * @brief Uses specialized RegisterMapTraits to get the encoding - * @tparam MmaOp amdgcn_mma specialization - */ -template -struct RegisterMap -{ - using Traits = RegisterMapTraits; - - using AMap = core::arch::mma::TileDistrEncRegMap; - using BMap = core::arch::mma::TileDistrEncRegMap; - using CMap = core::arch::mma::TileDistrEncRegMap; - - CK_TILE_HOST_DEVICE static auto Register2AMap(const uint32_t lane, const uint32_t vecIdx) - { - return AMap::calc_matrix_indices_from_lane_vector(static_cast(lane), - static_cast(vecIdx)); - } - - CK_TILE_HOST_DEVICE static auto Register2BMap(const uint32_t lane, const uint32_t vecIdx) - { - return BMap::calc_matrix_indices_from_lane_vector(static_cast(lane), - static_cast(vecIdx)); - } - - CK_TILE_HOST_DEVICE static auto Register2CMap(const uint32_t lane, const uint32_t vecIdx) - { - return CMap::calc_matrix_indices_from_lane_vector(static_cast(lane), - static_cast(vecIdx)); - } -}; - -// ====================== Specializations per target ===================== - -/** - * @brief RegisterMapTraits for GFX12 WMMA 16x16x16_F16_F16_F32_GFX12 - */ -template -struct RegisterMapTraits>> -{ - using MmaOp = ck_tile::core::arch::mma::amdgcn_mma; - - static constexpr index_t AVecSize = vector_traits::vector_size; - static constexpr index_t BVecSize = vector_traits::vector_size; - static constexpr index_t CVecSize = vector_traits::vector_size; - - using kABPs2RHssMajor = sequence<2, 1>; - using kABPs2RHssMinor = sequence<1, 0>; - using kABYs2RHsMajor = sequence<2, 2>; - using kABYs2RHsMinor = sequence<0, 2>; - using kCPs2RHssMajor = sequence<1, 2>; - using kCPs2RHssMinor = sequence<1, 0>; - using kCYs2RHsMajor = sequence<1, 1>; - using kCYs2RHsMinor = sequence<0, 2>; - - // TODO: remove these and fix constants in amdgcn_mma - static constexpr index_t kAMBlock = 1; - static constexpr index_t kBNBlock = 1; - static constexpr index_t kAMLane = 16; - static constexpr index_t kBNLane = 16; - static constexpr index_t kABK0PerLane = 1; - static constexpr index_t kABKLane = 2; - static constexpr index_t kABK1PerLane = 8; - static constexpr index_t kCMLane = 2; - static constexpr index_t kCNLane = 16; - static constexpr index_t kCM0PerLane = 1; - static constexpr index_t kCM1PerLane = 8; - - using AWarpDstrEncoding = tile_distribution_encoding< - sequence<1>, - tuple, sequence>, // <16>, <1, 2, 8> - tuple, - tuple, - kABYs2RHsMajor, - kABYs2RHsMinor>; - - using BWarpDstrEncoding = tile_distribution_encoding< - sequence<1>, - tuple, sequence>, // <16>, <1, 2, 8> - tuple, - tuple, - kABYs2RHsMajor, - kABYs2RHsMinor>; - - using CWarpDstrEncoding = - tile_distribution_encoding, - tuple, - sequence>, // <1, 2, 8>, <16> - tuple, - tuple, - kCYs2RHsMajor, - kCYs2RHsMinor>; -}; - -/** - * @brief RegisterMapTraits for GFX9 MFMA 16x16x16_F16_F16_F32_GFX9 - */ -template -struct RegisterMapTraits>> -{ - using MmaOp = ck_tile::core::arch::mma::amdgcn_mma; - - static constexpr index_t AVecSize = vector_traits::vector_size; - static constexpr index_t BVecSize = vector_traits::vector_size; - static constexpr index_t CVecSize = vector_traits::vector_size; - - using kABPs2RHssMajor = sequence<2, 1>; - using kABPs2RHssMinor = sequence<0, 0>; - using kABYs2RHsMajor = sequence<2>; - using kABYs2RHsMinor = sequence<1>; - using kCPs2RHssMajor = sequence<1, 2>; - using kCPs2RHssMinor = sequence<0, 0>; - using kCYs2RHsMajor = sequence<1>; - using kCYs2RHsMinor = sequence<1>; - - // TODO: remove these and fix constants in amdgcn_mma - static constexpr index_t kAMBlock = 1; - static constexpr index_t kBNBlock = 1; - static constexpr index_t kAMLane = 16; - static constexpr index_t kBNLane = 16; - static constexpr index_t kABKLane = 4; - static constexpr index_t kABKPerLane = 4; - static constexpr index_t kCMLane = 4; - static constexpr index_t kCNLane = 16; - static constexpr index_t kCM0PerLane = 1; - static constexpr index_t kCM1PerLane = 4; - - using AWarpDstrEncoding = - tile_distribution_encoding, - tuple, sequence>, - tuple, - tuple, - kABYs2RHsMajor, - kABYs2RHsMinor>; - - using BWarpDstrEncoding = - tile_distribution_encoding, - tuple, sequence>, - tuple, - tuple, - kABYs2RHsMajor, - kABYs2RHsMinor>; - - using CWarpDstrEncoding = - tile_distribution_encoding, - tuple, sequence>, - tuple, - tuple, - kCYs2RHsMajor, - kCYs2RHsMinor>; -}; - -/** - * @brief RegisterMapTraits for GFX11 WMMA 16x16x16_F16_F16_F32_GFX11 - */ -template -struct RegisterMapTraits>> -{ - using MmaOp = ck_tile::core::arch::mma::amdgcn_mma; - - static constexpr index_t AVecSize = vector_traits::vector_size; - static constexpr index_t BVecSize = vector_traits::vector_size; - static constexpr index_t CVecSize = vector_traits::vector_size; - - using kABPs2RHssMajor = sequence<0, 1>; - using kABPs2RHssMinor = sequence<0, 0>; - using kABYs2RHsMajor = sequence<2>; - using kABYs2RHsMinor = sequence<0>; - using kCPs2RHssMajor = sequence<1, 2>; - using kCPs2RHssMinor = sequence<1, 0>; - using kCYs2RHsMajor = sequence<1>; - using kCYs2RHsMinor = sequence<0>; - - // TODO: remove these and fix constants in amdgcn_mma - static constexpr index_t kAMBlock = 1; - static constexpr index_t kBNBlock = 1; - static constexpr index_t kAMLane = 16; - static constexpr index_t kBNLane = 16; - static constexpr index_t kABK0PerLane = 1; - static constexpr index_t kABKLane = 1; - static constexpr index_t kABK1PerLane = 16; - static constexpr index_t kCMLane = 2; - static constexpr index_t kCNLane = 16; - static constexpr index_t kCM0PerLane = 8; - static constexpr index_t kCM1PerLane = 1; - - using AWarpDstrEncoding = - tile_distribution_encoding, // kRepeat - tuple, sequence>, - tuple, - tuple, - kABYs2RHsMajor, - kABYs2RHsMinor>; - - using BWarpDstrEncoding = - tile_distribution_encoding, // kRepeat - tuple, sequence>, - tuple, - tuple, - kABYs2RHsMajor, - kABYs2RHsMinor>; - - using CWarpDstrEncoding = - tile_distribution_encoding, - tuple, sequence>, - tuple, - tuple, - kCYs2RHsMajor, - kCYs2RHsMinor>; -}; - -// ======================================================================== - -} // namespace