diff --git a/example/ck_tile/51_tile_distr_enc_reg_map/CMakeLists.txt b/example/ck_tile/51_tile_distr_enc_reg_map/CMakeLists.txt new file mode 100644 index 0000000000..59352336ce --- /dev/null +++ b/example/ck_tile/51_tile_distr_enc_reg_map/CMakeLists.txt @@ -0,0 +1,4 @@ +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +add_executable(tile_example_tile_distr_enc_reg_map example_tile_distr_enc_reg_map.cpp) diff --git a/example/ck_tile/51_tile_distr_enc_reg_map/example_tile_distr_enc_reg_map.cpp b/example/ck_tile/51_tile_distr_enc_reg_map/example_tile_distr_enc_reg_map.cpp new file mode 100644 index 0000000000..300d52d119 --- /dev/null +++ b/example/ck_tile/51_tile_distr_enc_reg_map/example_tile_distr_enc_reg_map.cpp @@ -0,0 +1,76 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include +#include "ck_tile/core/arch/mma/utility/tile_distribution_encoding_register_mapper.hpp" + +using namespace ck_tile; +using namespace ck_tile::core::arch::mma; + +int main() +{ + // Define some tile distribution encodings and print register mappings. Each ::print() call below prints the dimensions, the (lane, vector) to matrix mapping and the inverse mapping for one encoding.
+ + printf("Example RDNA3 V_WMMA_F32_16X16X16_F16 A Matrix (M, K)\nL{RM} V{K}\n"); + TileDistrEncRegMap< + tile_distribution_encoding, // R (= Repeat) Lanes 0-15 are duplicated at 16-31 + tuple, sequence<16>>, // H (= Hidden dims = unmerged + // dims) for M, K dimension + tuple>, // P major (= Parallelism = lanes) + tuple>, // P minor + sequence<2>, // Y major (= Yield = Vector items) + sequence<0> // Y minor + >>::print(); + + printf("\nExample RDNA3 V_WMMA_F32_16X16X16_F16 C Matrix (M, N)\nM{2, 1} L{M1N} V{M2M0} (dummy " + "unmerge to be more similar to other layouts)\n"); + TileDistrEncRegMap< + tile_distribution_encoding, // R (= Repeat) + tuple, sequence<16>>, // H (= Hidden dims = + // unmerged dims) for M, + // N dimension + tuple>, // P major (= Parallelism = lanes) + tuple>, // P minor + sequence<1, 1>, // Y major (= Yield = Vector items) + sequence<0, 2> // Y minor + >>::print(); + + printf("\nExample CDNA __builtin_amdgcn_mfma_f32_4x4x4f16 A Matrix (M, K) with 16x " + "block-hiding in the M dimension\nL{BM} V{K}\n"); + TileDistrEncRegMap< + tile_distribution_encoding, // R (= Repeat) + tuple, sequence<4>>, // H (= Hidden dims = + // unmerged dims) for M, + // K dimension + tuple>, // P major (= Parallelism = lanes) + tuple>, // P minor + sequence<2>, // Y major (= Yield = Vector items) + sequence<0> // Y minor + >>::print(); + + printf("\nExample CDNA __builtin_amdgcn_mfma_f32_4x4x4f16 B Matrix (N, K) with 16x " + "block-hiding in the M dimension\nL{BN} V{K}\n"); + TileDistrEncRegMap< + tile_distribution_encoding, // R (= Repeat) + tuple, sequence<4>>, // H (= Hidden dims = + // unmerged dims) for N, + // K dimension + tuple>, // P major (= Parallelism = lanes) + tuple>, // P minor + sequence<2>, // Y major (= Yield = Vector items) + sequence<0> // Y minor + >>::print(); + + printf("\nCustom example\n"); + TileDistrEncRegMap< + tile_distribution_encoding, // R (= Repeat) + tuple, sequence<1, 2, 8>>, // H (= Hidden dims = + // unmerged dims) + tuple>, // 
P major (= Parallelism = lanes) + tuple>, // P minor + sequence<2, 2>, // Y major (= Yield = Vector items) + sequence<0, 2> // Y minor + >>::print(); + + return 0; +} diff --git a/example/ck_tile/CMakeLists.txt b/example/ck_tile/CMakeLists.txt index 9646e93b4e..099076d4a9 100644 --- a/example/ck_tile/CMakeLists.txt +++ b/example/ck_tile/CMakeLists.txt @@ -31,4 +31,5 @@ add_subdirectory(38_block_scale_gemm) add_subdirectory(40_streamk_gemm) add_subdirectory(41_batched_contraction) add_subdirectory(50_sparse_attn) +add_subdirectory(51_tile_distr_enc_reg_map) diff --git a/include/ck_tile/core/arch/mma/utility/tile_distribution_encoding_register_mapper.hpp b/include/ck_tile/core/arch/mma/utility/tile_distribution_encoding_register_mapper.hpp new file mode 100644 index 0000000000..53af3da360 --- /dev/null +++ b/include/ck_tile/core/arch/mma/utility/tile_distribution_encoding_register_mapper.hpp @@ -0,0 +1,175 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +/** + * @file tile_distribution_encoding_register_mapper.hpp + * @brief Utility for register / matrix coordinate mapping from TileDistributionEncoding + * @details Defines TileDistrEncRegMap, which takes a TileDistributionEncoding and provides + * functions for mapping matrix fragment coordinates to register coordinates (lane, vector item) and + * vice versa. This is only meant for tile distribution encodings that describe register mappings. + * + * A repeat dimension is allowed, in which case multiple (lane, vector item) pairs are mapped to the + * same matrix coordinates. The inverse map takes a "repeat index" to distinguish between them. + * + * print() functions are included for printing dimensions and formatted forward and backwards + * mappings similar to the AMD Matrix Calculator. 
+ */ + +#pragma once + +#include +#include "ck_tile/core/tensor/tensor_descriptor.hpp" +#include "ck_tile/core/tensor/tile_distribution.hpp" + +namespace ck_tile::core::arch::mma { + +// Utility to calculate register mappings from a Tile Distribution Encoding. +template +struct TileDistrEncRegMap +{ + // Make sure this is a proper Tile Distr Encoding for Lane Vector mapping. + static_assert(TileDistrEnc::NDimR <= 1); + static_assert(TileDistrEnc::NDimX == 2); + static_assert(TileDistrEnc::NDimP == 1); + + static constexpr auto ps_ys_to_xs_adaptor = + make_static_tile_distribution(TileDistrEnc{}).get_ps_ys_to_xs_adaptor(); + + static constexpr index_t mat_major_size = + container_reduce(typename TileDistrEnc::HsLengthss{}[number<0>{}], multiplies<>{}, 1); + static constexpr index_t mat_minor_size = + container_reduce(typename TileDistrEnc::HsLengthss{}[number<1>{}], multiplies<>{}, 1); + static constexpr index_t num_repeat = [] { + if constexpr(TileDistrEnc::NDimR > 0) + { + return typename TileDistrEnc::RsLengths{}[number<0>{}]; + } + else + { + return 1; // Necessary to deal with empty "repeat" sequences. + } + }(); + static constexpr index_t num_lanes = ps_ys_to_xs_adaptor.get_top_dimension_length(number<0>{}); + static constexpr index_t num_vector_items = + container_reduce(TileDistrEnc::detail::ys_lengths_, multiplies<>{}, 1); + + // Check for 0 dims (will break things much earlier but let's have an extra check). + static_assert(mat_major_size > 0); + static_assert(mat_minor_size > 0); + static_assert(num_repeat > 0); + static_assert(num_lanes > 0); + static_assert(num_vector_items > 0); + + CK_TILE_HOST_DEVICE static constexpr auto + calc_matrix_indices_from_lane_vector(index_t lane_inx, index_t vector_inx) + { + // For some reason the Y dimension is not treated the same as the P dimension and we need to + // manually unmerge the Y dimension index into its hidden indices before being able to use + // it... 
+ array y_hidden_inx; + for(index_t i = TileDistrEnc::NDimY - 1; i >= 0; --i) + { + y_hidden_inx[i] = vector_inx % TileDistrEnc::detail::ys_lengths_[i]; + vector_inx /= TileDistrEnc::detail::ys_lengths_[i]; + } + + const auto ps_ys_idx = container_concat(array{lane_inx}, y_hidden_inx); + return ps_ys_to_xs_adaptor.calculate_bottom_index(ps_ys_idx); + } + + struct LaneVec + { + index_t lane = -1; // Sentinel for invalid pairs + index_t vec = -1; + }; + + using InverseMap = + std::array, mat_minor_size>, mat_major_size>; + + // TODO: In theory this could be done with inverted merge unmerge operations. + CK_TILE_HOST_DEVICE static constexpr InverseMap calc_inverse_map() + { + InverseMap im{}; + for(index_t l = 0; l < num_lanes; ++l) + { + for(index_t v = 0; v < num_vector_items; ++v) + { + auto res = calc_matrix_indices_from_lane_vector(l, v); // Matrix major, minor inx; + + // We assume that repeated matrix elements appear at increasing L and V indices. + for(index_t r = 0; r < num_repeat; r++) + { + auto& lv = im[res[0]][res[1]][r]; + if(lv.lane < 0) + { + lv.lane = l; // TODO: c++20 designated initializers + lv.vec = v; + } + } + } + } + return im; + } + + CK_TILE_HOST_DEVICE static void print_dims() + { + printf("Matrix dims major, minor, repeat = %d %d %d\n", + mat_major_size, + mat_minor_size, + num_repeat); + printf("Num lanes, vector items = %d %d\n", num_lanes, num_vector_items); + } + + CK_TILE_HOST_DEVICE static void print_mapping() + { + printf("(lane, vector) item to matrix element\n L | "); + for(index_t v = 0; v < num_vector_items; v++) + { + printf("vec%2d | ", v); + } + printf("\n"); + + for(index_t l = 0; l < num_lanes; l++) + { + printf("%2d | ", l); + for(index_t v = 0; v < num_vector_items; v++) + { + auto res = calc_matrix_indices_from_lane_vector(l, v); + printf("%2d %2d | ", res[0], res[1]); + } + printf("\n"); + } + } + + CK_TILE_HOST_DEVICE static void print_inverse_mapping() + { + InverseMap im = calc_inverse_map(); + printf("Matrix element 
to (lane, vector item). Elements are replicated an additional %d " + "time(s) in higher lanes. \n", + num_repeat - 1); + printf("Mat| "); + for(index_t k = 0; k < mat_minor_size; k++) + { + printf(" %2d | ", k); + } + printf("\n"); + + for(index_t m = 0; m < mat_major_size; m++) + { + printf("%2d | ", m); + for(index_t k = 0; k < mat_minor_size; k++) + { + printf("%2d %2d | ", im[m][k][0].lane, im[m][k][0].vec); + } + printf("\n"); + } + } + + CK_TILE_HOST_DEVICE static void print() + { + print_dims(); + print_mapping(); + print_inverse_mapping(); + } +}; +} // namespace ck_tile::core::arch::mma