[rocm-libraries] ROCm/rocm-libraries#4804 (commit 832dd0e)

Add Tile Distribution Encoding Register Mapping debug utility for MFMA / WMMA unification work. (#4804) ## Motivation This PR adds a small utility that allows you to use Tile Distribution Encodings to directly map matrix elements to register locations and vice versa. It can also print forward and backward layout mappings similar to the Matrix Calculator utility. The utility is not meant for index calculations in actual kernels, but rather as a debugging tool and probably for automated verification of the policy structs in the new WMMA / MFMA unification design. ## Technical Details Tile Distribution Encodings are a core part of CK Tile which can define the relationship between register and intrinsic matrix fragment elements. They allow for any mapping based on unmerge and merge transformations. Also, they allow for a special "Repeat" dimensions which acts like an additional matrix dimension and allows for replication of certain matrix elements. The new mapping utility can deal with all aspects. ## Test Plan Since this is a debug utility there is nothing to directly test, but there is an example file that defines four different Tile Distribution Encodings and prints their forward and backward mappings, along with some extra parameters. ## Test Result ## Submission Checklist - [x] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.
2026-06-05 20:55:59 +00:00 · 2026-03-02 16:42:42 +00:00
parent a8e2ec22cf
commit b042e1805a
4 changed files with 256 additions and 0 deletions
--- a/example/ck_tile/51_tile_distr_enc_reg_map/CMakeLists.txt
+++ b/example/ck_tile/51_tile_distr_enc_reg_map/CMakeLists.txt
@@ -0,0 +1,4 @@
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+add_executable(tile_example_tile_distr_enc_reg_map example_tile_distr_enc_reg_map.cpp)
--- a/example/ck_tile/51_tile_distr_enc_reg_map/example_tile_distr_enc_reg_map.cpp
+++ b/example/ck_tile/51_tile_distr_enc_reg_map/example_tile_distr_enc_reg_map.cpp
@@ -0,0 +1,76 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include <stdio.h>
+#include "ck_tile/core/arch/mma/utility/tile_distribution_encoding_register_mapper.hpp"
+
+using namespace ck_tile;
+using namespace ck_tile::core::arch::mma;
+
+int main()
+{
+    // Define some tile distribution encodings and print register mappings.
+
+    printf("Example RDNA3 V_WMMA_F32_16X16X16_F16 A Matrix (M, K)\nL{RM} V{K}\n");
+    TileDistrEncRegMap<
+        tile_distribution_encoding<sequence<2>, // R (= Repeat) Lanes 0-15 are duplicated at 16-31
+                                   tuple<sequence<16>, sequence<16>>, // H (= Hidden dims = unmerged
+                                                                      // dims) for M, K dimension
+                                   tuple<sequence<0, 1>>, // P major (= Parallelism = lanes)
+                                   tuple<sequence<0, 0>>, // P minor
+                                   sequence<2>,           // Y major (= Yield = Vector items)
+                                   sequence<0>            // Y minor
+                                   >>::print();
+
+    printf("\nExample RDNA3 V_WMMA_F32_16X16X16_F16 C Matrix (M, N)\nM{2, 1} L{M1N} V{M2M0} (dummy "
+           "unmerge to be more similar to other layouts)\n");
+    TileDistrEncRegMap<
+        tile_distribution_encoding<sequence<>,                             // R (= Repeat)
+                                   tuple<sequence<8, 2, 1>, sequence<16>>, // H (= Hidden dims =
+                                                                           // unmerged dims) for M,
+                                                                           // N dimension
+                                   tuple<sequence<1, 2>>, // P major (= Parallelism = lanes)
+                                   tuple<sequence<1, 0>>, // P minor
+                                   sequence<1, 1>,        // Y major (= Yield = Vector items)
+                                   sequence<0, 2>         // Y minor
+                                   >>::print();
+
+    printf("\nExample CDNA __builtin_amdgcn_mfma_f32_4x4x4f16 A Matrix (M, K) with 16x "
+           "block-hiding in the M dimension\nL{BM} V{K}\n");
+    TileDistrEncRegMap<
+        tile_distribution_encoding<sequence<>,                          // R (= Repeat)
+                                   tuple<sequence<16, 4>, sequence<4>>, // H (= Hidden dims =
+                                                                        // unmerged dims) for M,
+                                                                        // K dimension
+                                   tuple<sequence<1, 1>>, // P major (= Parallelism = lanes)
+                                   tuple<sequence<0, 1>>, // P minor
+                                   sequence<2>,           // Y major (= Yield = Vector items)
+                                   sequence<0>            // Y minor
+                                   >>::print();
+
+    printf("\nExample CDNA __builtin_amdgcn_mfma_f32_4x4x4f16 B Matrix (N, K) with 16x "
+           "block-hiding in the M dimension\nL{BN} V{K}\n");
+    TileDistrEncRegMap<
+        tile_distribution_encoding<sequence<16>,                    // R (= Repeat)
+                                   tuple<sequence<4>, sequence<4>>, // H (= Hidden dims =
+                                                                    // unmerged dims) for N,
+                                                                    // K dimension
+                                   tuple<sequence<0, 1>>, // P major (= Parallelism = lanes)
+                                   tuple<sequence<0, 0>>, // P minor
+                                   sequence<2>,           // Y major (= Yield = Vector items)
+                                   sequence<0>            // Y minor
+                                   >>::print();
+
+    printf("\nCustom example\n");
+    TileDistrEncRegMap<
+        tile_distribution_encoding<sequence<1>,                            // R (= Repeat)
+                                   tuple<sequence<16>, sequence<1, 2, 8>>, // H (= Hidden dims =
+                                                                           // unmerged dims)
+                                   tuple<sequence<2, 1>>, // P major (= Parallelism = lanes)
+                                   tuple<sequence<1, 0>>, // P minor
+                                   sequence<2, 2>,        // Y major (= Yield = Vector items)
+                                   sequence<0, 2>         // Y minor
+                                   >>::print();
+
+    return 0;
+}
--- a/example/ck_tile/CMakeLists.txt
+++ b/example/ck_tile/CMakeLists.txt
@@ -31,4 +31,5 @@ add_subdirectory(38_block_scale_gemm)
 add_subdirectory(40_streamk_gemm)
 add_subdirectory(41_batched_contraction)
 add_subdirectory(50_sparse_attn)
+add_subdirectory(51_tile_distr_enc_reg_map)

--- a/include/ck_tile/core/arch/mma/utility/tile_distribution_encoding_register_mapper.hpp
+++ b/include/ck_tile/core/arch/mma/utility/tile_distribution_encoding_register_mapper.hpp
@@ -0,0 +1,175 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+/**
+ * @file tile_distribution_encoding_register_mapper.hpp
+ * @brief Utility for register / matrix coordinate mapping from TileDistributionEncoding
+ * @details Defines TileDistrEncRegMap, which takes a TileDistributionEncoding and provides
+ * functions for mapping matrix fragment coordinates to register coordinates (lane, vector item) and
+ * vice versa. This is only meant for tile distributions encodings that describe register mappings.
+ *
+ * A repeat dimension is allowed in which case multiple (lane, vector item) pairs are mapped to the
+ * same matrix coordinates. The inverse map takes a "repeat index" to distinguish between them.
+ *
+ * print() functions are included for printing dimensions and formatted forward and backwards
+ * mappings similar to the AMD Matrix Calculator.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include "ck_tile/core/tensor/tensor_descriptor.hpp"
+#include "ck_tile/core/tensor/tile_distribution.hpp"
+
+namespace ck_tile::core::arch::mma {
+
+// Utility to calculate register mappings from a Tile Distribution Encoding.
+template <typename TileDistrEnc>
+struct TileDistrEncRegMap
+{
+    // Make sure this is a proper Tile Distr Encoding for Lane Vector mapping.
+    static_assert(TileDistrEnc::NDimR <= 1);
+    static_assert(TileDistrEnc::NDimX == 2);
+    static_assert(TileDistrEnc::NDimP == 1);
+
+    static constexpr auto ps_ys_to_xs_adaptor =
+        make_static_tile_distribution(TileDistrEnc{}).get_ps_ys_to_xs_adaptor();
+
+    static constexpr index_t mat_major_size =
+        container_reduce(typename TileDistrEnc::HsLengthss{}[number<0>{}], multiplies<>{}, 1);
+    static constexpr index_t mat_minor_size =
+        container_reduce(typename TileDistrEnc::HsLengthss{}[number<1>{}], multiplies<>{}, 1);
+    static constexpr index_t num_repeat = [] {
+        if constexpr(TileDistrEnc::NDimR > 0)
+        {
+            return typename TileDistrEnc::RsLengths{}[number<0>{}];
+        }
+        else
+        {
+            return 1; // Necessary to deal with empty "repeat" sequences.
+        }
+    }();
+    static constexpr index_t num_lanes = ps_ys_to_xs_adaptor.get_top_dimension_length(number<0>{});
+    static constexpr index_t num_vector_items =
+        container_reduce(TileDistrEnc::detail::ys_lengths_, multiplies<>{}, 1);
+
+    // Check for 0 dims (will break things much earlier but let's have an extra check).
+    static_assert(mat_major_size > 0);
+    static_assert(mat_minor_size > 0);
+    static_assert(num_repeat > 0);
+    static_assert(num_lanes > 0);
+    static_assert(num_vector_items > 0);
+
+    CK_TILE_HOST_DEVICE static constexpr auto
+    calc_matrix_indices_from_lane_vector(index_t lane_inx, index_t vector_inx)
+    {
+        // For some reason the Y dimension is not treated the same as the P dimension and we need to
+        // manually unmerge the Y dimension index into its hidden indices before being able to use
+        // it...
+        array<index_t, TileDistrEnc::NDimY> y_hidden_inx;
+        for(index_t i = TileDistrEnc::NDimY - 1; i >= 0; --i)
+        {
+            y_hidden_inx[i] = vector_inx % TileDistrEnc::detail::ys_lengths_[i];
+            vector_inx /= TileDistrEnc::detail::ys_lengths_[i];
+        }
+
+        const auto ps_ys_idx = container_concat(array<index_t, 1>{lane_inx}, y_hidden_inx);
+        return ps_ys_to_xs_adaptor.calculate_bottom_index(ps_ys_idx);
+    }
+
+    struct LaneVec
+    {
+        index_t lane = -1; // Sentinel for invalid pairs
+        index_t vec  = -1;
+    };
+
+    using InverseMap =
+        std::array<std::array<std::array<LaneVec, num_repeat>, mat_minor_size>, mat_major_size>;
+
+    // TODO: In theory this could be done with inverted merge unmerge operations.
+    CK_TILE_HOST_DEVICE static constexpr InverseMap calc_inverse_map()
+    {
+        InverseMap im{};
+        for(index_t l = 0; l < num_lanes; ++l)
+        {
+            for(index_t v = 0; v < num_vector_items; ++v)
+            {
+                auto res = calc_matrix_indices_from_lane_vector(l, v); // Matrix major, minor inx;
+
+                // We assume that repeated matrix elements appear at increasing L and V indices.
+                for(index_t r = 0; r < num_repeat; r++)
+                {
+                    auto& lv = im[res[0]][res[1]][r];
+                    if(lv.lane < 0)
+                    {
+                        lv.lane = l; // TODO: c++20 designated initializers
+                        lv.vec  = v;
+                    }
+                }
+            }
+        }
+        return im;
+    }
+
+    CK_TILE_HOST_DEVICE static void print_dims()
+    {
+        printf("Matrix dims major, minor, repeat = %d %d %d\n",
+               mat_major_size,
+               mat_minor_size,
+               num_repeat);
+        printf("Num lanes, vector items = %d %d\n", num_lanes, num_vector_items);
+    }
+
+    CK_TILE_HOST_DEVICE static void print_mapping()
+    {
+        printf("(lane, vector) item to matrix element\n L | ");
+        for(index_t v = 0; v < num_vector_items; v++)
+        {
+            printf("vec%2d | ", v);
+        }
+        printf("\n");
+
+        for(index_t l = 0; l < num_lanes; l++)
+        {
+            printf("%2d | ", l);
+            for(index_t v = 0; v < num_vector_items; v++)
+            {
+                auto res = calc_matrix_indices_from_lane_vector(l, v);
+                printf("%2d %2d | ", res[0], res[1]);
+            }
+            printf("\n");
+        }
+    }
+
+    CK_TILE_HOST_DEVICE static void print_inverse_mapping()
+    {
+        InverseMap im = calc_inverse_map();
+        printf("Matrix element to (lane, vector item). Elements are replicated an additional %d "
+               "time(s) in higher lanes. \n",
+               num_repeat - 1);
+        printf("Mat| ");
+        for(index_t k = 0; k < mat_minor_size; k++)
+        {
+            printf("   %2d | ", k);
+        }
+        printf("\n");
+
+        for(index_t m = 0; m < mat_major_size; m++)
+        {
+            printf("%2d | ", m);
+            for(index_t k = 0; k < mat_minor_size; k++)
+            {
+                printf("%2d %2d | ", im[m][k][0].lane, im[m][k][0].vec);
+            }
+            printf("\n");
+        }
+    }
+
+    CK_TILE_HOST_DEVICE static void print()
+    {
+        print_dims();
+        print_mapping();
+        print_inverse_mapping();
+    }
+};
+} // namespace ck_tile::core::arch::mma