Ding, Yi
2026-03-11 23:03:20 -04:00
commit e6cd3f1e3f
6330 changed files with 1132789 additions and 0 deletions

File diff suppressed because it is too large


@@ -0,0 +1,526 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp"
#include "ck_tile/ops/gemm/block/block_gemm_asmem_breg_creg_v1_custom_policy.hpp"
#include "ck_tile/ops/flatmm/block/block_flatmm_asmem_bsmem_creg_v1.hpp"
namespace ck_tile {
struct UniversalFlatmmPipelineAgBgCrPolicy
{
static constexpr auto I0 = number<0>{};
static constexpr auto I1 = number<1>{};
static constexpr auto I2 = number<2>{};
// 3d + padding
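// For 16x16 warp tiles an XOR-permuted (M ^ K0) layout is used; otherwise each K0
// slice is padded by one extra row of kKPack elements (stride (kMPerBlock + 1) * kKPack),
// presumably to reduce LDS bank conflicts.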
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto MakeALdsBlockDescriptor()
{
using namespace ck_tile;
constexpr index_t MPerXdl = Problem::BlockGemmShape::WarpTile::at(I0);
constexpr index_t NPerXdl = Problem::BlockGemmShape::WarpTile::at(I1);
if constexpr(MPerXdl == 16 && NPerXdl == 16)
{
/* fewer transform layers compared with the old CK implementation */
constexpr index_t MPerBlock = Problem::BlockGemmShape::kM;
constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
constexpr index_t KPack = GetSmemPackA<Problem>();
constexpr auto a_lds_block_desc_0 = make_naive_tensor_descriptor(
make_tuple(number<KPerBlock / KPack>{}, number<MPerBlock>{}, number<KPack>{}),
make_tuple(number<KPack>{}, number<KPerBlock>{}, number<1>{}),
number<KPack>{},
number<1>{});
constexpr auto a_lds_block_desc_permuted = transform_tensor_descriptor(
a_lds_block_desc_0,
make_tuple(make_xor_transform(
make_tuple(number<MPerBlock>{}, number<KPerBlock / KPack>{})),
make_pass_through_transform(number<KPack>{})),
make_tuple(sequence<1, 0>{}, sequence<2>{}),
make_tuple(sequence<1, 0>{}, sequence<2>{}));
constexpr auto a_lds_block_desc = transform_tensor_descriptor(
a_lds_block_desc_permuted,
make_tuple(make_pass_through_transform(number<MPerBlock>{}),
make_merge_transform_v3_division_mod(
make_tuple(number<KPerBlock / KPack>{}, number<KPack>{}))),
make_tuple(sequence<1>{}, sequence<0, 2>{}),
make_tuple(sequence<0>{}, sequence<1>{}));
return a_lds_block_desc;
}
else
{
constexpr index_t kMPerBlock = Problem::BlockGemmShape::kM;
constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK;
constexpr index_t kKPack = GetSmemPackA<Problem>();
constexpr auto a_lds_block_desc_0 = make_naive_tensor_descriptor(
make_tuple(number<kKPerBlock / kKPack>{}, number<kMPerBlock>{}, number<kKPack>{}),
make_tuple(number<(kMPerBlock + 1) * kKPack>{}, number<kKPack>{}, number<1>{}),
number<kKPack>{},
number<1>{});
constexpr auto a_lds_block_desc = transform_tensor_descriptor(
a_lds_block_desc_0,
make_tuple(make_pass_through_transform(kMPerBlock),
make_merge_transform(make_tuple(kKPerBlock / kKPack, kKPack))),
make_tuple(sequence<1>{}, sequence<0, 2>{}),
make_tuple(sequence<0>{}, sequence<1>{}));
return a_lds_block_desc;
}
/* Alternative XOR-swizzled (MLdsLayer) layout, kept disabled for reference: */
#if 0
constexpr index_t kMPerBlock = Problem::BlockGemmShape::kM;
constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK;
constexpr index_t kKPack = GetSmemPackA<Problem>();
using ADataType = remove_cvref_t<typename Problem::ADataType>;
constexpr auto DataTypeSize = sizeof(ADataType);
constexpr auto MLdsLayer =
(32 * 4 / kKPerBlock / DataTypeSize) < 1 ? 1 : (32 * 4 / kKPerBlock / DataTypeSize);
constexpr auto a_lds_block_desc_0 = make_naive_tensor_descriptor(
make_tuple(number<kKPerBlock / kKPack * MLdsLayer>{},
number<kMPerBlock / MLdsLayer>{},
number<kKPack>{}),
make_tuple(number<kKPack>{}, number<kKPerBlock * MLdsLayer>{}, number<1>{}),
number<kKPack>{},
number<1>{});
constexpr auto a_lds_block_desc_permuted = transform_tensor_descriptor(
a_lds_block_desc_0,
make_tuple(make_xor_transform(make_tuple(number<kMPerBlock / MLdsLayer>{},
number<kKPerBlock / kKPack * MLdsLayer>{})),
make_pass_through_transform(number<kKPack>{})),
make_tuple(sequence<1, 0>{}, sequence<2>{}),
make_tuple(sequence<1, 0>{}, sequence<2>{}));
constexpr auto a_lds_block_desc_xk0_mnldslayer_mn_xk1 = transform_tensor_descriptor(
a_lds_block_desc_permuted,
make_tuple(make_unmerge_transform(
make_tuple(number<MLdsLayer>{}, number<kKPerBlock / kKPack>{})),
make_pass_through_transform(number<kMPerBlock / MLdsLayer>{}),
make_pass_through_transform(number<kKPack>{})),
make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}),
make_tuple(sequence<0, 2>{}, sequence<1>{}, sequence<3>{}));
constexpr auto a_lds_block_desc = transform_tensor_descriptor(
a_lds_block_desc_xk0_mnldslayer_mn_xk1,
make_tuple(make_merge_transform(
make_tuple(number<kMPerBlock / MLdsLayer>{}, number<MLdsLayer>{})),
make_merge_transform(
make_tuple(number<kKPerBlock / kKPack>{}, number<kKPack>{}))),
make_tuple(sequence<1, 0>{}, sequence<2, 3>{}),
make_tuple(sequence<0>{}, sequence<1>{}));
return a_lds_block_desc;
#endif
}
/**
* @brief Get the maximum global memory vector load size.
*
* @tparam Problem The UniversalGemmPipelineProblem object.
* @tparam DataType The tensor data type we're considering.
* @tparam MNPerBlock The MPerBlock or NPerBlock value depending on tensor (A/B).
* @tparam XPerTile The contiguous Tile dimension size.
* @return Maximum DRAM vector load size.
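*
* Illustrative example (not from this header): for fp16 (PackedSize = 1, 2 bytes)
* with MNPerBlock = 128, KPerBlock = 64, BlockSize = 256 and XPerTile = 64,
* elements_per_thread = 32, so the 16-byte branch is taken and 8 elements
* (one dwordx4 per thread) are returned.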
*/
template <typename Problem, typename DataType, index_t MNPerBlock, index_t XPerTile>
CK_TILE_HOST_DEVICE static constexpr auto GetGlobalVectorLoadSize()
{
constexpr index_t BlockSize = Problem::kBlockSize;
constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
constexpr index_t elements_per_thread = MNPerBlock * KPerBlock / BlockSize;
constexpr index_t PackedSize =
ck_tile::numeric_traits<remove_cvref_t<DataType>>::PackedSize;
// Assumes sizeof(DataType) is a power of two so the divisions below are exact.
if constexpr(XPerTile % (PackedSize * 32 / sizeof(DataType)) == 0 &&
elements_per_thread % (PackedSize * 32 / sizeof(DataType)) == 0 &&
PackedSize == 2)
{
return (PackedSize * 32 / sizeof(DataType));
}
else if constexpr(XPerTile % (PackedSize * 16 / sizeof(DataType)) == 0 &&
elements_per_thread % (PackedSize * 16 / sizeof(DataType)) == 0)
{
return (PackedSize * 16 / sizeof(DataType));
}
else if constexpr(XPerTile % (PackedSize * 8 / sizeof(DataType)) == 0 &&
elements_per_thread % (PackedSize * 8 / sizeof(DataType)) == 0)
{
return (PackedSize * 8 / sizeof(DataType));
}
else if constexpr(sizeof(DataType) >= PackedSize * 4 &&
XPerTile % (PackedSize * 4 / sizeof(DataType)) == 0 &&
elements_per_thread % (PackedSize * 4 / sizeof(DataType)) == 0)
{
return (PackedSize * 4 / sizeof(DataType));
}
else if constexpr(sizeof(DataType) >= PackedSize * 2 &&
XPerTile % (PackedSize * 2 / sizeof(DataType)) == 0 &&
elements_per_thread % (PackedSize * 2 / sizeof(DataType)) == 0)
{
return (PackedSize * 2 / sizeof(DataType));
}
else
{
return PackedSize;
}
}
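// The contiguous (fastest-varying) tile dimension is passed as XPerTile below:
// K for row-major A and column-major B, M or N otherwise.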
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto GetVectorSizeA()
{
using ALayout = remove_cvref_t<typename Problem::ALayout>;
using ADataType = remove_cvref_t<typename Problem::ADataType>;
constexpr index_t MPerBlock = Problem::BlockGemmShape::kM;
constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
if constexpr(std::is_same_v<ALayout, ck_tile::tensor_layout::gemm::RowMajor>)
{
return GetGlobalVectorLoadSize<Problem, ADataType, MPerBlock, KPerBlock>();
}
else
{
return GetGlobalVectorLoadSize<Problem, ADataType, MPerBlock, MPerBlock>();
}
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto GetVectorSizeB()
{
using BLayout = remove_cvref_t<typename Problem::BLayout>;
using BDataType = remove_cvref_t<typename Problem::BDataType>;
constexpr index_t NPerBlock = Problem::BlockGemmShape::kN;
constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
if constexpr(std::is_same_v<BLayout, ck_tile::tensor_layout::gemm::RowMajor>)
{
return GetGlobalVectorLoadSize<Problem, BDataType, NPerBlock, NPerBlock>();
}
else
{
return GetGlobalVectorLoadSize<Problem, BDataType, NPerBlock, KPerBlock>();
}
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSizeA()
{
return sizeof(typename Problem::ADataType) *
MakeALdsBlockDescriptor<Problem>().get_element_space_size();
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
{
return GetSmemSizeA<Problem>();
}
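// LDS K-pack for A: the smaller of the warp GEMM's per-thread K extent and the
// number of A elements covered by one global vector load.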
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr index_t GetSmemPackA()
{
using A = remove_cvref_t<typename Problem::ADataType>;
using BlockFlatmm = remove_cvref_t<decltype(GetBlockFlatmm<Problem>())>;
constexpr index_t KPack = BlockFlatmm::BlockPolicy::WarpGemm::kKPerThread;
constexpr index_t VecElems = Problem::VectorLoadSize / sizeof(A);
return min(KPack, VecElems);
}
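// Number of flat-K elements each lane loads per warp tile of B:
// flatKPerWarp / warp_size, i.e. NPerXdl * KPerXdl / 64 (see the static_assert in
// MakeBFlatDramTileDistribution).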
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto GetKBPerLoad()
{
using TileShape = typename Problem::BlockGemmShape;
if constexpr(TileShape::WarpTile::at(I1) == 32)
{
return TileShape::WarpTile::at(I2) / 2;
}
else
{
static_assert(TileShape::WarpTile::at(I1) == 16);
return TileShape::WarpTile::at(I2) / 4;
}
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto MakeALDS_WarpTileDistribution()
{
using TileShape = typename Problem::BlockGemmShape;
using ADataType = remove_cvref_t<typename Problem::ADataType>;
static_assert(TileShape::BlockWarps::at(I0) == 1, "requires Wave_M == 1");
constexpr index_t MPerXdl = Problem::BlockGemmShape::WarpTile::at(I0);
constexpr index_t KPerXdl = Problem::BlockGemmShape::WarpTile::at(I2);
constexpr int Repeat = TileShape::BlockWarps::at(number<1>{});
constexpr int KLane = get_warp_size() / MPerXdl;
constexpr int KPerThread = KPerXdl / KLane;
constexpr int MaxVecSize = 16 / sizeof(ADataType);
constexpr int KItemsPerLoad = min(MaxVecSize, KPerThread);
constexpr int KFragment = KPerThread / KItemsPerLoad;
return make_static_tile_distribution(
tile_distribution_encoding<
sequence<Repeat>,
tuple<sequence<MPerXdl>, sequence<KFragment, KLane, KItemsPerLoad>>,
tuple<sequence<0>, sequence<2, 1>>,
tuple<sequence<0>, sequence<1, 0>>,
sequence<2, 2>,
sequence<0, 2>>{});
}
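// DRAM tile distribution for loading A. For column-major A the per-thread vector
// dimension is M (M1 elements per load); for row-major A it is K (K1 elements per load).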
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto MakeADramTileDistribution()
{
using ADataType = remove_cvref_t<typename Problem::ADataType>;
using ALayout = remove_cvref_t<typename Problem::ALayout>;
constexpr index_t BlockSize = Problem::kBlockSize;
constexpr index_t MPerBlock = Problem::BlockGemmShape::kM;
constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
constexpr index_t APackedSize = numeric_traits<ADataType>::PackedSize;
if constexpr(std::is_same_v<ALayout, ck_tile::tensor_layout::gemm::ColumnMajor>)
{
constexpr index_t M1 = Problem::VectorLoadSize / sizeof(ADataType) * APackedSize;
constexpr index_t M0 = MPerBlock / M1;
constexpr index_t total_pixels = MPerBlock * KPerBlock / BlockSize;
static_assert(total_pixels % M1 == 0);
constexpr index_t K3 = total_pixels / M1;
constexpr index_t KPack = GetSmemPackA<Problem>();
static_assert(KPack % K3 == 0);
constexpr index_t K2 = KPack / K3;
if constexpr(get_warp_size() >= (K2 * M0))
{
constexpr index_t K1 = get_warp_size() / (K2 * M0);
constexpr index_t K0 = BlockSize / get_warp_size();
static_assert(KPerBlock == K0 * K1 * K2 * K3);
return make_static_tile_distribution(
tile_distribution_encoding<sequence<1>,
tuple<sequence<M0, M1>, sequence<K0, K1, K2, K3>>,
tuple<sequence<2>, sequence<2, 1, 2>>,
tuple<sequence<0>, sequence<1, 0, 2>>,
sequence<2, 1>,
sequence<3, 1>>{});
}
else
{
constexpr index_t K1 = (K2 * M0) / get_warp_size();
constexpr index_t K2_m = K2 / K1;
constexpr index_t K0 = BlockSize / get_warp_size() / K1;
static_assert(KPerBlock == K0 * K1 * K2_m * K3);
return make_static_tile_distribution(
tile_distribution_encoding<sequence<1>,
tuple<sequence<M0, M1>, sequence<K0, K1, K2_m, K3>>,
tuple<sequence<2, 2>, sequence<1, 2>>,
tuple<sequence<0, 1>, sequence<0, 2>>,
sequence<2, 1>,
sequence<3, 1>>{});
}
}
else
{
constexpr index_t K1 = Problem::VectorLoadSize / sizeof(ADataType) * APackedSize;
constexpr index_t K0 = KPerBlock / K1;
// coalesced reads within each block
if constexpr(get_warp_size() % K0 == 0)
{
constexpr index_t M2 = get_warp_size() / K0;
constexpr index_t M1 = BlockSize / get_warp_size();
static_assert(M2 != 0, "M2 is zero, which will lead to a division by zero error.");
static_assert(M1 != 0, "M1 is zero, which will lead to a division by zero error.");
constexpr index_t M0 = MPerBlock / (M2 * M1);
static_assert(M0 * M1 * M2 == MPerBlock,
"Incorrect M0, M2, M1 configuration! "
"M0, M1, M2 must cover whole MPerBlock!");
return make_static_tile_distribution(
tile_distribution_encoding<sequence<1>,
tuple<sequence<M0, M1, M2>, sequence<K0, K1>>,
tuple<sequence<1>, sequence<1, 2>>,
tuple<sequence<1>, sequence<2, 0>>,
sequence<1, 2>,
sequence<0, 1>>{});
}
else
{
constexpr index_t KWave = K0 / get_warp_size();
constexpr index_t M0 = BlockSize / get_warp_size() / KWave;
constexpr index_t M1 = MPerBlock / M0;
return make_static_tile_distribution(
tile_distribution_encoding<
sequence<1>,
tuple<sequence<M0, M1>, sequence<KWave, get_warp_size(), K1>>,
tuple<sequence<1, 2>, sequence<2>>,
tuple<sequence<0, 0>, sequence<1>>,
sequence<1, 2>,
sequence<1, 2>>{});
}
}
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto MakeADramDistribution()
{
using ADataType = remove_cvref_t<typename Problem::ADataType>;
constexpr index_t BlockSize = Problem::kBlockSize;
// constexpr index_t MPerBlock = Problem::BlockGemmShape::kM;
constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
constexpr index_t K1 = 16 / sizeof(ADataType);
constexpr index_t K0 = KPerBlock / K1;
constexpr index_t M2 = get_warp_size() / K0;
constexpr index_t M1 = BlockSize / get_warp_size();
static_assert(M2 != 0, "M2 is zero, which will lead to a division by zero error.");
static_assert(M1 != 0, "M1 is zero, which will lead to a division by zero error.");
// constexpr index_t M0 = MPerBlock / (M2 * M1);
// static_assert(M0 * M1 * M2 == MPerBlock,
// "Incorrect M0, M2, M1 configuration! "
// "M0, M1, M2 must cover whole MPerBlock!");
return make_static_tile_distribution(
tile_distribution_encoding<sequence<1>,
tuple<sequence<M1, M2>, sequence<K0, K1>>,
tuple<sequence<1>, sequence<1, 2>>,
tuple<sequence<0>, sequence<1, 0>>,
sequence<2>,
sequence<1>>{});
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto MakeBFlatDramTileDistribution()
{
using TileShape = typename Problem::BlockGemmShape; // ck_tile::TileFlatmmShape
constexpr index_t BlockSize = Problem::kBlockSize;
constexpr index_t WaveSize = get_warp_size();
constexpr index_t WaveNum = BlockSize / WaveSize;
constexpr index_t KBPerLoad = GetKBPerLoad<Problem>();
constexpr index_t MaxVecSize = 16 / sizeof(typename Problem::BDataType);
constexpr index_t KItemsPerLoad = min(KBPerLoad, MaxVecSize);
constexpr index_t KFragment = KBPerLoad / KItemsPerLoad;
static_assert(KFragment * KItemsPerLoad == KBPerLoad);
constexpr index_t KThdPerWave = WaveSize; // thread count in the K dimension
constexpr index_t KWavePerBlk = 1;
static_assert(TileShape::flatKPerWarp == KThdPerWave * KBPerLoad, "flatKPerWarp must equal warp_size * KBPerLoad");
static_assert(TileShape::BlockWarps::at(number<2>{}) == 1, "Requires K_Warp == 1");
constexpr index_t NBPerLoad = 1;
constexpr index_t NThdPerWave = 1;
constexpr index_t NWavePerBlk = TileShape::BlockWarps::at(number<1>{}); // N_Warp
constexpr index_t NRepeat = 1;
constexpr index_t WaveRepeat = WaveNum / TileShape::flatNPerWarp;
return make_static_tile_distribution(
tile_distribution_encoding<
sequence<WaveRepeat>, // ?
tuple<sequence<NRepeat, NWavePerBlk, NThdPerWave, NBPerLoad>, // second direction
sequence<KFragment, KWavePerBlk, KThdPerWave, KItemsPerLoad>>, // first
// direction
// wave in blk, // thd in wave
// <M, K> // <M, K>
tuple<sequence<0, 1, 2>, sequence<1, 2>>, // which direction
tuple<sequence<0, 1, 1>, sequence<2, 2>>, // which index
// <repeat, vec_load>
sequence<1, 1, 2, 2>,
sequence<0, 3, 0, 3>>{});
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto MakeShuffledARegBlockDistribution()
{
using ALayout = remove_cvref_t<typename Problem::ALayout>;
using ADataType = remove_cvref_t<typename Problem::ADataType>;
static_assert(std::is_same_v<ALayout, ck_tile::tensor_layout::gemm::ColumnMajor>);
constexpr index_t kBlockSize = Problem::kBlockSize;
constexpr index_t kMPerBlock = Problem::BlockGemmShape::kM;
constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK;
constexpr index_t M1 = Problem::VectorLoadSize / sizeof(ADataType);
constexpr index_t M0 = kMPerBlock / M1;
constexpr index_t total_pixels = kMPerBlock * kKPerBlock / kBlockSize;
static_assert(total_pixels % M1 == 0);
constexpr index_t K3 = total_pixels / M1;
constexpr index_t kKPack = GetSmemPackA<Problem>();
static_assert(kKPack % K3 == 0);
constexpr index_t K2 = kKPack / K3; // TODO: this dimension could span more than a single wave
constexpr index_t warp_size = get_warp_size();
if constexpr(warp_size >= (K2 * M0))
{
constexpr index_t K1 = warp_size / (K2 * M0);
constexpr index_t K0 = kBlockSize / warp_size;
return make_static_tile_distribution(
tile_distribution_encoding<sequence<1>,
tuple<sequence<M0, M1>, sequence<K0, K1, K2, K3>>,
tuple<sequence<2>, sequence<2, 1, 2>>,
tuple<sequence<0>, sequence<1, 0, 2>>,
sequence<1, 2>,
sequence<1, 3>>{});
}
else
{
constexpr index_t K1 = (K2 * M0) / get_warp_size();
constexpr index_t K2_m = K2 / K1;
constexpr index_t K0 = kBlockSize / get_warp_size() / K1;
static_assert(kKPerBlock == K0 * K1 * K2_m * K3);
return make_static_tile_distribution(
tile_distribution_encoding<sequence<1>,
tuple<sequence<M0, M1>, sequence<K0, K1, K2_m, K3>>,
tuple<sequence<2, 2>, sequence<1, 2>>,
tuple<sequence<0, 1>, sequence<0, 2>>,
sequence<1, 2>,
sequence<1, 3>>{});
}
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto GetBlockFlatmm()
{
// using AccDataType = float;
using BlockWarps = typename Problem::BlockGemmShape::BlockWarps;
using WarpTile = typename Problem::BlockGemmShape::WarpTile;
using WarpGemm = WarpGemmDispatcher<typename Problem::ADataType,
typename Problem::BDataType,
typename Problem::CDataType,
WarpTile::at(I0),
WarpTile::at(I1),
WarpTile::at(I2),
Problem::TransposeC>;
using BlockFlatmmPolicy = BlockFlatmmASmemBSmemCRegV1CustomPolicy<
typename Problem::ADataType,
// BlockGemmASmemBSmemCRegV1CustomPolicy<typename
// Problem::ADataType,
typename Problem::BDataType,
typename Problem::CDataType,
BlockWarps,
WarpGemm>;
return BlockFlatmmASmemBSmemCRegV1<Problem, BlockFlatmmPolicy>{};
}
};
} // namespace ck_tile


@@ -0,0 +1,748 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp"
namespace ck_tile {
#define CKTILE_FLATMM_USE_BUFFER_LOAD_LDS_AS_POSSIBLE 0
#if defined(__gfx950__)
#define CKTILE_FLATMM_ARCH_SUPPORT_BUFFER_LOAD_LDS_DWORDx4 1
#else
#define CKTILE_FLATMM_ARCH_SUPPORT_BUFFER_LOAD_LDS_DWORDx4 0
#endif
#define CKTILE_FLATMM_USE_BUFFER_LOAD_LDS \
(CKTILE_FLATMM_USE_BUFFER_LOAD_LDS_AS_POSSIBLE && \
CKTILE_FLATMM_ARCH_SUPPORT_BUFFER_LOAD_LDS_DWORDx4)
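// buffer_load-to-LDS is taken only when both the opt-in flag above and the dwordx4
// architecture support (gfx950) are available.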
struct F16xMXF4FlatmmPipelineAgBgCrPolicy : UniversalFlatmmPipelineAgBgCrPolicy
{
static constexpr auto I0 = number<0>{};
static constexpr auto I1 = number<1>{};
static constexpr auto I2 = number<2>{};
static constexpr index_t KBPerLoad = 32;
static constexpr index_t N_Pack = 2; // it's fixed for fp4
static constexpr index_t K_Pack = 2; // it's fixed for fp4
template <typename Problem, typename NativeADramTensorView>
CK_TILE_HOST_DEVICE static constexpr auto
TransformF16xF4_ATensorView(const NativeADramTensorView& a_dram_view)
{
#if CKTILE_FLATMM_USE_BUFFER_LOAD_LDS
constexpr int DynamicTileOffsetFlag = 0;
constexpr index_t MPerXdl = Problem::BlockGemmShape::WarpTile::at(I0);
constexpr index_t NPerXdl = Problem::BlockGemmShape::WarpTile::at(I1);
static_assert(MPerXdl == 16 && NPerXdl == 16);
constexpr index_t MPerBlock = Problem::BlockGemmShape::kM;
constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
constexpr index_t KPack = GetSmemPackA<Problem>();
constexpr int ContiguousThreadsCntInDS_READ_16B = 4;
// implement swizzle pattern on global side
// because we can't adjust the ds_write pattern of BUFFER_LOAD_LDS.
auto swizzle_a_dram_view_1 = transform_tensor_view(
a_dram_view,
make_tuple(
// M-dim is not affected by swizzle pattern
make_unmerge_transform(
make_tuple(number<DynamicTileOffsetFlag>{}, number<MPerBlock>{})),
// K-dim is the swizzle dimension
make_unmerge_transform(make_tuple(number<DynamicTileOffsetFlag>{},
number<KPerBlock / KPack>{},
number<KPack>{}))),
make_tuple(sequence<0>{}, sequence<1>{}),
make_tuple(sequence<0, 1>{}, sequence<2, 3, 4>{}));
auto swizzle_a_dram_view_2 = transform_tensor_view(
swizzle_a_dram_view_1,
make_tuple(make_pass_through_transform(number<DynamicTileOffsetFlag>{}),
make_xor_transform(make_tuple(number<MPerBlock>{},
number<ContiguousThreadsCntInDS_READ_16B>{})),
make_pass_through_transform(number<DynamicTileOffsetFlag>{}),
make_pass_through_transform(number<KPack>{})),
make_tuple(sequence<0>{}, sequence<1, 3>{}, sequence<2>{}, sequence<4>{}),
make_tuple(sequence<0>{}, sequence<1, 3>{}, sequence<2>{}, sequence<4>{}));
return transform_tensor_view(
swizzle_a_dram_view_2,
make_tuple(
make_merge_transform_v3_division_mod(
make_tuple(number<DynamicTileOffsetFlag>{}, number<MPerBlock>{})),
make_merge_transform_v3_division_mod(make_tuple(number<DynamicTileOffsetFlag>{},
number<KPerBlock / KPack>{},
number<KPack>{}))),
make_tuple(sequence<0, 1>{}, sequence<2, 3, 4>{}),
make_tuple(sequence<0>{}, sequence<1>{}));
#else
return a_dram_view;
#endif
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto MakeF16xF4_ReadALdsBlockDescriptor()
{
constexpr index_t MPerXdl = Problem::BlockGemmShape::WarpTile::at(I0);
constexpr index_t NPerXdl = Problem::BlockGemmShape::WarpTile::at(I1);
static_assert(MPerXdl == 16 && NPerXdl == 16);
/* fewer transform layers compared with the old CK implementation */
constexpr index_t MPerBlock = Problem::BlockGemmShape::kM;
constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
constexpr index_t KPack = GetSmemPackA<Problem>();
constexpr auto a_lds_block_desc_0 = make_naive_tensor_descriptor(
make_tuple(number<KPerBlock / KPack>{}, number<MPerBlock>{}, number<KPack>{}),
make_tuple(number<KPack>{}, number<KPerBlock>{}, number<1>{}),
number<KPack>{},
number<1>{});
constexpr int ContiguousThreadsCntInDS_READ_16B = 4;
constexpr auto a_lds_block_desc_permuted = transform_tensor_descriptor(
a_lds_block_desc_0,
make_tuple(make_xor_transform(make_tuple(number<MPerBlock>{},
number<ContiguousThreadsCntInDS_READ_16B>{})),
make_pass_through_transform(number<KPack>{})),
make_tuple(sequence<1, 0>{}, sequence<2>{}),
make_tuple(sequence<1, 0>{}, sequence<2>{}));
constexpr auto a_lds_block_desc = transform_tensor_descriptor(
a_lds_block_desc_permuted,
make_tuple(make_pass_through_transform(number<MPerBlock>{}),
make_merge_transform_v3_division_mod(
make_tuple(number<KPerBlock / KPack>{}, number<KPack>{}))),
make_tuple(sequence<1>{}, sequence<0, 2>{}),
make_tuple(sequence<0>{}, sequence<1>{}));
return a_lds_block_desc;
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto MakeF16xF4_WriteALdsBlockDescriptor()
{
#if CKTILE_FLATMM_USE_BUFFER_LOAD_LDS
constexpr index_t MPerBlock = Problem::BlockGemmShape::kM;
constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
constexpr index_t KPack = GetSmemPackA<Problem>();
return make_naive_tensor_descriptor(make_tuple(number<MPerBlock>{}, number<KPerBlock>{}),
make_tuple(number<KPerBlock>{}, number<1>{}),
number<KPack>{},
number<1>{});
#else
return MakeF16xF4_ReadALdsBlockDescriptor<Problem>();
#endif
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto MakeF16xF4_ALDS_TileDistribution()
{
using TileShape = typename Problem::BlockGemmShape;
static_assert(TileShape::WarpTile::at(I1) == 16, "requires XDL_N == 16");
static_assert(TileShape::BlockWarps::at(I0) == 1, "requires Wave_M == 1");
constexpr int Repeat = TileShape::BlockWarps::at(number<1>{});
constexpr int M0 = TileShape::WarpTile::at(I0);
constexpr int K_Lane = 64 / TileShape::WarpTile::at(I1); // 4
constexpr int K2 = TileShape::WarpTile::at(I2) / K_Lane; // 128 / 4 = 32
constexpr int XDL_PerThreadK = KBPerLoad / K2; // 4
constexpr int K0 = K_Lane; // 4
return make_static_tile_distribution(
tile_distribution_encoding<sequence<Repeat>,
tuple<sequence<M0>, sequence<K0, XDL_PerThreadK, K2>>,
tuple<sequence<0>, sequence<2, 1>>,
tuple<sequence<0>, sequence<0, 0>>,
sequence<2>,
sequence<2>>{});
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto MakeFp4BFlatDramTileDistribution()
{
using TileShape = typename Problem::BlockGemmShape;
static_assert(TileShape::WarpTile::at(I1) == 16, "only for XDL_N == 16");
constexpr index_t BlockSize = Problem::kBlockSize;
constexpr index_t WaveSize = get_warp_size();
constexpr index_t WaveNum = BlockSize / WaveSize;
constexpr index_t KThdPerWave = WaveSize; // thread count in the K dimension
constexpr index_t KWavePerBlk = 1;
constexpr index_t NWavePerBlk = TileShape::BlockWarps::at(number<1>{}); // N_Warp
constexpr index_t WaveRepeat = WaveNum / TileShape::flatNPerWarp;
return make_static_tile_distribution(
tile_distribution_encoding<
sequence<WaveRepeat>, // ?
tuple<sequence<NWavePerBlk, N_Pack>, // second
// direction
sequence<KWavePerBlk, KThdPerWave, KBPerLoad>>, // first direction
// wave in blk, // thd in wave
// <M, K> // <M, K>
tuple<sequence<0, 1, 2>, sequence<2>>, // which direction
tuple<sequence<0, 0, 0>, sequence<1>>, // which index
// <repeat, vec_load>
sequence<2>,
sequence<2>>{});
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto MakeFp4ScaleBFlatDramTileDistribution()
{
using TileShape = typename Problem::BlockGemmShape; // ck_tile::TileFlatmmShape
constexpr index_t BlockSize = Problem::kBlockSize;
constexpr index_t WaveSize = get_warp_size();
[[maybe_unused]] constexpr index_t WaveNum = BlockSize / WaveSize;
constexpr index_t N_Warp = TileShape::BlockWarps::at(number<1>{});
[[maybe_unused]] constexpr index_t XDLPerBlock =
TileShape::kK / TileShape::WarpTile::at(I2);
constexpr index_t K_Lane = 64 / TileShape::WarpTile::at(I1);
constexpr index_t N_Lane = TileShape::WarpTile::at(I1);
constexpr index_t NWavePerBlk = N_Warp;
return make_static_tile_distribution(
tile_distribution_encoding<
sequence<>, // ?
tuple<sequence<NWavePerBlk>, // second direction
sequence<K_Lane, N_Lane, N_Pack * K_Pack>>, // first
// direction
// wave in blk, // thd in wave
// <M, K> // <M, K>
tuple<sequence<1>, sequence<2, 2>>, // which direction
tuple<sequence<0>, sequence<0, 1>>, // which index
// <repeat, vec_load>
sequence<2>,
sequence<2>>{});
}
};
struct F8xMXF4FlatmmPipelineAgBgCrPolicy : UniversalFlatmmPipelineAgBgCrPolicy
{
static constexpr auto I0 = number<0>{};
static constexpr auto I1 = number<1>{};
static constexpr auto I2 = number<2>{};
static constexpr index_t kDramLoadPackBytes = 128;
static constexpr int MXdlPack = 2;
static constexpr int NXdlPack = 2;
static constexpr int KXdlPack = 2;
template <typename Problem>
static inline constexpr auto wg_attr_num_access = WGAttrNumAccessEnum::Single;
// std::is_same_v<remove_cvref_t<typename Problem::ADataType>, pk_fp4_t>
// ? WGAttrNumAccessEnum::Single
// : WGAttrNumAccessEnum::Double;
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto GetBlockFlatmm()
{
using ADataType = remove_cvref_t<typename Problem::ADataType>;
using BDataType = remove_cvref_t<typename Problem::BDataType>;
using BlockWarps = typename Problem::BlockGemmShape::BlockWarps;
using WarpTile = typename Problem::BlockGemmShape::WarpTile;
using WarpGemm = WarpGemmDispatcher< //
ADataType,
BDataType,
typename Problem::CDataType,
WarpTile::at(I0),
WarpTile::at(I1),
WarpTile::at(I2),
Problem::TransposeC,
false,
false,
wg_attr_num_access<Problem>>;
using BlockFlatmmPolicy = BlockFlatmmASmemBSmemCRegV1CustomPolicy< //
ADataType,
BDataType,
typename Problem::CDataType,
BlockWarps,
WarpGemm>;
return BlockFlatmmASmemBSmemCRegV1<Problem, BlockFlatmmPolicy>{};
}
template <typename Problem, typename TensorView>
CK_TILE_DEVICE static constexpr auto
MakeMXFP4_AAsyncLoadDramDescriptor(const TensorView& naive_view)
{
using ADataType = remove_cvref_t<typename Problem::ADataType>;
using ALayout = remove_cvref_t<typename Problem::ALayout>;
constexpr index_t MPerXdl = Problem::BlockGemmShape::WarpTile::at(I0);
constexpr index_t NPerXdl = Problem::BlockGemmShape::WarpTile::at(I1);
static_assert(MPerXdl == 16 && NPerXdl == 16);
static_assert(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>);
const auto& naive_desc = naive_view.get_tensor_descriptor();
constexpr auto ndims = remove_cvref_t<decltype(naive_desc)>::get_num_of_dimension();
static_assert(ndims == 2, "only support 2D tensor");
const auto rows = naive_desc.get_length(number<0>{});
const auto cols = naive_desc.get_length(number<1>{});
constexpr index_t APackedSize = numeric_traits<ADataType>::PackedSize;
constexpr index_t K2 = GetSmemPackA<Problem>() * APackedSize; // f4=32; f8=16
constexpr index_t K1 = kDramLoadPackBytes * APackedSize / K2; // 8
const index_t K0 = cols / (K1 * K2);
const auto col_lens = make_tuple(K0, number<K1>{}, number<K2>{});
constexpr index_t M1 = 4; // so that we can use imm offset to load lds
const index_t M0 = rows / M1;
const auto row_lens = make_tuple(M0, number<M1>{});
const auto desc_0 =
make_naive_tensor_descriptor_packed(container_concat(row_lens, col_lens));
const auto desc_1 = transform_tensor_descriptor(
desc_0,
make_tuple(make_pass_through_transform(M0),
make_xor_transform(make_tuple(number<M1>{}, number<K1>{})),
make_pass_through_transform(K0),
make_pass_through_transform(number<K2>{})),
make_tuple(sequence<0>{}, sequence<1, 3>{}, sequence<2>{}, sequence<4>{}),
make_tuple(sequence<0>{}, sequence<1, 3>{}, sequence<2>{}, sequence<4>{}));
const auto desc = transform_tensor_descriptor( //
desc_1,
make_tuple(make_merge_transform_v3_division_mod(row_lens),
make_merge_transform_v3_division_mod(col_lens)),
make_tuple(sequence<0, 1>{}, sequence<2, 3, 4>{}),
make_tuple(sequence<0>{}, sequence<1>{}));
// printf("A async load dram desc %d x %d: \n", desc.get_length(I0), desc.get_length(I1));
return tensor_view<typename TensorView::buffer_view,
remove_cvref_t<decltype(desc)>,
TensorView::DstInMemOp>{naive_view.buf_, desc};
}
template <typename Problem, typename TensorView>
CK_TILE_DEVICE static constexpr auto
Make_F8AAsyncLoadDramDescriptor(const TensorView& naive_view)
{
constexpr int DynamicTileOffsetFlag = 0;
constexpr index_t MPerXdl = Problem::BlockGemmShape::WarpTile::at(I0);
constexpr index_t NPerXdl = Problem::BlockGemmShape::WarpTile::at(I1);
static_assert(MPerXdl == 16 && NPerXdl == 16);
constexpr index_t MPerBlock = Problem::BlockGemmShape::kM;
constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
constexpr index_t KPack = GetSmemPackA<Problem>();
constexpr int ContiguousThreadsCntInDS_READ_16B = 4;
// implement swizzle pattern on global side
// because we can't adjust the ds_write pattern of BUFFER_LOAD_LDS.
auto swizzle_a_dram_view_1 = transform_tensor_view(
naive_view,
make_tuple(
// M-dim is not affected by swizzle pattern
make_unmerge_transform(
make_tuple(number<DynamicTileOffsetFlag>{}, number<MPerBlock>{})),
// K-dim is the swizzle dimension
make_unmerge_transform(make_tuple(number<DynamicTileOffsetFlag>{},
number<KPerBlock / KPack>{},
number<KPack>{}))),
make_tuple(sequence<0>{}, sequence<1>{}),
make_tuple(sequence<0, 1>{}, sequence<2, 3, 4>{}));
auto swizzle_a_dram_view_2 = transform_tensor_view(
swizzle_a_dram_view_1,
make_tuple(make_pass_through_transform(number<DynamicTileOffsetFlag>{}),
make_xor_transform(make_tuple(number<MPerBlock>{},
number<ContiguousThreadsCntInDS_READ_16B>{})),
make_pass_through_transform(number<DynamicTileOffsetFlag>{}),
make_pass_through_transform(number<KPack>{})),
make_tuple(sequence<0>{}, sequence<1, 3>{}, sequence<2>{}, sequence<4>{}),
make_tuple(sequence<0>{}, sequence<1, 3>{}, sequence<2>{}, sequence<4>{}));
return transform_tensor_view(
swizzle_a_dram_view_2,
make_tuple(
make_merge_transform_v3_division_mod(
make_tuple(number<DynamicTileOffsetFlag>{}, number<MPerBlock>{})),
make_merge_transform_v3_division_mod(make_tuple(number<DynamicTileOffsetFlag>{},
number<KPerBlock / KPack>{},
number<KPack>{}))),
make_tuple(sequence<0, 1>{}, sequence<2, 3, 4>{}),
make_tuple(sequence<0>{}, sequence<1>{}));
}
template <typename Problem>
CK_TILE_DEVICE static constexpr auto MakeADramTileDistribution()
{
using ADataType = remove_cvref_t<typename Problem::ADataType>;
using ALayout = remove_cvref_t<typename Problem::ALayout>;
static_assert(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>);
constexpr index_t BlockSize = Problem::kBlockSize;
constexpr index_t MPerBlock = Problem::BlockGemmShape::kM;
constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
constexpr index_t APackedSize = numeric_traits<ADataType>::PackedSize;
constexpr index_t K2 = MPerBlock == 16
? GetSmemPackA<Problem>() * APackedSize / 4
: GetSmemPackA<Problem>() * APackedSize; // f4=32; f8=16
constexpr index_t K1 = kDramLoadPackBytes * APackedSize / K2; // 8
constexpr index_t K0 = KPerBlock / (K1 * K2); // KPerBlock/256
constexpr index_t M2 = get_warp_size() / K1; // 8
constexpr index_t M1 = BlockSize / get_warp_size(); // 4
constexpr index_t M0 = MPerBlock / (M2 * M1);
static_assert(M0 * M1 * M2 == MPerBlock, "M0, M1, M2 must cover whole MPerBlock!");
static_assert(K0 * K1 * K2 == KPerBlock, "K0, K1, K2 must cover whole KPerBlock!");
return make_static_tile_distribution(
tile_distribution_encoding< //
sequence<1>,
tuple<sequence<M0, M1, M2>, sequence<K0, K1, K2>>, // ?,4,8 1,8,32 or 2,8,16
tuple<sequence<1>, sequence<1, 2>>, // M1 M2,K1
tuple<sequence<1>, sequence<2, 1>>,
sequence<1, 2, 2>, // M0,K0,K2
sequence<0, 0, 2>>{});
}
template <typename Problem>
CK_TILE_DEVICE static constexpr auto MakeMXFP4_ALdsBlockDescriptor()
{
using ADataType = remove_cvref_t<typename Problem::ADataType>;
using ALayout = remove_cvref_t<typename Problem::ALayout>;
constexpr index_t MPerXdl = Problem::BlockGemmShape::WarpTile::at(I0);
constexpr index_t NPerXdl = Problem::BlockGemmShape::WarpTile::at(I1);
static_assert(MPerXdl == 16 && NPerXdl == 16);
static_assert(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>);
/* fewer transform layers compared with the old CK implementation */
constexpr index_t MPerBlock = Problem::BlockGemmShape::kM;
constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
constexpr index_t APackedSize = numeric_traits<ADataType>::PackedSize;
constexpr index_t K2 = GetSmemPackA<Problem>() * APackedSize; // f4=32; f8=16
constexpr index_t K1 = kDramLoadPackBytes * APackedSize / K2; // 8
constexpr index_t K0 = KPerBlock / (K1 * K2); // KPerBlock/256
static_assert(K0 * K1 * K2 == KPerBlock, "K0, K1, K2 must cover whole KPerBlock!");
constexpr index_t M3 = 4; // so that we can use imm offset to load lds
constexpr index_t M2 = get_warp_size() / K1 / M3; // 2
constexpr index_t M1 = MPerXdl / (M2 * M3); // 2
constexpr index_t M0 = MPerBlock / (M1 * M2 * M3); // MPerBlock/16
static_assert(M0 * M1 * M2 * M3 == MPerBlock, "M0, M1, M2, M3 must cover whole MPerBlock!");
constexpr index_t Pad = 4 * K2; // 4 * 16
// constexpr index_t Pad = 0; // 4 * 16
// TODO: fix lds_a swizzle
constexpr auto a_lds_block_desc_0 = make_naive_tensor_descriptor(
make_tuple(number<M0>{},
number<M1>{},
number<K0>{},
number<M2>{},
number<M3>{},
number<K1>{},
number<K2>{}),
make_tuple(number<M1*(K0 * (M2 * M3 * K1 * K2) + (K0 - 1) * Pad)>{},
number<K0*(M2 * M3 * K1 * K2) + (K0 - 1) * Pad>{},
number<M2 * M3 * K1 * K2 + Pad>{},
number<M3 * K1 * K2>{},
number<K1 * K2>{},
number<K2>{},
number<1>{}),
number<K2>{},
number<1>{});
constexpr auto a_lds_block_desc = transform_tensor_descriptor(
a_lds_block_desc_0,
make_tuple(make_merge_transform_v3_division_mod(
make_tuple(number<M0>{}, number<M1>{}, number<M2>{}, number<M3>{})),
make_merge_transform_v3_division_mod(
make_tuple(number<K0>{}, number<K1>{}, number<K2>{}))),
make_tuple(sequence<0, 1, 3, 4>{}, sequence<2, 5, 6>{}),
make_tuple(sequence<0>{}, sequence<1>{}));
// return a_lds_block_desc_permuted;
return a_lds_block_desc;
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto MakeF8_ReadALdsBlockDescriptor()
{
constexpr index_t MPerXdl = Problem::BlockGemmShape::WarpTile::at(I0);
constexpr index_t NPerXdl = Problem::BlockGemmShape::WarpTile::at(I1);
static_assert(MPerXdl == 16 && NPerXdl == 16);
/* fewer transform layers compared with the old CK implementation */
constexpr index_t MPerBlock = Problem::BlockGemmShape::kM;
constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
constexpr index_t KPack = GetSmemPackA<Problem>();
constexpr auto a_lds_block_desc_0 = make_naive_tensor_descriptor(
make_tuple(number<KPerBlock / KPack>{}, number<MPerBlock>{}, number<KPack>{}),
make_tuple(number<KPack>{}, number<KPerBlock>{}, number<1>{}),
number<KPack>{},
number<1>{});
constexpr int ContiguousThreadsCntInDS_READ_16B = 4;
constexpr auto a_lds_block_desc_permuted = transform_tensor_descriptor(
a_lds_block_desc_0,
make_tuple(make_xor_transform(make_tuple(number<MPerBlock>{},
number<ContiguousThreadsCntInDS_READ_16B>{})),
make_pass_through_transform(number<KPack>{})),
make_tuple(sequence<1, 0>{}, sequence<2>{}),
make_tuple(sequence<1, 0>{}, sequence<2>{}));
constexpr auto a_lds_block_desc = transform_tensor_descriptor(
a_lds_block_desc_permuted,
make_tuple(make_pass_through_transform(number<MPerBlock>{}),
make_merge_transform_v3_division_mod(
make_tuple(number<KPerBlock / KPack>{}, number<KPack>{}))),
make_tuple(sequence<1>{}, sequence<0, 2>{}),
make_tuple(sequence<0>{}, sequence<1>{}));
return a_lds_block_desc;
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto MakeF8_WriteALdsBlockDescriptor()
{
constexpr index_t MPerBlock = Problem::BlockGemmShape::kM;
constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
constexpr index_t KPack = GetSmemPackA<Problem>();
return make_naive_tensor_descriptor(make_tuple(number<MPerBlock>{}, number<KPerBlock>{}),
make_tuple(number<KPerBlock>{}, number<1>{}),
number<KPack>{},
number<1>{});
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto MakeMXF4_ALDS_TileDistribution()
{
using TileShape = typename Problem::BlockGemmShape;
static_assert(TileShape::WarpTile::at(I1) == 16, "requires XDL_N == 16");
static_assert(TileShape::BlockWarps::at(I0) == 1, "requires Wave_M == 1");
constexpr int M_warps = TileShape::BlockWarps::at(number<0>{});
constexpr int N_warps = TileShape::BlockWarps::at(number<1>{});
constexpr int M_Lane = TileShape::WarpTile::at(I0); // 16
constexpr int K_Lane = 64 / M_Lane; // 4
constexpr int K_Thread = TileShape::WarpTile::at(I2) / K_Lane; // 32
// constexpr index_t num_access_v = static_cast<index_t>(wg_attr_num_access<Problem>);
constexpr index_t num_access_v = 2;
constexpr int K1 = K_Thread / num_access_v; // 16
return make_static_tile_distribution(
std::conditional_t<
num_access_v == 1,
tile_distribution_encoding<
sequence<N_warps>,
tuple<sequence<M_warps, MXdlPack, M_Lane>, sequence<K_Lane, K1>>,
tuple<sequence<1, 0>, sequence<2, 1>>,
tuple<sequence<0, 0>, sequence<0, 2>>,
sequence<2>,
sequence<1>>,
tile_distribution_encoding< //
sequence<N_warps>,
tuple<sequence<M_warps, MXdlPack, M_Lane>, sequence<num_access_v, K_Lane, K1>>,
tuple<sequence<1, 0>, sequence<2, 1>>,
tuple<sequence<0, 0>, sequence<1, 2>>,
sequence<2, 2>,
sequence<0, 2>>>{});
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto MakeMXFP4_BFlatDramTileDistribution()
{
using TileShape = typename Problem::BlockGemmShape;
static_assert(TileShape::WarpTile::at(I1) == 16, "only for XDL_N == 16");
constexpr index_t BlockSize = Problem::kBlockSize;
constexpr index_t WaveSize = get_warp_size();
constexpr index_t WaveNum = BlockSize / WaveSize;
constexpr index_t K1 = WaveSize; // thread count in the K dimension
constexpr index_t KWavePerBlk = 1;
constexpr index_t K0 = KWavePerBlk;
constexpr index_t NWavePerBlk = TileShape::BlockWarps::at(number<1>{}); // N_Warp
constexpr index_t WaveRepeat = WaveNum / TileShape::flatNPerWarp;
constexpr index_t kKPerThread = 32;
constexpr index_t num_access_v = static_cast<index_t>(wg_attr_num_access<Problem>);
constexpr index_t K2 = kKPerThread / num_access_v;
return make_static_tile_distribution(
std::conditional_t< //
num_access_v == 1,
tile_distribution_encoding< //
sequence<WaveRepeat>,
tuple<sequence<NWavePerBlk, NXdlPack>, // 4 2
sequence<K0, K1, K2>>, // 1 64 32
tuple<sequence<0, 1, 2>, sequence<2>>,
tuple<sequence<0, 0, 0>, sequence<1>>,
sequence<2>,
sequence<2>>,
tile_distribution_encoding< //
sequence<WaveRepeat>,
tuple<sequence<NWavePerBlk, NXdlPack>, // 4 2
sequence<num_access_v, K0, K1, K2>>, // 2 1 64 16
tuple<sequence<0, 1, 2>, sequence<2>>,
tuple<sequence<0, 0, 1>, sequence<2>>,
sequence<2, 2>,
sequence<0, 3>>>{});
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto MakeMXFP4_ScaleA_DramTileDistribution()
{
using TileShape = typename Problem::BlockGemmShape; // ck_tile::TileFlatmmShape
constexpr index_t BlockSize = Problem::kBlockSize;
constexpr index_t WaveSize = get_warp_size();
constexpr index_t WaveNum = BlockSize / WaveSize;
constexpr index_t kMPerBlock = TileShape::BlockTile::at(I0);
constexpr index_t M_Warps = TileShape::BlockWarps::at(I0);
constexpr index_t N_Warps = TileShape::BlockWarps::at(I1);
static_assert(WaveNum == M_Warps * N_Warps, "Block warps do not match block size");
constexpr index_t M_Lanes = TileShape::WarpTile::at(I0);
constexpr index_t K_Lanes = 64 / M_Lanes;
// Y dimension (M) decomposition
constexpr index_t Y2 = M_Lanes;
constexpr index_t Y1 = M_Warps;
constexpr index_t Y0 = kMPerBlock / (MXdlPack * Y1 * Y2);
// X dimension (K) decomposition
constexpr index_t X0 = K_Lanes;
constexpr index_t X1 = 1; // packed 2x2 E8M0 data into 1 int32_t for load
return make_static_tile_distribution(
tile_distribution_encoding<sequence<N_Warps>, // repeat N_warps
tuple<sequence<Y0, Y1, Y2>, sequence<X0, X1>>,
tuple<sequence<1, 0>, sequence<2, 1>>,
tuple<sequence<1, 0>, sequence<0, 2>>,
sequence<1, 2>,
sequence<0, 1>>{});
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto MakeMXFP4_ScaleB_DramTileDistribution()
{
using TileShape = typename Problem::BlockGemmShape; // ck_tile::TileFlatmmShape
constexpr index_t BlockSize = Problem::kBlockSize;
constexpr index_t WaveSize = get_warp_size();
constexpr index_t WaveNum = BlockSize / WaveSize;
constexpr index_t kNPerBlock = TileShape::BlockTile::at(I1);
constexpr index_t M_Warps = TileShape::BlockWarps::at(I0);
constexpr index_t N_Warps = TileShape::BlockWarps::at(I1);
static_assert(WaveNum == M_Warps * N_Warps, "Block warps do not match block size");
constexpr index_t N_Lanes = TileShape::WarpTile::at(I1);
constexpr index_t K_Lanes = 64 / N_Lanes;
// Y dimension (N) decomposition
constexpr index_t Y2 = N_Lanes;
constexpr index_t Y1 = N_Warps;
constexpr index_t Y0 = kNPerBlock / (NXdlPack * Y1 * Y2);
// X dimension (K) decomposition
constexpr index_t X0 = K_Lanes;
constexpr index_t X1 = 1; // packed 2x2 E8M0 data into 1 int32_t for load
return make_static_tile_distribution(
tile_distribution_encoding<sequence<M_Warps>, // ?
tuple<sequence<Y0, Y1, Y2>, sequence<X0, X1>>,
tuple<sequence<0, 1>, sequence<2, 1>>,
tuple<sequence<0, 1>, sequence<0, 2>>,
sequence<1, 2>,
sequence<0, 1>>{});
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto MakeMXFP4_ScaleA_FlatDramTileDistribution()
{
using TileShape = typename Problem::BlockGemmShape;
constexpr index_t M_Warp = TileShape::BlockWarps::at(number<0>{});
constexpr index_t K_Lane = 64 / TileShape::WarpTile::at(I0);
constexpr index_t M_Lane = TileShape::WarpTile::at(I0);
constexpr index_t N_Warp = TileShape::BlockWarps::at(number<1>{});
constexpr index_t MWavePerBlk = M_Warp;
return make_static_tile_distribution(
tile_distribution_encoding<sequence<N_Warp>, // ?
tuple<sequence<MWavePerBlk, M_Lane>, // second direction
sequence<K_Lane, 1>>, // first direction
tuple<sequence<1, 0>, sequence<2, 1>>, // which direction
tuple<sequence<0, 0>, sequence<0, 1>>, // which index
// <repeat, vec_load>
sequence<2>,
sequence<1>>{});
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr auto MakeMXFP4_ScaleB_FlatDramTileDistribution()
{
using TileShape = typename Problem::BlockGemmShape;
constexpr index_t N_Warp = TileShape::BlockWarps::at(number<1>{});
constexpr index_t K_Lane = 64 / TileShape::WarpTile::at(I1);
constexpr index_t N_Lane = TileShape::WarpTile::at(I1);
constexpr index_t M_Warp = TileShape::BlockWarps::at(number<0>{});
constexpr index_t NWavePerBlk = N_Warp;
return make_static_tile_distribution(
tile_distribution_encoding<sequence<M_Warp>, // ?
tuple<sequence<NWavePerBlk, N_Lane>, // second direction
sequence<K_Lane, 1>>, // first direction
tuple<sequence<0, 1>, sequence<2, 1>>, // which direction
tuple<sequence<0, 0>, sequence<0, 1>>, // which index
// <repeat, vec_load>
sequence<2>,
sequence<1>>{});
}
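// GetSmemSizeA: MakeMXFP4_ALdsBlockDescriptor is sized in logical (unpacked) elements,
// so the byte count divides by APackedSize after scaling by sizeof(ADataType).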
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSizeA()
{
using ADataType = remove_cvref_t<typename Problem::ADataType>;
constexpr index_t APackedSize = numeric_traits<ADataType>::PackedSize;
return sizeof(ADataType) *
MakeMXFP4_ALdsBlockDescriptor<Problem>().get_element_space_size() / APackedSize;
}
template <typename Problem>
CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
{
return GetSmemSizeA<Problem>();
}
};
} // namespace ck_tile

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,456 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp"
namespace ck_tile {
namespace detail {
template <typename Problem>
struct MXFlatmmPipelineAgBgCrPolicy : UniversalFlatmmPipelineAgBgCrPolicy
{
static constexpr auto I0 = number<0>{};
static constexpr auto I1 = number<1>{};
static constexpr auto I2 = number<2>{};
static constexpr index_t kDramLoadPackBytes = 128;
static constexpr index_t DWORDx4 = 16;
static constexpr index_t DWORDx3 = 12;
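// 12-byte (dwordx3) accesses are used for pk_fp6x16_t data, 16-byte (dwordx4)
// accesses otherwise.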
static constexpr int MXdlPack = 2;
static constexpr int NXdlPack = 2;
static constexpr int KXdlPack = 2;
private:
using ADataType = remove_cvref_t<typename Problem::ADataType>;
using BDataType = remove_cvref_t<typename Problem::BDataType>;
static constexpr index_t APackedSize = numeric_traits<ADataType>::PackedSize;
static constexpr index_t BPackedSize = numeric_traits<BDataType>::PackedSize;
using ALayout = remove_cvref_t<typename Problem::ALayout>;
static_assert(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>);
using TileShape = typename Problem::BlockGemmShape;
using BlockWarps = typename TileShape::BlockWarps;
static constexpr index_t BlockSize = Problem::kBlockSize;
static constexpr index_t WaveSize = get_warp_size();
static constexpr index_t WaveNum = BlockSize / WaveSize;
static constexpr index_t MPerBlock = TileShape::kM;
static constexpr index_t NPerBlock = TileShape::kN;
static constexpr index_t KPerBlock = TileShape::kK;
static constexpr index_t MWarps = BlockWarps::at(I0);
static constexpr index_t NWarps = BlockWarps::at(I1);
static_assert(WaveNum == MWarps * NWarps, "Block warps do not match block size");
static constexpr index_t MPerXdl = TileShape::WarpTile::at(I0);
static constexpr index_t NPerXdl = TileShape::WarpTile::at(I1);
static constexpr index_t KPerXdl = TileShape::WarpTile::at(I2);
static_assert(MPerXdl == 16 && NPerXdl == 16);
static constexpr index_t K_Lane = get_warp_size() / 16; // 4
static constexpr index_t K_Thread = KPerXdl / K_Lane; // 32
public:
static constexpr index_t AK1 = DWORDx4 * APackedSize;
static constexpr index_t BK1 = DWORDx4 * BPackedSize;
CK_TILE_HOST_DEVICE static constexpr auto GetBlockFlatmm()
{
using WarpTile = typename Problem::BlockGemmShape::WarpTile;
using WarpGemm = WarpGemmDispatcher< //
ADataType,
BDataType,
typename Problem::CDataType,
WarpTile::at(I0),
WarpTile::at(I1),
WarpTile::at(I2),
Problem::TransposeC>;
using BlockFlatmmPolicy = BlockFlatmmASmemBSmemCRegV1CustomPolicy< //
ADataType,
BDataType,
typename Problem::CDataType,
BlockWarps,
WarpGemm>;
return BlockFlatmmASmemBSmemCRegV1<Problem, BlockFlatmmPolicy>{};
}
CK_TILE_DEVICE static constexpr auto MakeMX_ABytesDramTileDistribution()
{
constexpr index_t K2 = std::is_same_v<ADataType, pk_fp6x16_t> ? DWORDx3 : DWORDx4;
constexpr index_t K1 = kDramLoadPackBytes / DWORDx4; // fp8/fp6/fp4 K1 equal to 8
constexpr index_t K0 =
KPerBlock / APackedSize * sizeof(ADataType) / (K1 * K2); // KPerBlock/256/packsize
constexpr index_t M2 = WaveSize / K1; // 8
constexpr index_t M1 = BlockSize / WaveSize; // 4
constexpr index_t M0 = MPerBlock / (M2 * M1);
static_assert(M0 * M1 * M2 == MPerBlock, "M0, M1, M2 must cover whole MPerBlock!");
static_assert(K0 * K1 * K2 == KPerBlock / APackedSize * sizeof(ADataType),
"K0, K1, K2 must cover whole KPerBlock!");
return make_static_tile_distribution(
tile_distribution_encoding< //
sequence<1>,
tuple<sequence<M0, M1, M2>, sequence<K0, K1, K2>>, // ?,4,8 1,8,32 or 2,8,16
tuple<sequence<1>, sequence<1, 2>>, // M1 M2,K1
tuple<sequence<1>, sequence<2, 1>>,
sequence<1, 2, 2>, // M0,K0,K2
sequence<0, 0, 2>>{});
}
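// Re-views A as raw bytes and applies the M1 x K1 XOR swizzle on the global-memory
// side (the ds_write pattern of buffer_load_lds cannot be adjusted), so async copies
// land in the swizzled layout described by MakeMX_ALdsBytesBlockDescriptor.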
template <typename WindowTmp>
CK_TILE_DEVICE static constexpr auto
MakeMX_AAsyncLoadBytesDramWindow(const WindowTmp& window_tmp)
{
constexpr auto ndims = std::decay_t<decltype(window_tmp)>::get_num_of_dimension();
static_assert(ndims == 2, "only support 2D tensor");
auto&& tensor_view_tmp = window_tmp.get_bottom_tensor_view();
const auto [rows, cols] = tensor_view_tmp.get_tensor_descriptor().get_lengths();
constexpr index_t K2 = std::is_same_v<ADataType, pk_fp6x16_t> ? DWORDx3 : DWORDx4;
constexpr index_t K1 = kDramLoadPackBytes / DWORDx4; // fp8/fp6/fp4 K1 equal to 8
const index_t K0 = cols / (K1 * K2 / sizeof(ADataType) * APackedSize);
const auto col_lens = make_tuple(K0, number<K1>{}, number<K2>{});
constexpr index_t M1 = 4; // so that we can use imm offset to load lds
const index_t M0 = integer_divide_ceil(rows, M1);
const auto row_lens = make_tuple(M0, number<M1>{});
const auto d0 = make_naive_tensor_descriptor_packed(container_concat(row_lens, col_lens));
const auto desc_0 = decltype(d0)( // set correct size (without padding)
d0.get_transforms(),
tensor_view_tmp.get_tensor_descriptor().get_element_space_size());
const auto desc_1 = transform_tensor_descriptor(
desc_0,
make_tuple(make_pass_through_transform(M0),
make_xor_transform(make_tuple(number<M1>{}, number<K1>{})),
make_pass_through_transform(K0),
make_pass_through_transform(number<K2>{})),
make_tuple(sequence<0>{}, sequence<1, 3>{}, sequence<2>{}, sequence<4>{}),
make_tuple(sequence<0>{}, sequence<1, 3>{}, sequence<2>{}, sequence<4>{}));
const auto desc = transform_tensor_descriptor( //
desc_1,
make_tuple(make_merge_transform_v3_division_mod(row_lens),
make_merge_transform_v3_division_mod(col_lens)),
make_tuple(sequence<0, 1>{}, sequence<2, 3, 4>{}),
make_tuple(sequence<0>{}, sequence<1>{}));
auto&& byte_ptr = reinterpret_cast<const uint8_t*>(&(tensor_view_tmp.get_buffer_view()(0)));
auto&& byte_tensor_view = make_tensor_view<address_space_enum::global>(byte_ptr, desc);
auto&& origin_tmp = window_tmp.get_window_origin();
constexpr index_t elems_per_byte = APackedSize / sizeof(ADataType);
return make_tile_window(byte_tensor_view,
make_tuple(number<MPerBlock>{}, number<KPerBlock / elems_per_byte>{}),
{origin_tmp[0], origin_tmp[1] / elems_per_byte},
MakeMX_ABytesDramTileDistribution());
}
CK_TILE_DEVICE static constexpr auto MakeMX_ALdsBytesBlockDescriptor()
{
constexpr index_t K2 = std::is_same_v<ADataType, pk_fp6x16_t> ? DWORDx3 : AK1 / APackedSize;
constexpr index_t K2_Pad = 16;
constexpr index_t K1 = kDramLoadPackBytes / DWORDx4; // 8
constexpr index_t K0 = std::is_same_v<ADataType, pk_fp6x16_t>
? KPerBlock / (K1 * K2 / sizeof(ADataType) * APackedSize)
: KPerBlock / (K1 * AK1); // KPerBlock/256
static_assert(K0 * K1 * K2 / sizeof(ADataType) * APackedSize == KPerBlock,
"K0, K1, K2 must cover whole KPerBlock!");
constexpr index_t M3 = 4; // so that we can use imm offset to load lds
constexpr index_t M2 = WaveSize / K1 / M3; // 2
constexpr index_t M1 = MPerXdl / (M2 * M3); // 2
constexpr index_t M0 = MPerBlock / (M1 * M2 * M3); // MPerBlock/16
static_assert(M0 * M1 * M2 * M3 == MPerBlock, "M0, M1, M2, M3 must cover whole MPerBlock!");
constexpr index_t Pad = 4 * K2; // 4 dwords
constexpr auto a_lds_block_desc_0 = make_naive_tensor_descriptor( //
make_tuple(number<M0>{},
number<K0>{},
number<M1>{},
number<M2>{},
number<M3>{},
number<K1>{},
number<K2>{}),
make_tuple(number<K0*(M1 * (M2 * M3 * K1 * K2_Pad) + (M1 - 1) * Pad)>{},
number<M1*(M2 * M3 * K1 * K2_Pad) + (M1 - 1) * Pad>{},
number<M2 * M3 * K1 * K2_Pad + Pad>{},
number<M3 * K1 * K2_Pad>{},
number<K1 * K2_Pad>{},
number<K2_Pad>{},
number<1>{}),
number<K2>{},
number<1>{});
constexpr auto a_lds_block_desc_1 = transform_tensor_descriptor(
a_lds_block_desc_0,
make_tuple(make_pass_through_transform(M0),
make_pass_through_transform(K0),
make_pass_through_transform(M1),
make_pass_through_transform(M2),
make_xor_transform(make_tuple(number<M3>{}, number<K1>{})),
make_pass_through_transform(number<K2>{})),
make_tuple(sequence<0>{},
sequence<1>{},
sequence<2>{},
sequence<3>{},
sequence<4, 5>{},
sequence<6>{}),
make_tuple(sequence<0>{},
sequence<1>{},
sequence<2>{},
sequence<3>{},
sequence<4, 5>{},
sequence<6>{}));
constexpr auto a_lds_block_desc = transform_tensor_descriptor(
a_lds_block_desc_1,
make_tuple(make_merge_transform_v3_division_mod(
make_tuple(number<M0>{}, number<M1>{}, number<M2>{}, number<M3>{})),
make_merge_transform_v3_division_mod(
make_tuple(number<K0>{}, number<K1>{}, number<K2>{}))),
make_tuple(sequence<0, 2, 3, 4>{}, sequence<1, 5, 6>{}),
make_tuple(sequence<0>{}, sequence<1>{}));
// return a_lds_block_desc_permuted;
return a_lds_block_desc;
}
CK_TILE_HOST_DEVICE static constexpr auto MakeMX_ALDSBytes_TileDistribution()
{
static_assert(BlockWarps::at(I0) == 1, "requires Wave_M == 1");
if constexpr(std::is_same_v<ADataType, pk_fp4_t>)
return make_static_tile_distribution(
tile_distribution_encoding< //
sequence<NWarps>,
tuple<sequence<MWarps, MXdlPack, MPerXdl>, sequence<K_Lane, AK1 / APackedSize>>,
tuple<sequence<1, 0>, sequence<2, 1>>,
tuple<sequence<0, 0>, sequence<0, 2>>,
sequence<2>,
sequence<1>>{});
else if constexpr(std::is_same_v<ADataType, fp8_t>)
return make_static_tile_distribution(
tile_distribution_encoding<
sequence<NWarps>,
tuple<sequence<MWarps, MXdlPack, MPerXdl>,
sequence<K_Thread / AK1, K_Lane, AK1 / APackedSize>>,
tuple<sequence<1, 0>, sequence<2, 1>>,
tuple<sequence<0, 0>, sequence<1, 2>>,
sequence<2, 2>,
sequence<0, 2>>{});
else if constexpr(std::is_same_v<ADataType, pk_fp6x16_t>)
// K_Lane=4, K_Thread=32
return make_static_tile_distribution(
tile_distribution_encoding< //
sequence<NWarps>,
tuple<sequence<MWarps, MXdlPack, MPerXdl>,
sequence<K_Lane, KPerXdl / (K_Lane * APackedSize), DWORDx3>>,
tuple<sequence<1, 0>, sequence<2, 1>>,
tuple<sequence<0, 0>, sequence<0, 2>>,
sequence<2, 2>,
sequence<1, 2>>{});
else
static_assert(false, "unsupported datatype");
}
CK_TILE_HOST_DEVICE static constexpr auto MakeMX_BFlatBytesDramTileDistribution()
{
constexpr index_t K1 = WaveSize; // thread count in the K dimension
constexpr index_t KWavePerBlk = 1;
constexpr index_t K0 = KWavePerBlk;
constexpr index_t WaveRepeat = WaveNum / TileShape::flatNPerWarp;
if constexpr(std::is_same_v<BDataType, pk_fp4_t>)
return make_static_tile_distribution(
tile_distribution_encoding< //
sequence<WaveRepeat>,
tuple<sequence<NWarps, NXdlPack>, // 4 2
sequence<K0, K1, BK1 / BPackedSize>>, // 1 64 16
tuple<sequence<0, 1, 2>, sequence<2>>,
tuple<sequence<0, 0, 0>, sequence<1>>,
sequence<2>,
sequence<2>>{});
else if constexpr(std::is_same_v<BDataType, fp8_t>)
return make_static_tile_distribution(
tile_distribution_encoding< //
sequence<WaveRepeat>,
tuple<sequence<NWarps, NXdlPack>, // 4 2
sequence<K_Thread / BK1, K0, K1, BK1 / BPackedSize>>, // 2 1 64 16
tuple<sequence<0, 1, 2>, sequence<2>>,
tuple<sequence<0, 0, 1>, sequence<2>>,
sequence<2, 2>,
sequence<0, 3>>{});
else if constexpr(std::is_same_v<BDataType, pk_fp6x16_t>)
return make_static_tile_distribution(
tile_distribution_encoding< //
sequence<WaveRepeat>,
tuple<sequence<NWarps, NXdlPack>, // 4 2
sequence<K0,
K1,
K_Thread * sizeof(BDataType) / (DWORDx3 * BPackedSize),
DWORDx3>>, // 64 1 2 12
tuple<sequence<0, 1, 2>, sequence<2>>,
tuple<sequence<0, 0, 0>, sequence<1>>,
sequence<2, 2>,
sequence<2, 3>>{});
else
static_assert(false, "unsupported datatype");
}
template <typename WindowTmp>
CK_TILE_HOST_DEVICE static constexpr auto
MakeMX_BFlatBytesDramWindow(const WindowTmp& window_tmp)
{
constexpr auto M_Warp_Tile = Problem::BlockGemmShape::WarpTile::at(I1);
constexpr auto flatNPerWarp = Problem::BlockGemmShape::flatNPerWarp;
constexpr auto flatKPerWarp = Problem::BlockGemmShape::flatKPerWarp;
static_assert(std::decay_t<decltype(window_tmp)>::get_num_of_dimension() == 2);
auto&& tensor_view_tmp = window_tmp.get_bottom_tensor_view();
const auto [flat_n, flat_k] = tensor_view_tmp.get_tensor_descriptor().get_lengths();
constexpr auto flat_k_per_block = KPerBlock * M_Warp_Tile;
auto&& byte_tensor_desc = transform_tensor_descriptor(
make_naive_tensor_descriptor_packed(
make_tuple(flat_n,
flat_k / flat_k_per_block,
number<flat_k_per_block / BPackedSize * sizeof(BDataType)>{})),
make_tuple(make_pass_through_transform(flat_n),
make_merge_transform_v3_division_mod(make_tuple(
flat_k / flat_k_per_block,
number<flat_k_per_block / BPackedSize * sizeof(BDataType)>{}))),
make_tuple(sequence<0>{}, sequence<1, 2>{}),
make_tuple(sequence<0>{}, sequence<1>{}));
auto&& byte_ptr = reinterpret_cast<const uint8_t*>(&(tensor_view_tmp.get_buffer_view()(0)));
auto&& byte_tensor_view =
make_tensor_view<address_space_enum::global>(byte_ptr, byte_tensor_desc);
auto&& origin_tmp = window_tmp.get_window_origin();
auto origin_n = origin_tmp[0];
auto origin_k = static_cast<int>(origin_tmp[1] * sizeof(BDataType) / BPackedSize);
return make_tile_window(
byte_tensor_view,
make_tuple(number<flatNPerWarp>{},
number<flatKPerWarp * sizeof(BDataType) / BPackedSize>{}),
{origin_n, origin_k},
MakeMX_BFlatBytesDramTileDistribution());
}
CK_TILE_HOST_DEVICE static constexpr auto MakeMX_ScaleA_DramTileDistribution()
{
constexpr index_t M_Lanes = TileShape::WarpTile::at(I0);
constexpr index_t K_Lanes = 64 / M_Lanes;
// Y dimension (M) decomposition
constexpr index_t Y2 = M_Lanes;
constexpr index_t Y1 = MWarps;
constexpr index_t Y0 = MPerBlock / (MXdlPack * Y1 * Y2);
// X dimension (K) decomposition
constexpr index_t X0 = K_Lanes;
constexpr index_t X1 = 1; // packed 2x2 E8M0 data into 1 int32_t for load
return make_static_tile_distribution(
tile_distribution_encoding<sequence<NWarps>, // repeat NWarps
tuple<sequence<Y0, Y1, Y2>, sequence<X0, X1>>,
tuple<sequence<1, 0>, sequence<2, 1>>,
tuple<sequence<1, 0>, sequence<0, 2>>,
sequence<1, 2>,
sequence<0, 1>>{});
}
CK_TILE_HOST_DEVICE static constexpr auto MakeMX_ScaleB_DramTileDistribution()
{
constexpr index_t N_Lanes = TileShape::WarpTile::at(I1);
constexpr index_t K_Lanes = 64 / N_Lanes;
// Y dimension (N) decomposition
constexpr index_t Y2 = N_Lanes;
constexpr index_t Y1 = NWarps;
constexpr index_t Y0 = NPerBlock / (NXdlPack * Y1 * Y2);
// X dimension (K) decomposition
constexpr index_t X0 = K_Lanes;
constexpr index_t X1 = 1; // packed 2x2 E8M0 data into 1 int32_t for load
return make_static_tile_distribution(
tile_distribution_encoding<sequence<MWarps>, // ?
tuple<sequence<Y0, Y1, Y2>, sequence<X0, X1>>,
tuple<sequence<0, 1>, sequence<2, 1>>,
tuple<sequence<0, 1>, sequence<0, 2>>,
sequence<1, 2>,
sequence<0, 1>>{});
}
CK_TILE_HOST_DEVICE static constexpr auto MakeMX_ScaleA_FlatDramTileDistribution()
{
return make_static_tile_distribution(
tile_distribution_encoding<sequence<NWarps>, // ?
tuple<sequence<MWarps, MPerXdl>, // second direction
sequence<K_Lane, 1>>, // first direction
tuple<sequence<1, 0>, sequence<2, 1>>, // which direction
tuple<sequence<0, 0>, sequence<0, 1>>, // which index
// <repeat, vec_load>
sequence<2>,
sequence<1>>{});
}
CK_TILE_HOST_DEVICE static constexpr auto MakeMX_ScaleB_FlatDramTileDistribution()
{
return make_static_tile_distribution(
tile_distribution_encoding<sequence<MWarps>, // ?
tuple<sequence<NWarps, NPerXdl>, // second direction
sequence<K_Lane, 1>>, // first direction
tuple<sequence<0, 1>, sequence<2, 1>>, // which direction
tuple<sequence<0, 0>, sequence<0, 1>>, // which index
// <repeat, vec_load>
sequence<2>,
sequence<1>>{});
}
CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSizeA()
{
if constexpr(!std::is_same_v<ADataType, pk_fp6x16_t>)
{
return sizeof(ADataType) * MakeMX_ALdsBytesBlockDescriptor().get_element_space_size();
}
else
{
return MakeMX_ALdsBytesBlockDescriptor().get_element_space_size();
}
}
CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { return GetSmemSizeA(); }
};
} // namespace detail
struct MXFlatmmPipelineAgBgCrPolicy
{
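// Non-template facade: each method forwards to detail::MXFlatmmPipelineAgBgCrPolicy<Problem>,
// which holds the Problem-derived constants as private members.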
#define FORWARD_METHOD_(method) \
template <typename Problem, typename... Args> \
CK_TILE_HOST_DEVICE static constexpr auto method(Args&&... args) \
{ \
return detail::MXFlatmmPipelineAgBgCrPolicy<Problem>::method(std::forward<Args>(args)...); \
}
FORWARD_METHOD_(GetBlockFlatmm);
FORWARD_METHOD_(MakeMX_AAsyncLoadBytesDramWindow);
FORWARD_METHOD_(MakeMX_ABytesDramTileDistribution);
FORWARD_METHOD_(MakeMX_ALdsBytesBlockDescriptor);
FORWARD_METHOD_(MakeMX_ALDSBytes_TileDistribution);
FORWARD_METHOD_(MakeMX_BFlatBytesDramTileDistribution);
FORWARD_METHOD_(MakeMX_BFlatBytesDramWindow);
FORWARD_METHOD_(MakeMX_ScaleA_DramTileDistribution);
FORWARD_METHOD_(MakeMX_ScaleB_DramTileDistribution);
FORWARD_METHOD_(MakeMX_ScaleA_FlatDramTileDistribution);
FORWARD_METHOD_(MakeMX_ScaleB_FlatDramTileDistribution);
FORWARD_METHOD_(GetSmemSizeA);
FORWARD_METHOD_(GetSmemSize);
#undef FORWARD_METHOD_
};
} // namespace ck_tile


@@ -0,0 +1,47 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/concat.hpp"
namespace ck_tile {
template <typename BlockTile_, typename BlockWarps_, typename WarpTile_>
struct TileFlatmmShape
{
using BlockTile = remove_cvref_t<BlockTile_>;
using BlockWarps = remove_cvref_t<BlockWarps_>;
using WarpTile = remove_cvref_t<WarpTile_>;
static constexpr auto idxM = number<0>{};
static constexpr auto idxN = number<1>{};
static constexpr auto idxK = number<2>{};
static constexpr index_t NumWarps =
reduce_on_sequence(BlockWarps{}, multiplies<>{}, number<1>{});
static constexpr index_t kM = BlockTile::at(idxM);
static constexpr index_t kN = BlockTile::at(idxN);
static constexpr index_t kK = BlockTile::at(idxK);
static constexpr index_t flatNPerWarp = BlockWarps::at(idxN);
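// One warp's NPerXdl x KPerXdl tile of the pre-shuffled ("flat") B tensor is stored
// contiguously, hence: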
static constexpr index_t flatKPerWarp = WarpTile::at(idxK) * WarpTile::at(idxN);
static constexpr index_t flatKPerBlock = flatKPerWarp * kK / WarpTile::at(idxK);
static constexpr bool PermuteA = false;
static constexpr bool PermuteB = false;
CK_TILE_HOST static std::string GetName()
{
// clang-format off
return concat('_', "tile_flatmm_shape",
concat('x', kM, kN, kK, NumWarps),
concat('x', BlockWarps::at(idxM), BlockWarps::at(idxN), BlockWarps::at(idxK)),
concat('x', (WarpTile::at(idxM)), WarpTile::at(idxN), WarpTile::at(idxK)));
// clang-format on
}
};
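// Usage sketch (illustrative only, not from this header): a 128x128x64 block tile
// split across 1x4x1 warps with 16x16x64 warp tiles.
//   using Shape = TileFlatmmShape<sequence<128, 128, 64>,
//                                 sequence<1, 4, 1>,
//                                 sequence<16, 16, 64>>;
//   static_assert(Shape::flatKPerWarp == 16 * 64); // one warp tile of B, linearized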
} // namespace ck_tile