Preshuffle AQ matrix in block scale gemm (#2624)

* Preshuffle AQ matrix in block scale gemm * Turn the output to fp16 and increase the default benchmark repeat count. --------- Co-authored-by: ThomasNing <thomas.ning@amd.com>
2026-04-19 22:39:03 +00:00 · 2025-08-12 22:32:51 -06:00
parent 0f42a92fc1
commit 452791a3ba
13 changed files with 667 additions and 228 deletions
--- a/example/ck_tile/38_block_scale_gemm/CMakeLists.txt
+++ b/example/ck_tile/38_block_scale_gemm/CMakeLists.txt
@@ -8,6 +8,9 @@ list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -mllvm -enable-noalias-to-md-conversion
 if(GPU_TARGETS MATCHES "gfx94" OR GPU_TARGETS MATCHES "gfx95")
    add_executable(tile_example_gemm_aquant_basic EXCLUDE_FROM_ALL gemm_aquant_basic.cpp)
    target_compile_options(tile_example_gemm_aquant_basic PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+
+    add_executable(tile_example_gemm_aquant_preshuffle EXCLUDE_FROM_ALL gemm_aquant_preshuffle.cpp)
+    target_compile_options(tile_example_gemm_aquant_preshuffle PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
 else()
    message(DEBUG "Skipping ck_tile quant gemm tests for current target")
 endif()
--- a/example/ck_tile/38_block_scale_gemm/gemm_aquant_basic.cpp
+++ b/example/ck_tile/38_block_scale_gemm/gemm_aquant_basic.cpp
@@ -21,7 +21,8 @@ template <typename ADataType,
          typename ALayout,
          typename BLayout,
          typename CLayout,
-          uint32_t QuantGroupSize>
+          uint32_t QuantGroupSize,
+          bool Preshuffle = false>
 float gemm_calc_aquant(const ck_tile::AQuantGemmHostArgs& args, const ck_tile::stream_config& s)
 {
    constexpr bool kPadM = false;
@@ -52,7 +53,7 @@ float gemm_calc_aquant(const ck_tile::AQuantGemmHostArgs& args, const ck_tile::s
    using TilePartitioner = ck_tile::GemmTile1DPartitioner<CodegenGemmShape>;

    using CodegenGemmTraits =
-        ck_tile::TileGemmAQuantTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;
+        ck_tile::TileGemmAQuantTraits<kPadM, kPadN, kPadK, Preshuffle, ALayout, BLayout, CLayout>;

    using GemmPipelineProblem = ck_tile::GemmPipelineProblemBase<ADataType,
                                                                 BDataType,
@@ -144,7 +145,7 @@ float gemm_calc_aquant(const ck_tile::AQuantGemmHostArgs& args, const ck_tile::s

 #include "run_gemm_aquant_example.inc"

-template <typename TypeConfig, uint32_t QuantGroupSize>
+template <typename GemmConfig, typename TypeConfig, uint32_t QuantGroupSize>
 int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int argc, char* argv[])
 {
    using Row = ck_tile::tensor_layout::gemm::RowMajor;
@@ -156,7 +157,7 @@ int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int a
    {
        if(a_layout == "R" && b_layout == "C")
        {
-            return run_gemm_example_with_layouts<TypeConfig, QuantGroupSize>(
+            return run_gemm_example_with_layouts<GemmConfig, TypeConfig, QuantGroupSize>(
                argc, argv, Row{}, Row{}, Col{}, Row{});
        }
        else
@@ -172,6 +173,7 @@ int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int a
    return 0;
 }

+template <template <typename PreType> typename GemmConfig>
 int run_gemm_example(int argc, char* argv[])
 {
    auto [result, arg_parser] = create_args(argc, argv);
@@ -186,12 +188,14 @@ int run_gemm_example(int argc, char* argv[])
    {
        using TypeConfig =
            decltype(GemmQuantTypeConfig<ck_tile::fp8_t, ck_tile::fp8_t, ck_tile::half_t>{});
-        return run_gemm_example_prec_type<TypeConfig, 128>(a_layout, b_layout, argc, argv);
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t>, TypeConfig, 128>(
+            a_layout, b_layout, argc, argv);
    }
    else if(data_type == "bf8")
    {
        using TypeConfig = decltype(GemmQuantTypeConfig<ck_tile::bf8_t, ck_tile::bf8_t, float>{});
-        return run_gemm_example_prec_type<TypeConfig, 128>(a_layout, b_layout, argc, argv);
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::bf8_t>, TypeConfig, 128>(
+            a_layout, b_layout, argc, argv);
    }
    else if(data_type == "i4fp8")
    {
@@ -199,7 +203,8 @@ int run_gemm_example(int argc, char* argv[])
                                                        ck_tile::fp8_t,
                                                        float,
                                                        ck_tile::fp8_t>{});
-        return run_gemm_example_prec_type<TypeConfig, 128>(a_layout, b_layout, argc, argv);
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::pk_int4_t>, TypeConfig, 128>(
+            a_layout, b_layout, argc, argv);
    }
    else if(data_type == "i4bf8")
    {
@@ -207,19 +212,22 @@ int run_gemm_example(int argc, char* argv[])
                                                        ck_tile::bf8_t,
                                                        float,
                                                        ck_tile::bf8_t>{});
-        return run_gemm_example_prec_type<TypeConfig, 128>(a_layout, b_layout, argc, argv);
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::pk_int4_t>, TypeConfig, 128>(
+            a_layout, b_layout, argc, argv);
    }
    else if(data_type == "i4f32fp8")
    {
        using TypeConfig =
            decltype(GemmQuantTypeConfig<ck_tile::pk_int4_t, ck_tile::fp8_t, float, float>{});
-        return run_gemm_example_prec_type<TypeConfig, 128>(a_layout, b_layout, argc, argv);
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::pk_int4_t>, TypeConfig, 128>(
+            a_layout, b_layout, argc, argv);
    }
    else if(data_type == "i4f32bf8")
    {
        using TypeConfig =
            decltype(GemmQuantTypeConfig<ck_tile::pk_int4_t, ck_tile::bf8_t, float, float>{});
-        return run_gemm_example_prec_type<TypeConfig, 128>(a_layout, b_layout, argc, argv);
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::pk_int4_t>, TypeConfig, 128>(
+            a_layout, b_layout, argc, argv);
    }
    else
    {
@@ -227,4 +235,4 @@ int run_gemm_example(int argc, char* argv[])
    }
 }

-int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
+int main(int argc, char* argv[]) { return !run_gemm_example<GemmConfigComputeV3>(argc, argv); }
--- a/example/ck_tile/38_block_scale_gemm/gemm_aquant_preshuffle.cpp
+++ b/example/ck_tile/38_block_scale_gemm/gemm_aquant_preshuffle.cpp
@@ -0,0 +1,238 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstring>
+#include <iostream>
+#include <ostream>
+#include <stdexcept>
+#include <string>
+#include <tuple>
+
+#include "ck_tile/core/config.hpp"
+#include "ck_tile/host.hpp"
+#include "gemm_utils.hpp"
+
+template <typename ADataType,
+          typename AQDataType,
+          typename BDataType,
+          typename AccDataType,
+          typename CDataType,
+          typename ComputeDataType,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          uint32_t QuantGroupSize,
+          bool Preshuffle = false>
+float gemm_calc_aquant(const ck_tile::AQuantGemmHostArgs& args, const ck_tile::stream_config& s)
+{
+    constexpr bool kPadM = false;
+    constexpr bool kPadN = false;
+    constexpr bool kPadK = false;
+
+    constexpr int kBlockPerCu = 1;
+
+    static_assert(std::is_same_v<CLayout, ck_tile::tensor_layout::gemm::RowMajor>);
+
+    constexpr ck_tile::index_t M_Tile = 16;
+    constexpr ck_tile::index_t N_Tile = 64;
+    constexpr ck_tile::index_t K_Tile = 256;
+
+    constexpr ck_tile::index_t M_Warp = 1;
+    constexpr ck_tile::index_t N_Warp = 4;
+    constexpr ck_tile::index_t K_Warp = 1;
+
+    constexpr ck_tile::index_t M_Warp_Tile = 16;
+    constexpr ck_tile::index_t N_Warp_Tile = 16;
+    constexpr ck_tile::index_t K_Warp_Tile = 32;
+
+    using CodegenGemmShape =
+        ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
+                               ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
+                               ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
+
+    using TilePartitioner = ck_tile::GemmTile1DPartitioner<CodegenGemmShape>;
+
+    using CodegenGemmTraits =
+        ck_tile::TileGemmAQuantTraits<kPadM, kPadN, kPadK, Preshuffle, ALayout, BLayout, CLayout>;
+
+    using GemmPipelineProblem = ck_tile::GemmPipelineProblemBase<ADataType,
+                                                                 BDataType,
+                                                                 AccDataType,
+                                                                 CodegenGemmShape,
+                                                                 CodegenGemmTraits,
+                                                                 ComputeDataType>;
+
+    using BaseGemmPipeline = ck_tile::BaseAQuantGemmPipelineAgBgCrCompV3<GemmPipelineProblem>;
+
+    const ck_tile::index_t K_split      = (args.K + K_Tile - 1) / K_Tile * K_Tile;
+    const ck_tile::index_t num_loop     = TilePartitioner::GetLoopNum(K_split);
+    const bool has_hot_loop             = BaseGemmPipeline::BlockHasHotloop(num_loop);
+    const ck_tile::TailNumber tail_num  = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
+    constexpr bool transposed_warp_gemm = false;
+
+    const auto Run = [&](const auto has_hot_loop_, const auto tail_number_) {
+        constexpr bool has_hot_loop_v = has_hot_loop_.value;
+        constexpr auto tail_number_v  = tail_number_.value;
+
+        using CodegenPipelineProblem =
+            ck_tile::GemmAQuantPipelineProblem<ADataType,
+                                               AQDataType,
+                                               BDataType,
+                                               AccDataType,
+                                               CodegenGemmShape,
+                                               CodegenGemmTraits,
+                                               QuantGroupSize,
+                                               ComputeDataType,
+                                               ck_tile::GemmPipelineScheduler::Intrawave,
+                                               has_hot_loop_v,
+                                               tail_number_v>;
+        using CodegenGemmPipeline = ck_tile::AQuantGemmPipelineAgBgCrCompV3<CodegenPipelineProblem>;
+        using GemmEpilogue        = ck_tile::CShuffleEpilogue<
+                   ck_tile::CShuffleEpilogueProblem<ADataType,
+                                                    BDataType,
+                                                    ck_tile::tuple<>,
+                                                    AccDataType,
+                                                    CDataType,
+                                                    ck_tile::tuple<>,
+                                                    CLayout,
+                                                    ck_tile::element_wise::PassThrough,
+                                                    CodegenPipelineProblem::kBlockSize,
+                                                    TilePartitioner::MPerBlock,
+                                                    TilePartitioner::NPerBlock,
+                                                    M_Warp,
+                                                    N_Warp,
+                                                    M_Warp_Tile,
+                                                    N_Warp_Tile,
+                                                    K_Warp_Tile,
+                                                    transposed_warp_gemm,
+                                                    ck_tile::memory_operation_enum::set>>;
+        using Kernel =
+            ck_tile::AQuantGemmKernel<TilePartitioner, CodegenGemmPipeline, GemmEpilogue>;
+
+        auto kargs = Kernel::MakeKernelArgs(args);
+
+        const dim3 grids      = Kernel::GridSize(args.M, args.N, args.k_batch);
+        constexpr dim3 blocks = Kernel::BlockSize();
+
+        if(args.k_batch != 1)
+        {
+            throw std::runtime_error("split-k is not supported yet!");
+        }
+
+        if(!Kernel::IsSupportedArgument(kargs))
+        {
+            throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
+        }
+
+        if(s.log_level_ > 0)
+        {
+            std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
+                      << "shape: " << CodegenGemmShape::GetName() << '\n'
+                      << "problem: " << CodegenPipelineProblem::GetName() << '\n'
+                      << "pipeline: " << CodegenGemmPipeline::GetName() << '\n'
+                      << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
+                      << std::endl;
+        }
+
+        float ave_time = ck_tile::launch_kernel(
+            s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+
+        return ave_time;
+    };
+    return BaseGemmPipeline::TailHandler(Run, has_hot_loop, tail_num);
+}
+
+#include "run_gemm_aquant_example.inc"
+
+template <typename GemmConfig, typename TypeConfig, uint32_t QuantGroupSize>
+int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int argc, char* argv[])
+{
+    using Row = ck_tile::tensor_layout::gemm::RowMajor;
+    using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
+
+    if constexpr(std::is_same_v<typename TypeConfig::ADataType, ck_tile::pk_int4_t> ||
+                 std::is_same_v<typename TypeConfig::ADataType, ck_tile::fp8_t> ||
+                 std::is_same_v<typename TypeConfig::ADataType, ck_tile::bf8_t>)
+    {
+        if(a_layout == "R" && b_layout == "C")
+        {
+            return run_gemm_example_with_layouts<GemmConfig, TypeConfig, QuantGroupSize>(
+                argc, argv, Row{}, Row{}, Col{}, Row{});
+        }
+        else
+        {
+            throw std::runtime_error("Unsupported memory layout for the input matrices!");
+        }
+    }
+    else
+    {
+        throw std::runtime_error("Unsupported data type for A.");
+    }
+
+    return 0;
+}
+
+template <template <typename PreType> typename GemmConfig>
+int run_gemm_example(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    std::string data_type = arg_parser.get_str("prec");
+    std::string a_layout  = arg_parser.get_str("a_layout");
+    std::string b_layout  = arg_parser.get_str("b_layout");
+
+    if(data_type == "fp8")
+    {
+        using TypeConfig =
+            decltype(GemmQuantTypeConfig<ck_tile::fp8_t, ck_tile::fp8_t, ck_tile::half_t>{});
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t>, TypeConfig, 128>(
+            a_layout, b_layout, argc, argv);
+    }
+    else if(data_type == "bf8")
+    {
+        using TypeConfig = decltype(GemmQuantTypeConfig<ck_tile::bf8_t, ck_tile::bf8_t, float>{});
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::bf8_t>, TypeConfig, 128>(
+            a_layout, b_layout, argc, argv);
+    }
+    else if(data_type == "i4fp8")
+    {
+        using TypeConfig = decltype(GemmQuantTypeConfig<ck_tile::pk_int4_t,
+                                                        ck_tile::fp8_t,
+                                                        float,
+                                                        ck_tile::fp8_t>{});
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::pk_int4_t>, TypeConfig, 128>(
+            a_layout, b_layout, argc, argv);
+    }
+    else if(data_type == "i4bf8")
+    {
+        using TypeConfig = decltype(GemmQuantTypeConfig<ck_tile::pk_int4_t,
+                                                        ck_tile::bf8_t,
+                                                        float,
+                                                        ck_tile::bf8_t>{});
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::pk_int4_t>, TypeConfig, 128>(
+            a_layout, b_layout, argc, argv);
+    }
+    else if(data_type == "i4f32fp8")
+    {
+        using TypeConfig =
+            decltype(GemmQuantTypeConfig<ck_tile::pk_int4_t, ck_tile::fp8_t, float, float>{});
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::pk_int4_t>, TypeConfig, 128>(
+            a_layout, b_layout, argc, argv);
+    }
+    else if(data_type == "i4f32bf8")
+    {
+        using TypeConfig =
+            decltype(GemmQuantTypeConfig<ck_tile::pk_int4_t, ck_tile::bf8_t, float, float>{});
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::pk_int4_t>, TypeConfig, 128>(
+            a_layout, b_layout, argc, argv);
+    }
+    else
+    {
+        throw std::runtime_error("Unsupported data type for this operation !!!");
+    }
+}
+
+int main(int argc, char* argv[]) { return !run_gemm_example<GemmConfigPreshufle_AQ>(argc, argv); }
--- a/example/ck_tile/38_block_scale_gemm/gemm_utils.hpp
+++ b/example/ck_tile/38_block_scale_gemm/gemm_utils.hpp
@@ -35,7 +35,7 @@ constexpr ck_tile::index_t get_k_warp_tile()
 #endif
 }
 template <typename PrecType, ck_tile::index_t M_Warp_Tile>
-constexpr ck_tile::index_t get_k_warp_tile_flatmm()
+constexpr ck_tile::index_t get_k_from_preshuffled_warp_tile()
 {
 #if defined(__gfx950__)
    if constexpr(M_Warp_Tile == 32)
@@ -138,7 +138,7 @@ struct GemmConfigComputeV3 : public GemmConfigBase
    // Compute V3 only support Intrawave scheduler
    static constexpr ck_tile::index_t M_Tile = 32;
    static constexpr ck_tile::index_t N_Tile = 128;
-    static constexpr ck_tile::index_t K_Tile = 256;
+    static constexpr ck_tile::index_t K_Tile = 256 / sizeof(PrecType);

    static constexpr ck_tile::index_t M_Warp = 1;
    static constexpr ck_tile::index_t N_Warp = 4;
@@ -265,7 +265,8 @@ struct GemmConfigPreshufle_1 : public GemmConfigBase

    static constexpr ck_tile::index_t M_Warp_Tile = 32;
    static constexpr ck_tile::index_t N_Warp_Tile = 32;
-    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile_flatmm<PrecType, M_Warp_Tile>();
+    static constexpr ck_tile::index_t K_Warp_Tile =
+        get_k_from_preshuffled_warp_tile<PrecType, M_Warp_Tile>();

    static constexpr int kBlockPerCu           = 2;
    static constexpr auto Scheduler            = ck_tile::GemmPipelineScheduler::Default;
@@ -287,7 +288,8 @@ struct GemmConfigPreshufle_2 : public GemmConfigBase

    static constexpr ck_tile::index_t M_Warp_Tile = 16;
    static constexpr ck_tile::index_t N_Warp_Tile = 16;
-    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile_flatmm<PrecType, M_Warp_Tile>();
+    static constexpr ck_tile::index_t K_Warp_Tile =
+        get_k_from_preshuffled_warp_tile<PrecType, M_Warp_Tile>();

    static constexpr int kBlockPerCu           = 2;
    static constexpr auto Scheduler            = ck_tile::GemmPipelineScheduler::Default;
@@ -296,62 +298,25 @@ struct GemmConfigPreshufle_2 : public GemmConfigBase
    static constexpr bool DoubleSmemBuffer     = false;
 };

-template <typename ADataType, typename BDataType = ADataType, typename CDataType = ADataType>
-struct GemmTypeConfig;
-
-template <>
-struct GemmTypeConfig<ck_tile::half_t>
+template <typename PrecType>
+struct GemmConfigPreshufle_AQ : public GemmConfigBase
 {
-    using ADataType   = ck_tile::half_t;
-    using BDataType   = ck_tile::half_t;
-    using AccDataType = float;
-    using CDataType   = ck_tile::half_t;
-    // ToDo: Add more bias config to support different categories of GEMM.
-};
+    static constexpr ck_tile::index_t M_Tile = 16;
+    static constexpr ck_tile::index_t N_Tile = 64;
+    static constexpr ck_tile::index_t K_Tile = 256 / sizeof(PrecType);

-template <>
-struct GemmTypeConfig<ck_tile::bf16_t, ck_tile::bf16_t, ck_tile::bf16_t>
-{
-    using ADataType   = ck_tile::bf16_t;
-    using BDataType   = ck_tile::bf16_t;
-    using AccDataType = float;
-    using CDataType   = ck_tile::bf16_t;
-};
+    static constexpr ck_tile::index_t M_Warp = 1;
+    static constexpr ck_tile::index_t N_Warp = 4;
+    static constexpr ck_tile::index_t K_Warp = 1;

-template <>
-struct GemmTypeConfig<ck_tile::fp8_t, ck_tile::fp8_t, ck_tile::half_t>
-{
-    using ADataType   = ck_tile::fp8_t;
-    using BDataType   = ck_tile::fp8_t;
-    using AccDataType = float;
-    using CDataType   = ck_tile::half_t;
-};
+    static constexpr ck_tile::index_t M_Warp_Tile = 16;
+    static constexpr ck_tile::index_t N_Warp_Tile = 16;
+    static constexpr ck_tile::index_t K_Warp_Tile =
+        get_k_from_preshuffled_warp_tile<PrecType, M_Warp_Tile>();

-template <>
-struct GemmTypeConfig<ck_tile::bf8_t, ck_tile::bf8_t, ck_tile::half_t>
-{
-    using ADataType   = ck_tile::bf8_t;
-    using BDataType   = ck_tile::bf8_t;
-    using AccDataType = float;
-    using CDataType   = ck_tile::half_t;
-};
-
-template <>
-struct GemmTypeConfig<ck_tile::half_t, ck_tile::pk_int4_t, ck_tile::half_t>
-{
-    using ADataType   = ck_tile::half_t;
-    using BDataType   = ck_tile::pk_int4_t;
-    using AccDataType = float;
-    using CDataType   = ck_tile::half_t;
-};
-
-template <>
-struct GemmTypeConfig<ck_tile::int8_t, ck_tile::int8_t, int32_t>
-{
-    using ADataType   = ck_tile::int8_t;
-    using BDataType   = ck_tile::int8_t;
-    using AccDataType = int32_t;
-    using CDataType   = int32_t;
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_PRESHUFFLE;
+    static constexpr bool Preshuffle           = true;
+    static constexpr bool DoubleSmemBuffer     = false;
 };

 template <typename ADataType_,
@@ -424,7 +389,7 @@ struct GemmQuantTypeConfig<ck_tile::fp8_t, ck_tile::fp8_t, float>
    using QDataType   = float;
    using BDataType   = ck_tile::fp8_t;
    using AccDataType = float;
-    using CDataType   = float;
+    using CDataType   = ck_tile::half_t;
 };

 template <>
@@ -434,7 +399,7 @@ struct GemmQuantTypeConfig<ck_tile::bf8_t, ck_tile::bf8_t, float>
    using QDataType   = float;
    using BDataType   = ck_tile::bf8_t;
    using AccDataType = float;
-    using CDataType   = float;
+    using CDataType   = ck_tile::half_t;
 };

 template <>
@@ -444,7 +409,7 @@ struct GemmQuantTypeConfig<ck_tile::pk_int4_t, ck_tile::fp8_t, float, ck_tile::f
    using QDataType   = ck_tile::fp8_t;
    using BDataType   = ck_tile::fp8_t;
    using AccDataType = float;
-    using CDataType   = float;
+    using CDataType   = ck_tile::half_t;
 };

 template <>
@@ -454,7 +419,7 @@ struct GemmQuantTypeConfig<ck_tile::fp8_t, ck_tile::fp8_t, float, ck_tile::fp8_t
    using QDataType   = ck_tile::fp8_t;
    using BDataType   = ck_tile::fp8_t;
    using AccDataType = float;
-    using CDataType   = float;
+    using CDataType   = ck_tile::half_t;
 };

 template <>
@@ -464,7 +429,7 @@ struct GemmQuantTypeConfig<ck_tile::bf8_t, ck_tile::bf8_t, float, ck_tile::bf8_t
    using QDataType   = ck_tile::bf8_t;
    using BDataType   = ck_tile::bf8_t;
    using AccDataType = float;
-    using CDataType   = float;
+    using CDataType   = ck_tile::half_t;
 };

 template <>
@@ -474,7 +439,7 @@ struct GemmQuantTypeConfig<ck_tile::pk_int4_t, ck_tile::bf8_t, float, ck_tile::b
    using QDataType   = ck_tile::bf8_t;
    using BDataType   = ck_tile::bf8_t;
    using AccDataType = float;
-    using CDataType   = float;
+    using CDataType   = ck_tile::half_t;
 };

 template <>
@@ -484,7 +449,7 @@ struct GemmQuantTypeConfig<ck_tile::pk_int4_t, ck_tile::fp8_t, float, float>
    using QDataType   = float;
    using BDataType   = ck_tile::fp8_t;
    using AccDataType = float;
-    using CDataType   = float;
+    using CDataType   = ck_tile::half_t;
 };

 template <>
@@ -494,7 +459,7 @@ struct GemmQuantTypeConfig<ck_tile::pk_int4_t, ck_tile::bf8_t, float, float>
    using QDataType   = float;
    using BDataType   = ck_tile::bf8_t;
    using AccDataType = float;
-    using CDataType   = float;
+    using CDataType   = ck_tile::half_t;
 };

 template <>
@@ -504,7 +469,7 @@ struct GemmQuantTypeConfig<ck_tile::fp8_t, ck_tile::pk_int4_t, float, ck_tile::f
    using QDataType   = ck_tile::fp8_t;
    using BDataType   = ck_tile::pk_int4_t;
    using AccDataType = float;
-    using CDataType   = float;
+    using CDataType   = ck_tile::half_t;
 };

 template <>
@@ -514,7 +479,7 @@ struct GemmQuantTypeConfig<ck_tile::bf8_t, ck_tile::pk_int4_t, float, ck_tile::b
    using QDataType   = ck_tile::bf8_t;
    using BDataType   = ck_tile::pk_int4_t;
    using AccDataType = float;
-    using CDataType   = float;
+    using CDataType   = ck_tile::half_t;
 };

 template <>
@@ -524,7 +489,7 @@ struct GemmQuantTypeConfig<ck_tile::fp8_t, ck_tile::pk_int4_t, float, float>
    using QDataType   = float;
    using BDataType   = ck_tile::pk_int4_t;
    using AccDataType = float;
-    using CDataType   = float;
+    using CDataType   = ck_tile::half_t;
 };

 template <>
@@ -534,7 +499,7 @@ struct GemmQuantTypeConfig<ck_tile::bf8_t, ck_tile::pk_int4_t, float, float>
    using QDataType   = float;
    using BDataType   = ck_tile::pk_int4_t;
    using AccDataType = float;
-    using CDataType   = float;
+    using CDataType   = ck_tile::half_t;
 };

 template <typename T>
@@ -660,7 +625,7 @@ auto create_args(int argc, char* argv[])
        .insert("v", "2", "0. No validation, 1. Validation on CPU, 2. Validation on GPU")
        .insert("prec", "i4fp8", "data type. fp8/bf8/i4fp8/i4bf8/i4f32fp8/i4f32bf8")
        .insert("warmup", "50", "number of iterations before benchmark the kernel")
-        .insert("repeat", "100", "number of iterations to benchmark the kernel")
+        .insert("repeat", "1000", "number of iterations to benchmark the kernel")
        .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
        .insert("split_k", "1", "splitK value")
        .insert("init", "0", "0:random, 1:linear, 2:constant(1)")
--- a/example/ck_tile/38_block_scale_gemm/run_gemm_aquant_example.inc
+++ b/example/ck_tile/38_block_scale_gemm/run_gemm_aquant_example.inc
@@ -4,6 +4,7 @@
 #pragma once
 #include <bit>
 #include <random>
+#include <stdexcept>

 template <typename Layout>
 static constexpr inline auto is_row_major(Layout layout_)
@@ -12,6 +13,24 @@ static constexpr inline auto is_row_major(Layout layout_)
                                                 ck_tile::tensor_layout::gemm::RowMajor>>{};
 }

+template <typename T>
+auto shuffle_aq(const ck_tile::HostTensor<T>& t, int block_aq_k)
+{
+    if(t.get_lengths().size() != 2)
+    {
+        throw std::runtime_error("Host tensor is not rank 2 tensor.");
+    }
+    int m_   = t.get_lengths()[0];
+    int aqk_ = t.get_lengths()[1];
+    if(aqk_ % block_aq_k != 0)
+    {
+        throw std::runtime_error("shuffle_aq needs a aqk of multiple times of block_aq_k.");
+    }
+    ck_tile::HostTensor<T> t_view({m_, aqk_ / block_aq_k, block_aq_k});
+    std::copy(t.begin(), t.end(), t_view.begin());
+    return ck_tile::reference_permute(t_view, {1, 0, 2});
+}
+
 template <typename ADataType,
          typename AQDataType,
          typename BDataType,
@@ -21,7 +40,8 @@ template <typename ADataType,
          typename AQLayout,
          typename BLayout,
          typename CLayout,
-          uint32_t QuantGroupSize>
+          uint32_t QuantGroupSize,
+          bool Preshuffle = false>
 float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
                  ck_tile::DeviceMem& aq_m_aqk_dev_buf,
                  ck_tile::DeviceMem& b_k_n_dev_buf,
@@ -62,7 +82,8 @@ float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
                                      ALayout,
                                      BLayout,
                                      CLayout,
-                                      QuantGroupSize>(
+                                      QuantGroupSize,
+                                      Preshuffle>(
        args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat});

    std::size_t flop     = std::size_t(2) * M * N * K;
@@ -85,7 +106,8 @@ float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
    return ave_time;
 }

-template <typename TypeConfig,
+template <typename GemmConfig,
+          typename TypeConfig,
          uint32_t QuantGroupSize,
          typename ALayout,
          typename AQLayout,
@@ -184,8 +206,18 @@ int run_gemm_example_with_layouts(int argc,
    ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes());
    ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes());

+    if constexpr(GemmConfig::Preshuffle)
+    {
+        ck_tile::HostTensor<AQDataType> aq_shuffle_host =
+            shuffle_aq(aq_m_aqk, GemmConfig::K_Tile / QuantGroupSize);
+        aq_m_aqk_dev_buf.ToDevice(aq_shuffle_host.data());
+    }
+    else
+    {
+        aq_m_aqk_dev_buf.ToDevice(aq_m_aqk.data());
+    }
+
    a_m_k_dev_buf.ToDevice(a_m_k.data());
-    aq_m_aqk_dev_buf.ToDevice(aq_m_aqk.data());
    b_k_n_dev_buf.ToDevice(b_k_n.data());
    c_m_n_dev_buf.SetZero();
    c_m_n_dev_result.SetZero();
@@ -199,21 +231,22 @@ int run_gemm_example_with_layouts(int argc,
                AQLayout,
                BLayout,
                CLayout,
-                QuantGroupSize>(a_m_k_dev_buf,
-                                aq_m_aqk_dev_buf,
-                                b_k_n_dev_buf,
-                                c_m_n_dev_buf,
-                                M,
-                                N,
-                                K,
-                                AQK,
-                                stride_A,
-                                stride_AQ,
-                                stride_B,
-                                stride_C,
-                                kbatch,
-                                n_warmup,
-                                n_repeat);
+                QuantGroupSize,
+                GemmConfig::Preshuffle>(a_m_k_dev_buf,
+                                        aq_m_aqk_dev_buf,
+                                        b_k_n_dev_buf,
+                                        c_m_n_dev_buf,
+                                        M,
+                                        N,
+                                        K,
+                                        AQK,
+                                        stride_A,
+                                        stride_AQ,
+                                        stride_B,
+                                        stride_C,
+                                        kbatch,
+                                        n_warmup,
+                                        n_repeat);

    c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
    bool pass = true;
--- a/include/ck_tile/ops/gemm_group_quant/block/block_universal_gemm_as_aquant_bs_cr.hpp
+++ b/include/ck_tile/ops/gemm_group_quant/block/block_universal_gemm_as_aquant_bs_cr.hpp
@@ -156,6 +156,8 @@ struct AQuantBlockUniversalGemmAsBsCr : public BlockGemmQuantBase<Problem_>

        static constexpr index_t KPack      = WarpGemm::kKPerThread;
        static constexpr index_t KPerThread = KIterPerWarp * WarpGemm::kKPerThread;
+
+        static constexpr bool Preshuffle = Problem::Traits::Preshuffle;
    };

    public:
@@ -322,6 +324,7 @@ struct AQuantBlockUniversalGemmAsBsCr : public BlockGemmQuantBase<Problem_>
            static_assert(std::is_same_v<CDataType, typename CBlockTensor::DataType>,
                          "The CDataType as defined in traits should be the same as correspoinding "
                          "C block tensor data type!");
+            constexpr auto warp_size = get_warp_size();

            // hot loop:
            static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
@@ -354,82 +357,153 @@ struct AQuantBlockUniversalGemmAsBsCr : public BlockGemmQuantBase<Problem_>
                            }
                        });

-                        // Need to multiply aquant with accumulated C
-                        //
-                        // The accumulated C tile has the standard distribution. For example
-                        // lane 0 holds elements [0,0], [1,0], [2,0], [3,0], [8,0], [9,0],
-                        // [10,0], [11,0], [16,0], [17,0], [18,0], [19,0], [24,0], [25,0],
-                        // [26,0], [27,0].
-                        //
-                        // These elements are in different rows, need to get the scale value
-                        // for the corresponding row.
-                        // Based on aquant's tile distribution, it can be inferred which
-                        // lane holds the relevant scale. For example, the scales corresponding
-                        // to the 16 elements held by lane 0 are held by lanes 0, 1, 2, 3, 8, 9,
-                        // 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 respectively.
-                        //
-                        // These scales can be obtained using __builtin_amdgcn_ds_bpermute.
+                        if constexpr(Traits::Preshuffle)
+                        {
+                            // A view is created on top of the preshuffled AQ, where each row of the
+                            // view is composed of a row from a warp tile within an AQ block tile.
+                            // Multiple warp tile rows that belong to the same block tile are laid
+                            // out as consecutive rows.
+                            //
+                            // When we need to multiply a C warp tile with an AQ warp tile, thread 0
+                            // in the warp will load AQ_warp_tile[0], thread 1 will load
+                            // AQ_warp_tile[1], and so on, up to thread 63, which will load
+                            // AQ_warp_tile[63]. The VGPR file in the warp acts similarly to LDS in
+                            // this context, but we use cross-lane operations to access the data.
+                            // (Cross-lane operations are faster than using LDS.)
+                            //
+                            // Note that when the size of the AQ warp tile is smaller than the warp
+                            // size, you need to pad the rows in the view to ensure that each thread
+                            // can read one element.
+                            constexpr auto tbuf_offset =
+                                number<typename CBlockTensor::ThreadTensorDesc{}.calculate_offset(
+                                           merge_sequences(sequence<mIter, nIter>{},
+                                                           c_warp_y_index_zeros)) /
+                                       CBlockTensor::PackedSize>{};
+                            constexpr uint32_t kTileRowsOfCPerThread = 4;

-                        // MIters per warp
-                        constexpr index_t mIters_per_warp = get_warp_size() / WarpGemm::kM;
+                            static_for<0, WarpGemm::kM * WarpGemm::kN / warp_size, 1>{}(
+                                [&](auto c_row) {
+                                    // For a warp tile of [16x16x32], take thread 0 as an example.
+                                    // Its VGPR[0] stores the value from C_tile[0,0], VGPR[1] stores
+                                    // C_tile[1,0], VGPR[2] stores C_tile[2,0], and VGPR[3] stores
+                                    // C_tile[3,0]. This means VGPR[0] should be multiplied by
+                                    // AQ_tile[0, 0], VGPR[1] by AQ_tile[1, 0], VGPR[2] by
+                                    // AQ_tile[2, 0], and VGPR[3] by AQ_tile[3, 0].

-                        // Reg block offset based on mIter
-                        constexpr index_t reg_block_offset =
-                            ((mIter / mIters_per_warp) * Traits::AQPerBlock);
+                                    // Thread 0 reads AQ_tile[r, kQScale], r = 0..3, from lane
+                                    // r * Traits::QScalesPerBlockRow + kQScale (see below).
+                                    auto pull_from_lane =
+                                        ((threadIdx.x & (warp_size - 1)) / Traits::WarpGemm::kN *
+                                             kTileRowsOfCPerThread +
+                                         c_row) *
+                                            Traits::QScalesPerBlockRow +
+                                        kQScale;
+                                    auto& scale_reg = aq_block_tensor.get_thread_buffer()[mIter];

-                        constexpr index_t lane_base_offset =
-                            (mIter % mIters_per_warp) * WarpGemm::kM;
+                                    // cross lane ops
+                                    uint32_t scale_reg_dword;

-                        // Scale tensor offset along K
-                        constexpr index_t src_reg_offset = reg_block_offset + kQScale;
+                                    if constexpr(std::is_same_v<AQDataType, float>)
+                                    {
+                                        scale_reg_dword = ck_tile::bit_cast<uint32_t>(scale_reg);
+                                    }
+                                    else
+                                    {
+                                        scale_reg_dword = static_cast<uint32_t>(scale_reg);
+                                    }

-                        constexpr uint32_t kTileRows        = 4;
-                        constexpr uint32_t kTiledCMsPerWarp = WarpGemm::kCMLane * kTileRows;
+                                    int gathered_scale_reg = __builtin_amdgcn_ds_bpermute(
+                                        pull_from_lane << 2,
+                                        __builtin_bit_cast(int, scale_reg_dword));

-                        constexpr auto tbuf_offset =
-                            number<typename CBlockTensor::ThreadTensorDesc{}.calculate_offset(
-                                       merge_sequences(sequence<mIter, nIter>{},
-                                                       c_warp_y_index_zeros)) /
-                                   CBlockTensor::PackedSize>{};
+                                    float scale_reg_f = Base::cvt_scale_to_fp32(gathered_scale_reg);

-                        static_for<0, WarpGemm::kM, WarpGemm::kCMLane>{}([&](auto c_row) {
-                            // Multiply by 4 because output is stored in tiles of 4
-                            // x CNLane
-                            constexpr uint32_t row_base =
-                                ((c_row / kTiledCMsPerWarp) * kTiledCMsPerWarp) +
-                                ((c_row % kTiledCMsPerWarp) / WarpGemm::kCMLane);
+                                    c_block_tensor.get_thread_buffer()[tbuf_offset + c_row] +=
+                                        (c_warp_tensor.get_thread_buffer()[c_row] * scale_reg_f *
+                                         kA_cvt_scale * kB_cvt_scale);
+                                });
+                        }
+                        else
+                        {
+                            // Need to multiply aquant with accumulated C
+                            //
+                            // The accumulated C tile has the standard distribution. For example
+                            // lane 0 holds elements [0,0], [1,0], [2,0], [3,0], [8,0], [9,0],
+                            // [10,0], [11,0], [16,0], [17,0], [18,0], [19,0], [24,0], [25,0],
+                            // [26,0], [27,0].
+                            //
+                            // These elements are in different rows, need to get the scale value
+                            // for the corresponding row.
+                            // Based on aquant's tile distribution, it can be inferred which
+                            // lane holds the relevant scale. For example, the scales corresponding
+                            // to the 16 elements held by lane 0 are held by lanes 0, 1, 2, 3, 8, 9,
+                            // 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 respectively.
+                            //
+                            // These scales can be obtained using __builtin_amdgcn_ds_bpermute.

-                            constexpr uint32_t reg_offset_for_row_data = c_row / WarpGemm::kCMLane;
+                            // MIters per warp
+                            constexpr index_t mIters_per_warp = get_warp_size() / WarpGemm::kM;

-                            // Lane index to source scale from
-                            uint32_t src_lane_idx = lane_base_offset + row_base +
-                                                    (__lane_id() / WarpGemm::kN * kTileRows);
+                            // Reg block offset based on mIter
+                            constexpr index_t reg_block_offset =
+                                ((mIter / mIters_per_warp) * Traits::AQPerBlock);

-                            // Directly index into thread buffer corresponding to
-                            // desired row coefficient
-                            auto& scale_reg = aq_block_tensor.get_thread_buffer()[src_reg_offset];
-                            uint32_t scale_reg_dword;
+                            constexpr index_t lane_base_offset =
+                                (mIter % mIters_per_warp) * WarpGemm::kM;

-                            if constexpr(std::is_same_v<AQDataType, float>)
-                            {
-                                scale_reg_dword = ck_tile::bit_cast<uint32_t>(scale_reg);
-                            }
-                            else
-                            {
-                                scale_reg_dword = static_cast<uint32_t>(scale_reg);
-                            }
+                            // Scale tensor offset along K
+                            constexpr index_t src_reg_offset = reg_block_offset + kQScale;

-                            // Pull scale data across lanes
-                            int gathered_scale_reg = __builtin_amdgcn_ds_bpermute(
-                                src_lane_idx * 4, __builtin_bit_cast(int, scale_reg_dword));
+                            constexpr uint32_t kTileRows        = 4;
+                            constexpr uint32_t kTiledCMsPerWarp = WarpGemm::kCMLane * kTileRows;

-                            float scale_reg_f = Base::cvt_scale_to_fp32(gathered_scale_reg);
+                            constexpr auto tbuf_offset =
+                                number<typename CBlockTensor::ThreadTensorDesc{}.calculate_offset(
+                                           merge_sequences(sequence<mIter, nIter>{},
+                                                           c_warp_y_index_zeros)) /
+                                       CBlockTensor::PackedSize>{};

-                            c_block_tensor
-                                .get_thread_buffer()[tbuf_offset + reg_offset_for_row_data] +=
-                                (c_warp_tensor.get_thread_buffer()[reg_offset_for_row_data] *
-                                 scale_reg_f * kA_cvt_scale * kB_cvt_scale);
-                        });
+                            static_for<0, WarpGemm::kM, WarpGemm::kCMLane>{}([&](auto c_row) {
+                                // Multiply by 4 because output is stored in tiles of 4
+                                // x CNLane
+                                constexpr uint32_t row_base =
+                                    ((c_row / kTiledCMsPerWarp) * kTiledCMsPerWarp) +
+                                    ((c_row % kTiledCMsPerWarp) / WarpGemm::kCMLane);
+
+                                constexpr uint32_t reg_offset_for_row_data =
+                                    c_row / WarpGemm::kCMLane;
+
+                                // Lane index to source scale from
+                                uint32_t src_lane_idx = lane_base_offset + row_base +
+                                                        (__lane_id() / WarpGemm::kN * kTileRows);
+
+                                // Directly index into thread buffer corresponding to
+                                // desired row coefficient
+                                auto& scale_reg =
+                                    aq_block_tensor.get_thread_buffer()[src_reg_offset];
+                                uint32_t scale_reg_dword;
+
+                                if constexpr(std::is_same_v<AQDataType, float>)
+                                {
+                                    scale_reg_dword = ck_tile::bit_cast<uint32_t>(scale_reg);
+                                }
+                                else
+                                {
+                                    scale_reg_dword = static_cast<uint32_t>(scale_reg);
+                                }
+
+                                // Pull scale data across lanes
+                                int gathered_scale_reg = __builtin_amdgcn_ds_bpermute(
+                                    src_lane_idx * 4, __builtin_bit_cast(int, scale_reg_dword));
+
+                                float scale_reg_f = Base::cvt_scale_to_fp32(gathered_scale_reg);
+
+                                c_block_tensor
+                                    .get_thread_buffer()[tbuf_offset + reg_offset_for_row_data] +=
+                                    (c_warp_tensor.get_thread_buffer()[reg_offset_for_row_data] *
+                                     scale_reg_f * kA_cvt_scale * kB_cvt_scale);
+                            });
+                        }
                    });
                });
            });
--- a/include/ck_tile/ops/gemm_group_quant/kernel/gemm_aquant_kernel.hpp
+++ b/include/ck_tile/ops/gemm_group_quant/kernel/gemm_aquant_kernel.hpp
@@ -3,11 +3,14 @@

 #pragma once

-#include <iostream>
 #include <string>

 #include "ck_tile/core.hpp"
-#include "ck_tile/ops/common.hpp"
+#include "ck_tile/core/algorithm/coordinate_transform.hpp"
+#include "ck_tile/core/arch/arch.hpp"
+#include "ck_tile/core/container/tuple.hpp"
+#include "ck_tile/core/numeric/integer.hpp"
+#include "ck_tile/core/numeric/math.hpp"
 #include "ck_tile/host/concat.hpp"

 namespace ck_tile {
@@ -104,6 +107,7 @@ struct AQuantGemmKernel
    using BLayout                            = remove_cvref_t<typename GemmPipeline::BLayout>;
    using CLayout                            = remove_cvref_t<typename GemmPipeline::CLayout>;
    static constexpr index_t KernelBlockSize = GemmPipeline::BlockSize;
+    static constexpr bool Preshuffle         = GemmPipeline::Preshuffle;

    using ADataType  = remove_cvref_t<typename GemmPipeline::ADataType>;
    using AQDataType = remove_cvref_t<typename GemmPipeline::AQDataType>;
@@ -157,7 +161,7 @@ struct AQuantGemmKernel
        __device__ SplitKBatchOffset(const AQuantGemmKernelArgs& kargs,
                                     const std::size_t k_id = blockIdx.z)
        {
-            constexpr auto K1   = TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{});
+            constexpr auto K1   = TilePartitioner::BlockGemmShape::WarpTile::at(I2);
            const index_t K_t   = __builtin_amdgcn_readfirstlane(kargs.k_batch * K1);
            const index_t KRead = __builtin_amdgcn_readfirstlane((kargs.K + K_t - 1) / K_t * K1);

@@ -372,14 +376,75 @@ struct AQuantGemmKernel
            }
        }();

+        const auto get_padding_size = [](index_t length, index_t alignment) {
+            return ck_tile::integer_least_multiple(length, alignment) - length;
+        };
+
+        const auto& make_preshuffled_aq_tensor_view = [&]() {
+            const auto aq_x = kargs.M * GemmPipeline::KPerBlockAQ;
+            const auto aq_y = kargs.QK / GemmPipeline::KPerBlockAQ;
+
+            const auto aq_desc =
+                make_naive_tensor_descriptor(make_tuple(aq_y, aq_x),
+                                             make_tuple(aq_x, 1),
+                                             number<GemmPipeline::GetVectorSizeAQ()>{},
+                                             number<1>{});
+
+            const auto block_tile_size = GemmPipeline::MPerBlock * GemmPipeline::KPerBlockAQ;
+            const auto aq_pad0_desc    = transform_tensor_descriptor(
+                aq_desc,
+                make_tuple(make_pass_through_transform(aq_y),
+                           make_right_pad_transform(aq_x, get_padding_size(aq_x, block_tile_size))),
+                make_tuple(sequence<0>{}, sequence<1>{}),
+                make_tuple(sequence<0>{}, sequence<1>{}));
+
+            const auto pad_aq_x = aq_pad0_desc.get_lengths()[I1];
+            const auto wave_tile_size =
+                TilePartitioner::BlockGemmShape::WarpTile::at(I0) * GemmPipeline::KPerBlockAQ;
+            const auto wave_tile_count_x = ck_tile::integer_divide_ceil(pad_aq_x, wave_tile_size);
+            const auto aq_unmerge_pad0_desc = transform_tensor_descriptor(
+                aq_pad0_desc,
+                make_tuple(make_pass_through_transform(aq_y),
+                           make_unmerge_transform(make_tuple(wave_tile_count_x, wave_tile_size))),
+                make_tuple(sequence<0>{}, sequence<1>{}),
+                make_tuple(sequence<0>{}, sequence<1, 2>{}));
+
+            const auto aq_pad1_desc = transform_tensor_descriptor(
+                aq_unmerge_pad0_desc,
+                make_tuple(make_pass_through_transform(aq_y),
+                           make_pass_through_transform(wave_tile_count_x),
+                           make_right_pad_transform(
+                               wave_tile_size, get_padding_size(wave_tile_size, get_warp_size()))),
+                make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}),
+                make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}));
+
+            const auto pad_wave_size =
+                ck_tile::integer_least_multiple(wave_tile_size, get_warp_size());
+            const auto aq_merge_pad1_desc = transform_tensor_descriptor(
+                aq_pad1_desc,
+                make_tuple(make_merge_transform(make_tuple(wave_tile_count_x, aq_y)),
+                           make_pass_through_transform(pad_wave_size)),
+                make_tuple(sequence<1, 0>{}, sequence<2>{}),
+                make_tuple(sequence<0>{}, sequence<1>{}));
+
+            return make_tensor_view<address_space_enum::global>(aq_ptr, aq_merge_pad1_desc);
+        };
+
        const auto& aq_tensor_view = [&]() {
            static_assert(std::is_same_v<AQLayout, tensor_layout::gemm::RowMajor>);
-            return make_naive_tensor_view<address_space_enum::global>(
-                aq_ptr,
-                make_tuple(kargs.M, kargs.QK),
-                make_tuple(kargs.stride_AQ, 1),
-                number<GemmPipeline::GetVectorSizeAQ()>{},
-                number<1>{});
+            if constexpr(Preshuffle)
+            {
+                return make_preshuffled_aq_tensor_view();
+            }
+            else
+            {
+                return make_naive_tensor_view<address_space_enum::global>(
+                    aq_ptr,
+                    make_tuple(kargs.M, kargs.QK),
+                    make_tuple(kargs.stride_AQ, 1),
+                    number<GemmPipeline::GetVectorSizeAQ()>{},
+                    number<1>{});
+            }
        }();

        const auto& b_tensor_view = [&]() {
@@ -491,16 +556,7 @@ struct AQuantGemmKernel
            }
        }();

-        const auto& aq_pad_view = [&]() {
-            const auto& aq_tensor_view = views.at(I1);
-            static_assert(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>);
-            return pad_tensor_view(
-                aq_tensor_view,
-                make_tuple(number<TilePartitioner::MPerBlock>{},
-                           number<TilePartitioner::KPerBlock / GemmPipeline::QuantGroupSize>{}),
-                // TODO: Add support for padding.
-                sequence<false, false>{});
-        }();
+        const auto& aq_pad_view = [&]() { return views.at(I1); }();

        const auto& b_pad_view = [&]() {
            const auto& b_tensor_view = views.at(I2);
@@ -543,8 +599,10 @@ struct AQuantGemmKernel
    }

    template <typename PadView>
-    CK_TILE_DEVICE static auto
-    MakeGemmTileWindows(const PadView& views, const index_t i_m, const index_t i_n)
+    CK_TILE_DEVICE static auto MakeGemmTileWindows(const PadView& views,
+                                                   const AQuantGemmKernelArgs& kargs,
+                                                   const index_t i_m,
+                                                   const index_t i_n)
    {
        const auto& a_pad_view  = views.at(I0);
        const auto& aq_pad_view = views.at(I1);
@@ -570,11 +628,26 @@ struct AQuantGemmKernel

        const auto& aq_block_window = [&]() {
            static_assert(std::is_same_v<AQLayout, tensor_layout::gemm::RowMajor>);
-            return make_tile_window(
-                aq_pad_view,
-                make_tuple(number<TilePartitioner::MPerBlock>{},
-                           number<TilePartitioner::KPerBlock / GemmPipeline::QuantGroupSize>{}),
-                {i_m, 0});
+            if constexpr(Preshuffle)
+            {
+                constexpr auto tile_window_width = get_warp_size();
+                constexpr auto tile_window_height =
+                    TilePartitioner::MPerBlock / TilePartitioner::BlockGemmShape::WarpTile::at(I0);
+                auto block_m_idx = i_m / TilePartitioner::MPerBlock;
+                return make_tile_window(
+                    aq_pad_view,
+                    make_tuple(number<tile_window_height>{}, number<tile_window_width>{}),
+                    {block_m_idx * kargs.K / TilePartitioner::BlockGemmShape::BlockTile::at(I2),
+                     0});
+            }
+            else
+            {
+                return make_tile_window(
+                    aq_pad_view,
+                    make_tuple(number<TilePartitioner::MPerBlock>{},
+                               number<TilePartitioner::KPerBlock / GemmPipeline::QuantGroupSize>{}),
+                    {i_m, 0});
+            }
        }();

        const auto& b_block_window = [&]() {
@@ -633,7 +706,8 @@ struct AQuantGemmKernel
            a_ptr, b_ptr, aq_ptr, c_ptr, kargs, splitk_batch_offset);

        const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
-        auto gemm_tile_windows     = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);
+        auto gemm_tile_windows =
+            MakeGemmTileWindows(gemm_pad_views, kargs, block_idx_m, block_idx_n);

        const index_t num_loop = __builtin_amdgcn_readfirstlane(
            TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k));
--- a/include/ck_tile/ops/gemm_group_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_base.hpp
+++ b/include/ck_tile/ops/gemm_group_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_base.hpp
@@ -38,12 +38,9 @@ struct GemmAQuantPipelineAgBgCrImplBase : public GemmPipelineAgBgCrImplBase<Prob
    {
        static_assert(std::is_same_v<AQLayout, tensor_layout::gemm::RowMajor>);

-        using YPerTile = number<MPerBlock>;
-        using XPerTile = number<KPerBlockAQ>;
-
        auto aq_copy_dram_window =
            make_tile_window(aq_dram_block_window_tmp.get_bottom_tensor_view(),
-                             make_tuple(YPerTile(), XPerTile()),
+                             aq_dram_block_window_tmp.get_window_lengths(),
                             aq_dram_block_window_tmp.get_window_origin(),
                             Policy::template MakeAQDramTileDistribution<Problem>());
        return aq_copy_dram_window;
--- a/include/ck_tile/ops/gemm_group_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_policy.hpp
+++ b/include/ck_tile/ops/gemm_group_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_policy.hpp
@@ -42,6 +42,7 @@ struct GemmAQuantPipelineAgBgCrDefaultPolicy : public UniversalGemmPipelineAgBgC
        constexpr index_t KPerBlock   = Problem::BlockGemmShape::kK;
        constexpr index_t KPerBlockAQ = KPerBlock / Problem::kQuantGroupSize;
        constexpr index_t VecLoadSize = GetVectorSizeAQ<Problem>();
+        constexpr bool Preshuffle     = Problem::Traits::Preshuffle;
        using WarpTile                = typename Problem::BlockGemmShape::WarpTile;
        using WarpGemm                = WarpGemmMfmaDispatcher<typename Problem::ComputeDataType,
                                                               typename Problem::ComputeDataType,
@@ -52,14 +53,34 @@ struct GemmAQuantPipelineAgBgCrDefaultPolicy : public UniversalGemmPipelineAgBgC
                                                               false>;

        static_assert(std::is_same_v<AQLayout, tensor_layout::gemm::RowMajor>);
-        using TileEncodingPattern = TileDistributionEncodingPatternAQ<BlockGemmShape,
-                                                                      WarpGemm,
-                                                                      BlockSize,
-                                                                      MPerBlock,
-                                                                      KPerBlockAQ,
-                                                                      VecLoadSize>;
+        if constexpr(Preshuffle)
+        {
+            using TileEncodingPattern =
+                TileDistributionEncodingPatternAQ<BlockGemmShape,
+                                                  WarpGemm,
+                                                  BlockSize,
+                                                  MPerBlock / WarpGemm::kM,
+                                                  ck_tile::integer_least_multiple(
+                                                      WarpGemm::kM * KPerBlockAQ, get_warp_size()),
+                                                  KPerBlockAQ,
+                                                  VecLoadSize,
+                                                  Preshuffle>;

-        return TileEncodingPattern::Make2DStaticTileDistribution();
+            return TileEncodingPattern::Make2DStaticTileDistribution();
+        }
+        else
+        {
+            using TileEncodingPattern = TileDistributionEncodingPatternAQ<BlockGemmShape,
+                                                                          WarpGemm,
+                                                                          BlockSize,
+                                                                          MPerBlock,
+                                                                          KPerBlockAQ,
+                                                                          KPerBlockAQ,
+                                                                          VecLoadSize,
+                                                                          Preshuffle>;
+
+            return TileEncodingPattern::Make2DStaticTileDistribution();
+        }
    }

    template <typename Problem>
--- a/include/ck_tile/ops/gemm_group_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_v3.hpp
+++ b/include/ck_tile/ops/gemm_group_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_v3.hpp
@@ -7,7 +7,6 @@
 #include <sstream>

 #include "ck_tile/core.hpp"
-#include "ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
 #include "ck_tile/ops/gemm_group_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_base.hpp"
 #include "ck_tile/host/concat.hpp"
@@ -134,6 +133,7 @@ struct AQuantGemmPipelineAgBgCrCompV3 : public BaseAQuantGemmPipelineAgBgCrCompV
    static constexpr bool kPadK = Problem::kPadK;

    static constexpr bool DoubleSmemBuffer = Problem::DoubleSmemBuffer;
+    static constexpr bool Preshuffle       = Problem::Traits::Preshuffle;

    static constexpr bool HasHotLoop = Problem::HasHotLoop;
    static constexpr auto TailNum    = Problem::TailNum;
@@ -254,9 +254,6 @@ struct AQuantGemmPipelineAgBgCrCompV3 : public BaseAQuantGemmPipelineAgBgCrCompV
            constexpr bool is_b_row_major = std::is_same_v<BLayout, tensor_layout::gemm::RowMajor>;

            static_assert(!is_aq_col_major, "Aq must be row major (col major not supported yet)");
-            static_assert(MPerBlock == AQDramBlockWindowTmp{}.get_window_lengths()[I0{}] &&
-                              KPerBlockAQ == AQDramBlockWindowTmp{}.get_window_lengths()[I1{}],
-                          "Aq block window has incorrect lengths for defined AqLayout!");

            static_assert(is_a_col_major
                              ? (KPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[I0{}] &&
@@ -312,8 +309,11 @@ struct AQuantGemmPipelineAgBgCrCompV3 : public BaseAQuantGemmPipelineAgBgCrCompV
                is_a_col_major ? make_array(KPerBlock, 0) : make_array(0, KPerBlock);
            constexpr BDramTileWindowStep b_dram_tile_window_step =
                is_b_row_major ? make_array(KPerBlock, 0) : make_array(0, KPerBlock);
+
+            // AQ is row-major only; preshuffled AQ steps along dim 0, plain AQ along dim 1
            constexpr AQDramTileWindowStep aq_dram_tile_window_step =
-                is_aq_col_major ? make_array(KPerBlockAQ, 0) : make_array(0, KPerBlockAQ);
+                Preshuffle ? make_array(MPerBlock / BlockGemm::WarpGemm::kM, 0)
+                           : make_array(0, KPerBlockAQ);

            // DRAM prefetch (global read 0)
            Base::GlobalPrefetch(a_block_tile, a_copy_dram_window, a_dram_tile_window_step);
--- a/include/ck_tile/ops/gemm_group_quant/pipeline/gemm_group_quant_utils.hpp
+++ b/include/ck_tile/ops/gemm_group_quant/pipeline/gemm_group_quant_utils.hpp
@@ -50,10 +50,11 @@ template <typename BlockGemmShape,
          index_t BlockSize,
          index_t YPerTile,
          index_t XPerTile,
-          index_t VecSize>
+          index_t KPerBlockAQ,
+          index_t VecSize,
+          bool Preshuffle>
 struct TileDistributionEncodingPatternAQ : public TileDistributionEncodingPattern
 {
-    // TODO: make pattern where below condition does not need to hold - GGemmMultiDSplitk!
    static_assert(XPerTile % VecSize == 0, "XPerTile must be a multiple of VecSize!");
    static constexpr index_t warp_size = get_warp_size();
    static constexpr index_t num_warps = BlockSize / get_warp_size();
@@ -69,26 +70,46 @@ struct TileDistributionEncodingPatternAQ : public TileDistributionEncodingPatter
    // KWarps > 1 isn't supported
    static_assert(KWarps == 1);

-    // # of elements per thread
-    static constexpr index_t X = XPerTile;
-
-    static constexpr index_t Y0 = 1;
-    static constexpr index_t Y1 = MIterPerWarp ? MIterPerWarp : 1;
-    static constexpr index_t Y2 = MWarps;
-    static constexpr index_t Y3 = WarpGemm::kM;
-    static_assert(Y3 >= WarpGemm::kM, "Scales for all rows must be available within the warp.");
-    static_assert(Y0 * Y1 * Y2 * Y3 == YPerTile,
-                  "Y0, Y1, Y2, Y3 must cover the blocktile along Y.");
-
    CK_TILE_HOST_DEVICE static constexpr auto Make2DStaticTileDistribution()
    {
-        return make_static_tile_distribution(
-            tile_distribution_encoding<sequence<NWarps>,
-                                       tuple<sequence<Y0, Y1, Y2, Y3>, sequence<X>>,
-                                       tuple<sequence<1, 0>, sequence<1, 1>>,
-                                       tuple<sequence<2, 0>, sequence<0, 3>>,
-                                       sequence<1, 2>,
-                                       sequence<1, 0>>{});
+        if constexpr(Preshuffle)
+        {
+            // XPerTile = X0 * X1 * X2, with X1 * X2 == warp_size and X2 == KPerBlockAQ.
+            constexpr index_t X2 = KPerBlockAQ;
+            constexpr index_t X1 = warp_size / X2;
+            constexpr index_t X0 = XPerTile / warp_size;
+
+            constexpr index_t Y1 = MWarps;
+            constexpr index_t Y0 = YPerTile / Y1;
+            return make_static_tile_distribution(
+                tile_distribution_encoding<sequence<NWarps>,
+                                           tuple<sequence<Y0, Y1>, sequence<X0, X1, X2>>,
+                                           tuple<sequence<1, 0>, sequence<2, 2>>,
+                                           tuple<sequence<1, 0>, sequence<1, 2>>,
+                                           sequence<1, 2>,
+                                           sequence<0, 0>>{});
+        }
+        else
+        {
+            // # of elements per thread
+            constexpr index_t X = XPerTile;
+
+            constexpr index_t Y0 = 1;
+            constexpr index_t Y1 = MIterPerWarp ? MIterPerWarp : 1;
+            constexpr index_t Y2 = MWarps;
+            constexpr index_t Y3 = WarpGemm::kM;
+            static_assert(Y3 >= WarpGemm::kM,
+                          "Scales for all rows must be available within the warp.");
+            static_assert(Y0 * Y1 * Y2 * Y3 == YPerTile,
+                          "Y0, Y1, Y2, Y3 must cover the blocktile along Y.");
+            return make_static_tile_distribution(
+                tile_distribution_encoding<sequence<NWarps>,
+                                           tuple<sequence<Y0, Y1, Y2, Y3>, sequence<X>>,
+                                           tuple<sequence<1, 0>, sequence<1, 1>>,
+                                           tuple<sequence<2, 0>, sequence<0, 3>>,
+                                           sequence<1, 2>,
+                                           sequence<1, 0>>{});
+        }
    }
 };

--- a/include/ck_tile/ops/gemm_group_quant/pipeline/tile_gemm_aquant_traits.hpp
+++ b/include/ck_tile/ops/gemm_group_quant/pipeline/tile_gemm_aquant_traits.hpp
@@ -10,6 +10,7 @@ namespace ck_tile {
 template <bool kPadM_,
          bool kPadN_,
          bool kPadK_,
+          bool Preshuffle_,
          typename ALayout_,
          typename BLayout_,
          typename CLayout_,
@@ -29,6 +30,7 @@ struct TileGemmAQuantTraits

    static constexpr bool UseStructuredSparsity = false;
    static constexpr index_t NumWaveGroups      = 1;
+    static constexpr bool Preshuffle            = Preshuffle_;
 };

 } // namespace ck_tile
--- a/test/ck_tile/gemm_block_scale/test_run_gemm_aquant_example.inc
+++ b/test/ck_tile/gemm_block_scale/test_run_gemm_aquant_example.inc
@@ -24,7 +24,8 @@ template <typename ADataType,
          typename ALayout,
          typename BLayout,
          typename CLayout,
-          uint32_t QuantGroupSize>
+          uint32_t QuantGroupSize,
+          bool Preshuffle = false>
 float gemm_calc_aquant(const ck_tile::AQuantGemmHostArgs& args, const ck_tile::stream_config& s)
 {
    constexpr bool kPadM = false;
@@ -55,7 +56,7 @@ float gemm_calc_aquant(const ck_tile::AQuantGemmHostArgs& args, const ck_tile::s
    using TilePartitioner = ck_tile::GemmTile1DPartitioner<CodegenGemmShape>;

    using CodegenGemmTraits =
-        ck_tile::TileGemmAQuantTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;
+        ck_tile::TileGemmAQuantTraits<kPadM, kPadN, kPadK, Preshuffle, ALayout, BLayout, CLayout>;

    using GemmPipelineProblem = ck_tile::GemmPipelineProblemBase<ADataType,
                                                                 BDataType,
@@ -161,7 +162,8 @@ template <typename ADataType,
          typename AQLayout,
          typename BLayout,
          typename CLayout,
-          uint32_t QuantGroupSize>
+          uint32_t QuantGroupSize,
+          bool Preshuffle = false>
 float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
                  ck_tile::DeviceMem& aq_m_aqk_dev_buf,
                  ck_tile::DeviceMem& b_k_n_dev_buf,
@@ -202,7 +204,8 @@ float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
                                      ALayout,
                                      BLayout,
                                      CLayout,
-                                      QuantGroupSize>(
+                                      QuantGroupSize,
+                                      Preshuffle>(
        args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat});

    std::size_t flop     = std::size_t(2) * M * N * K;