Merge branch 'develop' into ck_tile/gemm_blockscale_abquant

2026-06-30 19:57:40 +00:00 · 2025-12-12 01:06:46 +08:00
parent c2bb4d261f 4dcc3e59c1
commit 2debb6ca08
179 changed files with 9598 additions and 2091 deletions
--- a/example/65_gemm_multiply_multiply/CMakeLists.txt
+++ b/example/65_gemm_multiply_multiply/CMakeLists.txt
@@ -77,3 +77,5 @@ example_compile_options(example_moe_gemm1_xdl_fp8_blockscale PRIVATE ${BLOCKSCAL
 add_example_executable(example_gemm_add_add_wmma_fp16 gemm_add_add_wmma_fp16.cpp)
 add_example_executable(example_gemm_multiply_multiply_wmma_fp16_bpreshuffle gemm_multiply_multiply_wmma_fp16_bpreshuffle.cpp)
 add_example_executable(example_gemm_multiply_multiply_wmma_fp8_bpreshuffle gemm_multiply_multiply_wmma_fp8_bpreshuffle.cpp)
+add_example_executable(example_gemm_multiply_multiply_wmma_fp8_ab_scale gemm_multiply_multiply_wmma_fp8_ab_scale.cpp)
+add_example_executable(example_gemm_multiply_multiply_wmma_fp8_blockscale_bpreshuffle gemm_multiply_multiply_wmma_fp8_blockscale_bpreshuffle.cpp)
--- a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_wmma_fp8_ab_scale.cpp
+++ b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_wmma_fp8_ab_scale.cpp
@@ -0,0 +1,345 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3_ab_scale.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
+
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/check_err.hpp"
+
+#include "ck/utility/blkgemmpipe_scheduler.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using BF16 = ck::bhalf_t;
+using FP8  = ck::f8_t;
+using F32  = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using A0DataType       = FP8;
+using A1DataType       = F32;
+using B0DataType       = FP8;
+using B1DataType       = F32;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using DsDataType       = ck::Tuple<>;
+using EDataType        = BF16;
+
+using A0Layout = Row;
+using B0Layout = Col;
+using DsLayout = ck::Tuple<>;
+using ELayout  = Row;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = PassThrough;
+// GEMM specialization; forwarded below as the GemmSpec template argument of DeviceOpInstance.
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default;
+// Scale granularity: main() sizes and indexes the A/B scale tensors by these block extents.
+static constexpr ck::index_t Scale_Block_M = 1;
+static constexpr ck::index_t Scale_Block_N = 128;
+static constexpr ck::index_t Scale_Block_K = 128;
+// Device op instantiated by main(). NOTE(review): positional numeric/Sequence arguments are presumably tuning parameters defined by the device op header - confirm there.
+using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultiD_ABScale_Wmma_CShuffle_V3
+    // clang-format off
+         <Row, Col, DsLayout, ELayout,
+          A0DataType, A1DataType, B0DataType, B1DataType, DsDataType, EDataType,
+          AccDataType, CShuffleDataType, 
+          AElementOp,  BElementOp, CDEElementOp, GemmSpec,
+          256, Scale_Block_M, Scale_Block_N, Scale_Block_K,
+          128, 128, 128,
+          16, 16,
+          16, 16,
+          4, 2,
+          S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
+          S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
+          1, 1,  S<1, 32, 1,  8>,  S<8>,
+          ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, FP8>;
+// clang-format on
+// Example driver: launch DeviceOpInstance, optionally verify against a host GEMM, print perf.
+int main(int argc, char* argv[])
+{
+    bool do_verification = true;
+    int init_method      = 1; // selects a case of switch(init_method) below
+    bool time_kernel     = false;
+    bool flush_cache     = true;
+
+    // GEMM shape
+    ck::index_t M = 128;
+    ck::index_t N = 1024;
+    ck::index_t K = 1024;
+
+    ck::index_t StrideA = K;
+    ck::index_t StrideB = K;
+    ck::index_t StrideE = N;
+
+    ck::index_t KBatch = 1;
+    // argc 1: defaults; argc 4: three flags; argc 8 or 9: flags, M N K, flush_cache, optional KBatch.
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+    }
+    else if(argc == 8 || argc == 9)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+
+        M = std::stoi(argv[4]);
+        N = std::stoi(argv[5]);
+        K = std::stoi(argv[6]);
+
+        flush_cache = std::stoi(argv[7]);
+
+        if(argc == 9)
+        {
+            KBatch = std::stoi(argv[8]);
+        }
+        // Strides are re-derived from the user-supplied shape (same formulas as the defaults).
+        StrideA = K;
+        StrideB = K;
+        StrideE = N;
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); // NOTE(review): the switch below also handles 3, 4, 5 and a default
+        printf("arg3: time kernel (0=no, 1=yes)\n");
+        printf("arg4 to 6: M, N, K\n");
+        printf("arg7: flush both I$ and L2$ (0=no, 1=yes)\n");
+        printf("arg8: KBatch (default: 1)\n");
+        exit(0); // NOTE(review): exits with status 0 although the argument count was rejected
+    }
+    // Leading strides of both scale tensors: ceil(K / Scale_Block_K).
+    ck::index_t Scale_Stride_AM = (K + Scale_Block_K - 1) / Scale_Block_K;
+    ck::index_t Scale_Stride_BN = (K + Scale_Block_K - 1) / Scale_Block_K;
+    // Builds a 2-D host descriptor: RowMajor -> strides {stride, 1}; any other layout -> {1, stride}.
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
+
+            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+            {
+                return ck::HostTensorDescriptor({row, col}, {stride, 1_uz});
+            }
+            else
+            {
+                return ck::HostTensorDescriptor({row, col}, {1_uz, stride});
+            }
+        };
+    // Host tensors; a1/b1 are the scale tensors, with extents ceil-divided by the Scale_Block_* sizes.
+    ck::Tensor<A0DataType> a0_m_k(f_host_tensor_descriptor(M, K, StrideA, A0Layout{}));
+    ck::Tensor<A1DataType> a1_m_k(f_host_tensor_descriptor((M + Scale_Block_M - 1) / Scale_Block_M,
+                                                           (K + Scale_Block_K - 1) / Scale_Block_K,
+                                                           Scale_Stride_AM,
+                                                           A0Layout{}));
+    ck::Tensor<B0DataType> b0_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{}));
+    ck::Tensor<B1DataType> b1_k_n(f_host_tensor_descriptor((K + Scale_Block_K - 1) / Scale_Block_K,
+                                                           (N + Scale_Block_N - 1) / Scale_Block_N,
+                                                           Scale_Stride_BN,
+                                                           B0Layout{}));
+    ck::Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+    ck::Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+
+    std::cout << "a0_m_k: " << a0_m_k.mDesc << std::endl;
+    std::cout << "a1_m_k: " << a1_m_k.mDesc << std::endl;
+    std::cout << "b0_k_n: " << b0_k_n.mDesc << std::endl;
+    std::cout << "b1_k_n: " << b1_k_n.mDesc << std::endl;
+    std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
+    // Fill the host inputs according to init_method; case 0 leaves the tensors as constructed.
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        a0_m_k.GenerateTensorValue(GeneratorTensor_2<A0DataType>{-2, 2});
+        b0_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-2, 2});
+        a1_m_k.GenerateTensorValue(GeneratorTensor_2<A1DataType>{-1, 1});
+        b1_k_n.GenerateTensorValue(GeneratorTensor_2<B1DataType>{-1, 1});
+        break;
+    case 2:
+        a0_m_k.GenerateTensorValue(GeneratorTensor_1<A0DataType>{});
+        b0_k_n.GenerateTensorValue(GeneratorTensor_1<B0DataType>{});
+        a1_m_k.GenerateTensorValue(GeneratorTensor_1<A1DataType>{});
+        b1_k_n.GenerateTensorValue(GeneratorTensor_1<B1DataType>{});
+        break;
+    case 3:
+        a0_m_k.GenerateTensorValue(GeneratorTensor_2<A0DataType>{-2, 2});
+        b0_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-2, 2});
+        a1_m_k.GenerateTensorValue(GeneratorTensor_1<A1DataType>{});
+        b1_k_n.GenerateTensorValue(GeneratorTensor_1<B1DataType>{});
+        break;
+    case 4:
+        a0_m_k.GenerateTensorValue(GeneratorTensor_2<A0DataType>{-2, 2});
+        b0_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-2, 2});
+        a1_m_k.GenerateTensorValue(GeneratorTensor_3<A1DataType>{0, 1.0});
+        b1_k_n.GenerateTensorValue(GeneratorTensor_1<B1DataType>{});
+        break;
+    case 5:
+        a0_m_k.GenerateTensorValue(GeneratorTensor_2<A0DataType>{-2, 2});
+        b0_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-2, 2});
+        a1_m_k.GenerateTensorValue(GeneratorTensor_1<A1DataType>{});
+        b1_k_n.GenerateTensorValue(GeneratorTensor_3<B1DataType>{0, 1.0});
+        break;
+    default:
+        a0_m_k.GenerateTensorValue(GeneratorTensor_3<A0DataType>{-0.5, 0.5});
+        b0_k_n.GenerateTensorValue(GeneratorTensor_3<B0DataType>{-0.5, 0.5});
+        a1_m_k.GenerateTensorValue(GeneratorTensor_3<A1DataType>{0, 1.0});
+        b1_k_n.GenerateTensorValue(GeneratorTensor_3<B1DataType>{0, 1.0});
+    }
+    // Device buffers, sized in bytes from each descriptor's element space size.
+    ck::DeviceMem a0_device_buf(sizeof(A0DataType) * a0_m_k.mDesc.GetElementSpaceSize());
+    ck::DeviceMem a1_device_buf(sizeof(A1DataType) * a1_m_k.mDesc.GetElementSpaceSize());
+    ck::DeviceMem b0_device_buf(sizeof(B0DataType) * b0_k_n.mDesc.GetElementSpaceSize());
+    ck::DeviceMem b1_device_buf(sizeof(B1DataType) * b1_k_n.mDesc.GetElementSpaceSize());
+    ck::DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
+    // Upload the four inputs; e_device_buf is only read back (FromDevice) during verification.
+    a0_device_buf.ToDevice(a0_m_k.mData.data());
+    a1_device_buf.ToDevice(a1_m_k.mData.data());
+    b0_device_buf.ToDevice(b0_k_n.mData.data());
+    b1_device_buf.ToDevice(b1_k_n.mData.data());
+
+    auto a_element_op   = AElementOp{};
+    auto b_element_op   = BElementOp{};
+    auto cde_element_op = CDEElementOp{};
+
+    constexpr ck::index_t NumDTensor = DsDataType::Size(); // DsDataType is the empty ck::Tuple<>
+
+    // do GEMM
+    auto device_op      = DeviceOpInstance{};
+    std::string op_name = device_op.GetTypeString();
+    auto invoker        = device_op.MakeInvoker();
+    auto argument =
+        device_op.MakeArgument(static_cast<A0DataType*>(a0_device_buf.GetDeviceBuffer()),
+                               static_cast<B0DataType*>(b0_device_buf.GetDeviceBuffer()),
+                               std::array<const void*, NumDTensor>{},
+                               static_cast<EDataType*>(e_device_buf.GetDeviceBuffer()),
+                               M,
+                               N,
+                               K,
+                               StrideA,
+                               StrideB,
+                               std::array<ck::index_t, NumDTensor>{},
+                               StrideE,
+                               static_cast<const A1DataType*>(a1_device_buf.GetDeviceBuffer()),
+                               static_cast<const B1DataType*>(b1_device_buf.GetDeviceBuffer()),
+                               a_element_op,
+                               b_element_op,
+                               cde_element_op,
+                               KBatch);
+    // Abort before launching if this compiled instance reports the problem as unsupported.
+    if(!device_op.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_gemm with the specified compilation parameters does "
+            "not support this GEMM problem");
+    }
+    // Perf model: 2*M*N*K flops; the byte count covers A, B and E only (scale tensors excluded).
+    std::size_t flop = std::size_t(2) * M * N * K;
+    std::size_t num_btype =
+        sizeof(A0DataType) * M * K + sizeof(B0DataType) * K * N + sizeof(EDataType) * M * N;
+
+    float ave_time = .0;
+    // First launch passes false where the later launches pass time_kernel; its output is verified below.
+    ave_time = invoker.Run(argument, StreamConfig{nullptr, false, 0, 50, 100});
+
+    int pass = 0; // process exit code: stays 0 unless verification runs and fails
+
+    if(do_verification)
+    {
+        ck::Tensor<AccDataType> c_m_n({M, N});
+        ck::Tensor<float> a_m_k({M, K});
+        ck::Tensor<float> b_k_n({K, N});
+        // Apply the scales on the host: convert each element to float and multiply by its block's scale.
+        for(int m = 0; m < M; m++)
+        {
+            for(int k = 0; k < K; k++)
+            {
+                a_m_k(m, k) = ck::type_convert<float>(a0_m_k(m, k)) *
+                              a1_m_k(m / Scale_Block_M, k / Scale_Block_K);
+            }
+        }
+
+        for(int n = 0; n < N; n++)
+        {
+            for(int k = 0; k < K; k++)
+            {
+                b_k_n(k, n) = ck::type_convert<float>(b0_k_n(k, n)) *
+                              b1_k_n(k / Scale_Block_K, n / Scale_Block_N);
+            }
+        }
+        // Host reference GEMM on the scaled float operands.
+        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<float,
+                                                                                float,
+                                                                                CShuffleDataType,
+                                                                                AccDataType,
+                                                                                PassThrough,
+                                                                                PassThrough,
+                                                                                PassThrough>;
+        auto ref_gemm               = ReferenceGemmInstance{};
+        auto ref_invoker            = ref_gemm.MakeInvoker();
+
+        auto ref_argument =
+            ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, PassThrough{}, PassThrough{}, PassThrough{});
+
+        ref_invoker.Run(ref_argument);
+        // Convert the reference result to EDataType, the element type of the device output tensor.
+        for(int m = 0; m < M; ++m)
+        {
+            for(int n = 0; n < N; ++n)
+            {
+                e_m_n_host_result(m, n) = ck::type_convert<EDataType>(c_m_n(m, n));
+            }
+        }
+
+        e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+        // Map the check_err result to the exit code: true -> 0, false -> 1.
+        pass = ck::utils::check_err(
+                   e_m_n_device_result, e_m_n_host_result, "Error: Incorrect results!", 5e-2, 5e-2)
+                   ? 0
+                   : 1;
+    }
+    // Benchmark launch, honouring time_kernel. NOTE(review): the extra StreamConfig fields (true, rotating_buf) presumably enable cache flushing - confirm.
+    if(flush_cache)
+    {
+        int rotating_buf = (512 * 1024 * 1024 + num_btype - 1) / num_btype; // ceil(512 MiB / num_btype)
+
+        ave_time = invoker.Run(argument,
+                               StreamConfig{nullptr, time_kernel, 0, 50, 100, true, rotating_buf});
+    }
+    else
+    {
+        ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel, 0, 50, 100});
+    }
+
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time; // ave_time is reported in ms below
+    // NOTE(review): if Run returns 0 when time_kernel is false, both perf figures divide by zero - confirm.
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << op_name << ", KBatch " << KBatch << std::endl;
+
+    return pass;
+}
--- a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_wmma_fp8_blockscale_bpreshuffle.cpp
+++ b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_wmma_fp8_blockscale_bpreshuffle.cpp
@@ -0,0 +1,357 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle_v3_blockscale_bpreshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
+
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/check_err.hpp"
+
+#include "ck/utility/blkgemmpipe_scheduler.hpp"
+
+#include "common.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using BF16 = ck::bhalf_t;
+using FP8  = ck::f8_t;
+using F32  = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using A0DataType       = FP8;
+using A1DataType       = F32;
+using B0DataType       = FP8;
+using B1DataType       = F32;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using DsDataType       = ck::Tuple<>;
+using EDataType        = BF16;
+
+using A0Layout = Row;
+using A1Layout = Col;
+using B0Layout = Col;
+using D0Layout = Row;
+using D1Layout = Col;
+using DsLayout = ck::Tuple<>;
+using ELayout  = Row;
+
+static constexpr int KPack = 16;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = PassThrough;
+// GEMM specialization; forwarded below as the GemmSpec template argument of DeviceOpInstance.
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default;
+// Scale granularity: main() sizes and indexes the A/B scale tensors by these block extents.
+static constexpr ck::index_t Scale_Block_M = 1;
+static constexpr ck::index_t Scale_Block_N = 128;
+static constexpr ck::index_t Scale_Block_K = 128;
+// Device op instantiated by main(). NOTE(review): positional numeric/Sequence arguments are presumably tuning parameters defined by the device op header - confirm there.
+using DeviceOpInstance =
+    ck::tensor_operation::device::DeviceGemmMultiD_BlockScale_Wmma_CShuffle_V3_BPreshuffle
+    // clang-format off
+         <Row, Col, DsLayout, ELayout,
+          A0DataType, A1DataType, B0DataType, B1DataType, DsDataType, EDataType, AccDataType, CShuffleDataType, 
+          AElementOp,  BElementOp, CDEElementOp, GemmSpec,
+          256, Scale_Block_M, Scale_Block_N, Scale_Block_K,
+          128, 128, 128,
+          16, 16,
+          16, 16,
+          4, 2,
+          S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>,
+          2, 16, 16, 0,
+          S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>,
+          2, 16, 16, 0,
+          1, 1,
+          S<1, 32, 1, 8>,  S<8>,
+          ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, FP8>;
+// clang-format on
+// Example driver: preshuffle B on the host, launch DeviceOpInstance, print perf, optionally verify.
+int main(int argc, char* argv[])
+{
+    bool do_verification = true;
+    int init_method      = 1; // selects a case of switch(init_method) below
+    bool time_kernel     = false;
+    bool flush_cache     = true;
+
+    // GEMM shape
+    ck::index_t M = 128;
+    ck::index_t N = 1024;
+    ck::index_t K = 1024;
+
+    ck::index_t StrideA = K;
+    ck::index_t StrideB = K;
+    ck::index_t StrideE = N;
+
+    ck::index_t KBatch = 1;
+    // argc 1: defaults; argc 4: three flags; argc 8 or 9: flags, M N K, flush_cache, optional KBatch.
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+    }
+    else if(argc == 8 || argc == 9)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+
+        M = std::stoi(argv[4]);
+        N = std::stoi(argv[5]);
+        K = std::stoi(argv[6]);
+
+        flush_cache = std::stoi(argv[7]);
+
+        if(argc == 9)
+        {
+            KBatch = std::stoi(argv[8]);
+        }
+        // Strides are re-derived from the user-supplied shape (same formulas as the defaults).
+        StrideA = K;
+        StrideB = K;
+        StrideE = N;
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); // NOTE(review): the switch below also handles 3, 4, 5 and a default
+        printf("arg3: time kernel (0=no, 1=yes)\n");
+        printf("arg4 to 6: M, N, K\n");
+        printf("arg7: flush both I$ and L2$ (0=no, 1=yes)\n");
+        printf("arg8: KBatch (default: 1)\n");
+        exit(0); // NOTE(review): exits with status 0 although the argument count was rejected
+    }
+
+    // Transpose the AScale tensor for better performance
+    ck::index_t Scale_Stride_AK = (M + Scale_Block_M - 1) / Scale_Block_M; // ceil(M / Scale_Block_M)
+    ck::index_t Scale_Stride_BN = (K + Scale_Block_K - 1) / Scale_Block_K; // ceil(K / Scale_Block_K)
+    // Builds a 2-D host descriptor: RowMajor -> strides {stride, 1}; any other layout -> {1, stride}.
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
+
+            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+            {
+                return ck::HostTensorDescriptor({row, col}, {stride, 1_uz});
+            }
+            else
+            {
+                return ck::HostTensorDescriptor({row, col}, {1_uz, stride});
+            }
+        };
+    // Host tensors; a1 (built with A1Layout) and b1 are the scale tensors, extents ceil-divided by Scale_Block_*.
+    ck::Tensor<A0DataType> a0_m_k(f_host_tensor_descriptor(M, K, StrideA, A0Layout{}));
+    ck::Tensor<A1DataType> a1_m_k(f_host_tensor_descriptor((M + Scale_Block_M - 1) / Scale_Block_M,
+                                                           (K + Scale_Block_K - 1) / Scale_Block_K,
+                                                           Scale_Stride_AK,
+                                                           A1Layout{}));
+    ck::Tensor<B0DataType> b0_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{}));
+    ck::Tensor<B0DataType> b0_preshuffled(
+        f_host_tensor_descriptor(K, N, StrideB, B0Layout{})); // use layout only for size
+    ck::Tensor<B1DataType> b1_k_n(f_host_tensor_descriptor((K + Scale_Block_K - 1) / Scale_Block_K,
+                                                           (N + Scale_Block_N - 1) / Scale_Block_N,
+                                                           Scale_Stride_BN,
+                                                           B0Layout{}));
+    ck::Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+    ck::Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
+
+    std::cout << "a0_m_k: " << a0_m_k.mDesc << std::endl;
+    std::cout << "a1_m_k: " << a1_m_k.mDesc << std::endl;
+    std::cout << "b0_k_n: " << b0_k_n.mDesc << std::endl;
+    std::cout << "b1_k_n: " << b1_k_n.mDesc << std::endl;
+    std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
+    // Fill the host inputs according to init_method; case 0 leaves the tensors as constructed.
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        a0_m_k.GenerateTensorValue(GeneratorTensor_2<A0DataType>{-2, 2});
+        b0_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-2, 2});
+        a1_m_k.GenerateTensorValue(GeneratorTensor_3<A1DataType>{0, 1.0});
+        b1_k_n.GenerateTensorValue(GeneratorTensor_3<B1DataType>{0, 1.0});
+        break;
+    case 2:
+        a0_m_k.GenerateTensorValue(GeneratorTensor_1<A0DataType>{});
+        b0_k_n.GenerateTensorValue(GeneratorTensor_1<B0DataType>{});
+        a1_m_k.GenerateTensorValue(GeneratorTensor_1<A1DataType>{});
+        b1_k_n.GenerateTensorValue(GeneratorTensor_1<B1DataType>{});
+        break;
+    case 3:
+        a0_m_k.GenerateTensorValue(GeneratorTensor_2<A0DataType>{-2, 2});
+        b0_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-2, 2});
+        a1_m_k.GenerateTensorValue(GeneratorTensor_1<A1DataType>{});
+        b1_k_n.GenerateTensorValue(GeneratorTensor_1<B1DataType>{});
+        break;
+    case 4:
+        a0_m_k.GenerateTensorValue(GeneratorTensor_2<A0DataType>{-2, 2});
+        b0_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-2, 2});
+        a1_m_k.GenerateTensorValue(GeneratorTensor_3<A1DataType>{0, 1.0});
+        b1_k_n.GenerateTensorValue(GeneratorTensor_1<B1DataType>{});
+        break;
+    case 5:
+        a0_m_k.GenerateTensorValue(GeneratorTensor_2<A0DataType>{-2, 2});
+        b0_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-2, 2});
+        a1_m_k.GenerateTensorValue(GeneratorTensor_1<A1DataType>{});
+        b1_k_n.GenerateTensorValue(GeneratorTensor_3<B1DataType>{0, 1.0});
+        break;
+    default:
+        a0_m_k.GenerateTensorValue(GeneratorTensor_3<A0DataType>{-0.5, 0.5});
+        b0_k_n.GenerateTensorValue(GeneratorTensor_3<B0DataType>{-0.5, 0.5});
+        a1_m_k.GenerateTensorValue(GeneratorTensor_3<A1DataType>{0, 1.0});
+        b1_k_n.GenerateTensorValue(GeneratorTensor_3<B1DataType>{0, 1.0});
+    }
+    // Device buffers, sized in bytes from each descriptor's element space size.
+    ck::DeviceMem a0_device_buf(sizeof(A0DataType) * a0_m_k.mDesc.GetElementSpaceSize());
+    ck::DeviceMem a1_device_buf(sizeof(A1DataType) * a1_m_k.mDesc.GetElementSpaceSize());
+    ck::DeviceMem b0_device_buf(sizeof(B0DataType) * b0_k_n.mDesc.GetElementSpaceSize());
+    ck::DeviceMem b1_device_buf(sizeof(B1DataType) * b1_k_n.mDesc.GetElementSpaceSize());
+    ck::DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
+    // Upload A, the A scales and the B scales now; B is uploaded further down, after preshuffling.
+    a0_device_buf.ToDevice(a0_m_k.mData.data());
+    a1_device_buf.ToDevice(a1_m_k.mData.data());
+    b1_device_buf.ToDevice(b1_k_n.mData.data());
+
+    auto a_element_op   = AElementOp{};
+    auto b_element_op   = BElementOp{};
+    auto cde_element_op = CDEElementOp{};
+
+    constexpr ck::index_t NumDTensor = DsDataType::Size(); // DsDataType is the empty ck::Tuple<>
+
+    // do GEMM
+    auto device_op      = DeviceOpInstance{};
+    std::string op_name = device_op.GetTypeString();
+    int NPerWmma        = device_op.GetPreShuffleParameters();
+    // Host-side rearrangement of b0_k_n into b0_preshuffled; only the shuffled copy goes to the device.
+    preShuffleBuffer<KPack>(b0_k_n.mData.data(), b0_preshuffled.mData.data(), N, K, NPerWmma);
+
+    b0_device_buf.ToDevice(b0_preshuffled.mData.data());
+    auto invoker  = device_op.MakeInvoker();
+    auto argument = device_op.MakeArgument(a0_device_buf.GetDeviceBuffer(),
+                                           b0_device_buf.GetDeviceBuffer(),
+                                           std::array<const void*, NumDTensor>{},
+                                           e_device_buf.GetDeviceBuffer(),
+                                           M,
+                                           N,
+                                           K,
+                                           StrideA,
+                                           StrideB,
+                                           std::array<ck::index_t, NumDTensor>{},
+                                           StrideE,
+                                           a1_device_buf.GetDeviceBuffer(),
+                                           b1_device_buf.GetDeviceBuffer(),
+                                           a_element_op,
+                                           b_element_op,
+                                           cde_element_op,
+                                           KBatch);
+    // Abort before launching if this compiled instance reports the problem as unsupported.
+    if(!device_op.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_gemm with the specified compilation parameters does "
+            "not support this GEMM problem");
+    }
+    // Perf model: 2*M*N*K flops; the byte count covers A, B and E only (scale tensors excluded).
+    std::size_t flop = std::size_t(2) * M * N * K;
+    std::size_t num_btype =
+        sizeof(A0DataType) * M * K + sizeof(B0DataType) * K * N + sizeof(EDataType) * M * N;
+
+    float ave_time = 0.0f;
+    // Only launch in this program, honouring time_kernel. NOTE(review): the extra StreamConfig fields (true, rotating_buf) presumably enable cache flushing - confirm.
+    if(flush_cache)
+    {
+        int rotating_buf = (512 * 1024 * 1024 + num_btype - 1) / num_btype; // ceil(512 MiB / num_btype)
+
+        ave_time = invoker.Run(argument,
+                               StreamConfig{nullptr, time_kernel, 0, 50, 100, true, rotating_buf});
+    }
+    else
+    {
+        ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel, 0, 50, 100});
+    }
+
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time; // ave_time is reported in ms below
+    // NOTE(review): if Run returns 0 when time_kernel is false, both perf figures divide by zero - confirm.
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << op_name << ", KBatch " << KBatch << std::endl;
+    // Verification reads e_device_buf as left by the launch above; there is no separate untimed run.
+    if(do_verification)
+    {
+        ck::Tensor<AccDataType> c_m_n({M, N});
+        ck::Tensor<float> a_m_k({M, K});
+        ck::Tensor<float> b_k_n({K, N});
+        // Apply the scales on the host; the reference reads the original b0_k_n, not b0_preshuffled.
+        for(int m = 0; m < M; m++)
+        {
+            for(int k = 0; k < K; k++)
+            {
+                a_m_k(m, k) = ck::type_convert<float>(a0_m_k(m, k)) *
+                              a1_m_k(m / Scale_Block_M, k / Scale_Block_K);
+            }
+        }
+
+        for(int n = 0; n < N; n++)
+        {
+            for(int k = 0; k < K; k++)
+            {
+                b_k_n(k, n) = ck::type_convert<float>(b0_k_n(k, n)) *
+                              b1_k_n(k / Scale_Block_K, n / Scale_Block_N);
+            }
+        }
+        // Host reference GEMM on the scaled float operands.
+        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<float,
+                                                                                float,
+                                                                                CShuffleDataType,
+                                                                                AccDataType,
+                                                                                PassThrough,
+                                                                                PassThrough,
+                                                                                PassThrough>;
+        auto ref_gemm               = ReferenceGemmInstance{};
+        auto ref_invoker            = ref_gemm.MakeInvoker();
+
+        auto ref_argument =
+            ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, PassThrough{}, PassThrough{}, PassThrough{});
+
+        ref_invoker.Run(ref_argument);
+        // Convert the reference result to EDataType, the element type of the device output tensor.
+        for(int m = 0; m < M; ++m)
+        {
+            for(int n = 0; n < N; ++n)
+            {
+                e_m_n_host_result(m, n) = ck::type_convert<EDataType>(c_m_n(m, n));
+            }
+        }
+
+        e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+        // Map the check_err result to the exit code: true -> 0, false -> 1.
+        return ck::utils::check_err(
+                   e_m_n_device_result, e_m_n_host_result, "Error: Incorrect results!", 5e-2, 5e-2)
+                   ? 0
+                   : 1;
+    }
+
+    return 0; // verification disabled
+}
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
@@ -770,7 +770,7 @@ def create_kernel(

 class CompatibilityRuleFactory:
    @staticmethod
-    def get_rules() -> list[CompatibilityRule]:
+    def get_rules() -> List[CompatibilityRule]:
        # in group mode, spad/skpad must be true, since we can't predict if seqlen of current batch need pad or not
        def check_mode(problem_ctx: ProblemContext, kernel_ctx: KernelContext) -> bool:
            if problem_ctx.mode == "group":
@@ -812,7 +812,7 @@ class CompatibilityRuleFactoryGfx9(CompatibilityRuleFactory):
    _AVAILABLE_PIPELINES = frozenset({"qr", "qr_async", "qs"})

    @classmethod
-    def get_rules(cls) -> list[CompatibilityRule]:
+    def get_rules(cls) -> List[CompatibilityRule]:
        rules = CompatibilityRuleFactory.get_rules()

        def check_hdim_tile(
@@ -846,7 +846,7 @@ class CompatibilityRuleFactoryGfx950(CompatibilityRuleFactoryGfx9):
    )

    @classmethod
-    def get_rules(cls) -> list[CompatibilityRule]:
+    def get_rules(cls) -> List[CompatibilityRule]:
        rules = CompatibilityRuleFactoryGfx9.get_rules()

        def check_tile_pipeline(
--- a/example/ck_tile/38_block_scale_gemm/CMakeLists.txt
+++ b/example/ck_tile/38_block_scale_gemm/CMakeLists.txt
@@ -17,6 +17,7 @@ if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx12")
        gemm_aquant_quantgrouped_preshufflequant.cpp
        gemm_bquant_quantgrouped_bf8i4.cpp
        gemm_bquant_quantgrouped_fp8i4.cpp
+        gemm_bquant_quantgrouped_bf16mxfp4.cpp
        gemm_bquant_quantgrouped_bf8.cpp
        gemm_bquant_quantgrouped_fp8.cpp
        gemm_bquant_quantgrouped_preshuffleb.cpp
--- a/example/ck_tile/38_block_scale_gemm/README.md
+++ b/example/ck_tile/38_block_scale_gemm/README.md
@@ -23,7 +23,7 @@ This folder contains examples of quant GEMMs using the ck_tile tile-programming
 - **Preshuffled GEMM**: Shuffle the GEMM of B (weight) matrix in the warp layout and bypass the shared memory to do the GEMM calculation. Best performance solution for GEMM.
 - **TransposeC**: Transpose the C Matrix Output layout to have the best coalesced scale reading
 - **Preshuffled Quant**: Preshuffle the input matrix to load multiple Quant warp blocks along the selected dimension.
- **Precision**: Supports fp16, bf16, fp8, bf8, int4 (for B Matrix).
+- **Precision**: Supports fp16, bf16, fp8, bf8, int4 (for B Matrix), and uint8 (for B Matrix; split into two fp4 values in the pipeline).
 - **Validation**: CPU/GPU validation and error tolerance options.

 ## build
@@ -53,7 +53,7 @@ args:
        -stride_b    Tensor B stride (default:0)
        -stride_c    Tensor C stride (default:0)
               -v    0: No validation, 1: Validation on CPU, 2: Validation on GPU (default:1)
-            -prec    Data type. For AQuant: fp8, bf8, i4fp8, or i4bf8;  for Bquant: fp8, bf8, fp8i4, or bf8i4 (default for both AQuant and Bquant: fp8)
+            -prec    Data type. For AQuant: fp8, bf8, i4fp8, or i4bf8;  for Bquant: fp8, bf8, fp8i4, bf8i4, or bf16fp4 (default for both AQuant and Bquant: fp8)
          -warmup    Number of iterations before benchmarking the kernel (default:50)
          -repeat    Number of iterations to benchmark the kernel (default:1000)
           -timer    gpu:gpu timer, cpu:cpu timer (default:gpu)
--- a/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_bf16mxfp4.cpp
+++ b/example/ck_tile/38_block_scale_gemm/gemm_bquant_quantgrouped_bf16mxfp4.cpp
@@ -0,0 +1,41 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include "run_gemm_quant_example.inc"
+
+template <typename T>
+using GemmConfig = GemmConfigQuantPrefill<T>;
+
+#define RUN_GEMM_EXAMPLE_PREC_TYPE                                \
+    run_gemm_example_prec_type<GemmConfig<ck_tile::pk_fp4_raw_t>, \
+                               TypeConfig,                        \
+                               QuantGroupSize,                    \
+                               ck_tile::QuantType::BQuantGrouped>(arg_parser);
+
+void bquant_quantgrouped_bf16fp4_instance_factory(
+    std::unordered_map<size_t, std::function<int(const ck_tile::ArgParser&)>>& lut)
+{
+    using TypeConfig = decltype(GemmQuantTypeConfig<ck_tile::bf16_t,
+                                                    ck_tile::pk_fp4_raw_t,
+                                                    ck_tile::bf16_t,
+                                                    ck_tile::pk_fp4_raw_t>{});
+
+    lut[hash_multiple_strings(
+        {"bf16fp4", "bquant", "non-preshuffleb", "non-preshufflequant", "1x1x32"})] =
+        [](const ck_tile::ArgParser& arg_parser) {
+            using QuantGroupSize = ck_tile::QuantGroupShape<ck_tile::sequence<1, 1, 32>>;
+            return RUN_GEMM_EXAMPLE_PREC_TYPE;
+        };
+    lut[hash_multiple_strings(
+        {"bf16fp4", "bquant", "non-preshuffleb", "non-preshufflequant", "1x1x64"})] =
+        [](const ck_tile::ArgParser& arg_parser) {
+            using QuantGroupSize = ck_tile::QuantGroupShape<ck_tile::sequence<1, 1, 64>>;
+            return RUN_GEMM_EXAMPLE_PREC_TYPE;
+        };
+    lut[hash_multiple_strings(
+        {"bf16fp4", "bquant", "non-preshuffleb", "non-preshufflequant", "1x1x128"})] =
+        [](const ck_tile::ArgParser& arg_parser) {
+            using QuantGroupSize = ck_tile::QuantGroupShape<ck_tile::sequence<1, 1, 128>>;
+            return RUN_GEMM_EXAMPLE_PREC_TYPE;
+        };
+}
--- a/example/ck_tile/38_block_scale_gemm/gemm_quant.cpp
+++ b/example/ck_tile/38_block_scale_gemm/gemm_quant.cpp
@@ -32,7 +32,7 @@ auto create_args(int argc, char* argv[])
        .insert("prec",
                "fp8",
                "Data type. For AQuant: fp8, bf8, i4fp8, or i4bf8;  for Bquant: fp8, bf8, fp8i4, "
-                "or bf8i4;  for ABQuant: fp8, bf8")
+                "bf8i4 or bf16fp4;  for ABQuant: fp8, bf8")
        .insert("warmup", "50", "Number of iterations before benchmarking the kernel")
        .insert("repeat", "1000", "Number of iterations to benchmark the kernel")
        .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
@@ -109,6 +109,8 @@ void bquant_quantgrouped_fp8i4_instance_factory(
    std::unordered_map<size_t, std::function<int(const ck_tile::ArgParser&)>>& lut);
 void bquant_quantgrouped_bf8i4_instance_factory(
    std::unordered_map<size_t, std::function<int(const ck_tile::ArgParser&)>>& lut);
+void bquant_quantgrouped_bf16fp4_instance_factory(
+    std::unordered_map<size_t, std::function<int(const ck_tile::ArgParser&)>>& lut);
 void bquant_quantgrouped_preshuffleb_instance_factory(
    std::unordered_map<size_t, std::function<int(const ck_tile::ArgParser&)>>& lut);
 void bquant_quantgrouped_preshufflequant_instance_factory(
@@ -141,6 +143,7 @@ int main(int argc, char* argv[])
    bquant_quantgrouped_bf8_instance_factory(lut);
    bquant_quantgrouped_fp8i4_instance_factory(lut);
    bquant_quantgrouped_bf8i4_instance_factory(lut);
+    bquant_quantgrouped_bf16fp4_instance_factory(lut);
    bquant_quantgrouped_preshuffleb_instance_factory(lut);
    bquant_quantgrouped_preshufflequant_instance_factory(lut);
    bquant_quantgrouped_preshuffleb_preshufflequant_instance_factory(lut);
--- a/example/ck_tile/38_block_scale_gemm/gemm_utils.hpp
+++ b/example/ck_tile/38_block_scale_gemm/gemm_utils.hpp
@@ -69,8 +69,10 @@ auto calculate_rtol_atol(const ck_tile::index_t K,
                         const ck_tile::index_t kbatch,
                         const float max_accumulated_value)
 {
-    using ComputeType =
-        std::conditional_t<sizeof(ADataType) < sizeof(BDataType), ADataType, BDataType>;
+    using ComputeType = std::conditional_t<
+        std::is_same_v<BDataType, ck_tile::pk_fp4_raw_t>,
+        ADataType,
+        std::conditional_t<sizeof(ADataType) < sizeof(BDataType), ADataType, BDataType>>;
    // Calculate thresholds
    const auto rtol = ck_tile::get_relative_threshold<ComputeType, CDataType, AccDataType>(
        ck_tile::integer_divide_ceil(K, kbatch));
--- a/example/ck_tile/38_block_scale_gemm/run_gemm_quant_example.inc
+++ b/example/ck_tile/38_block_scale_gemm/run_gemm_quant_example.inc
@@ -145,21 +145,25 @@ float gemm_calc_quant(const ck_tile::QuantGemmHostArgs& args, const ck_tile::str
                                                        has_hot_loop_v,
                                                        tail_number_v>>>>;

-        using GemmPipeline = std::conditional_t<
-            QuantMode == ck_tile::QuantType::RowColQuant ||
-                QuantMode == ck_tile::QuantType::TensorQuant,
-            ck_tile::GemmPipelineAgBgCrCompV3<PipelineProblem>,
-            std::conditional_t<
-                QuantMode == ck_tile::QuantType::AQuantGrouped,
-                std::conditional_t<GemmConfig::PreshuffleQuant == true,
-                                   ck_tile::AQuantGemmPipelineAgBgCrCompV3<PipelineProblem>,
-                                   ck_tile::AQuantGemmPipelineAgBgCrMem<PipelineProblem>>,
-                std::conditional_t<
-                    QuantMode == ck_tile::QuantType::BQuantGrouped,
-                    std::conditional_t<GemmConfig::PreshuffleB == true,
-                                       ck_tile::WPQuantBPipelineAgBgCrV2<PipelineProblem>,
-                                       ck_tile::BQuantGemmPipelineAgBgCrCompV3<PipelineProblem>>,
-                    ck_tile::ABQuantGemmPipelineAgBgCrCompV3<PipelineProblem>>>>;
+        using GemmPipeline = std::conditional_t<
+            QuantMode == ck_tile::QuantType::RowColQuant ||
+                QuantMode == ck_tile::QuantType::TensorQuant,
+            ck_tile::GemmPipelineAgBgCrCompV3<PipelineProblem>,
+            std::conditional_t<
+                QuantMode == ck_tile::QuantType::AQuantGrouped,
+                std::conditional_t<GemmConfig::PreshuffleQuant == true,
+                                   ck_tile::AQuantGemmPipelineAgBgCrCompV3<PipelineProblem>,
+                                   ck_tile::AQuantGemmPipelineAgBgCrMem<PipelineProblem>>,
+                std::conditional_t<
+                    GemmConfig::PreshuffleB == true,
+                    ck_tile::WPQuantBPipelineAgBgCrV2<PipelineProblem>,
+                    std::conditional_t<
+                        std::is_same_v<typename TypeConfig::BDataType, ck_tile::pk_fp4_raw_t>,
+                        ck_tile::MxFp4GemmPipelineAgBgCrCompV3<PipelineProblem>,
+                        std::conditional_t<
+                            QuantMode == ck_tile::QuantType::ABQuantGrouped,
+                            ck_tile::ABQuantGemmPipelineAgBgCrCompV3<PipelineProblem>,
+                            ck_tile::BQuantGemmPipelineAgBgCrCompV3<PipelineProblem>>>>>>;

        constexpr bool TiledPermuteN =
            (BQuantGroupSize::kN > 1) ? false : GemmConfig::TiledMMAPermuteN;
@@ -168,28 +172,31 @@ float gemm_calc_quant(const ck_tile::QuantGemmHostArgs& args, const ck_tile::str
            printf(
                "TiledPermuteN: %d (QuantGroupSize::kN=%d)\n", TiledPermuteN, BQuantGroupSize::kN);
        }
-        using GemmEpilogue = ck_tile::CShuffleEpilogue<
-            ck_tile::CShuffleEpilogueProblem<typename TypeConfig::ADataType,
-                                             typename TypeConfig::BDataType,
-                                             ck_tile::tuple<>,
-                                             typename TypeConfig::AccDataType,
-                                             typename TypeConfig::CDataType,
-                                             ck_tile::tuple<>,
-                                             CLayout,
-                                             CDEElementWise,
-                                             TilePartitioner::MPerBlock,
-                                             TilePartitioner::NPerBlock,
-                                             GemmConfig::M_Warp,
-                                             GemmConfig::N_Warp,
-                                             GemmConfig::M_Warp_Tile,
-                                             GemmConfig::N_Warp_Tile,
-                                             GemmConfig::K_Warp_Tile,
-                                             transpose_c,
-                                             ck_tile::memory_operation_enum::set,
-                                             1,
-                                             false,
-                                             1,
-                                             TiledPermuteN>>;
+        using GemmEpilogue = ck_tile::CShuffleEpilogue<ck_tile::CShuffleEpilogueProblem<
+            typename TypeConfig::ADataType,
+            std::conditional_t<
+                std::is_same_v<typename TypeConfig::BDataType, ck_tile::pk_fp4_raw_t>,
+                typename TypeConfig::ADataType,
+                typename TypeConfig::BDataType>,
+            ck_tile::tuple<>,
+            typename TypeConfig::AccDataType,
+            typename TypeConfig::CDataType,
+            ck_tile::tuple<>,
+            CLayout,
+            CDEElementWise,
+            TilePartitioner::MPerBlock,
+            TilePartitioner::NPerBlock,
+            GemmConfig::M_Warp,
+            GemmConfig::N_Warp,
+            GemmConfig::M_Warp_Tile,
+            GemmConfig::N_Warp_Tile,
+            GemmConfig::K_Warp_Tile,
+            transpose_c,
+            ck_tile::memory_operation_enum::set,
+            1,
+            false,
+            1,
+            TiledPermuteN>>;
        using Kernel =
            ck_tile::QuantGemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue, QuantMode>;

@@ -226,7 +233,11 @@ float gemm_calc_quant(const ck_tile::QuantGemmHostArgs& args, const ck_tile::str
            ck_tile::HostTensor<typename TypeConfig::ADataType> a_m(ck_tile::host_tensor_descriptor(
                args.M, args.K, args.stride_A, is_row_major(ALayout{})));
            ck_tile::HostTensor<typename TypeConfig::BDataType> b_n(ck_tile::host_tensor_descriptor(
-                args.K, args.N, args.stride_B, is_row_major(BLayout{})));
+                std::is_same_v<typename TypeConfig::BDataType, ck_tile::pk_fp4_raw_t> ? args.K / 2
+                                                                                      : args.K,
+                args.N,
+                args.stride_B,
+                is_row_major(BLayout{})));

            auto size_a_buffer = a_m.get_element_space_size_in_bytes();
            auto size_b_buffer = b_n.get_element_space_size_in_bytes();
@@ -484,7 +495,11 @@ int run_gemm_example_with_layouts(const ck_tile::ArgParser& arg_parser,
    int rotating_count           = arg_parser.get_int("rotating_count");

    stride_A = ck_tile::get_default_stride(M, K, stride_A, is_row_major(a_layout));
-    stride_B = ck_tile::get_default_stride(K, N, stride_B, is_row_major(b_layout));
+    stride_B = ck_tile::get_default_stride(
+        (std::is_same_v<BDataType, ck_tile::pk_fp4_raw_t>) ? (K / 2) : K,
+        N,
+        stride_B,
+        is_row_major(b_layout));
    stride_C = ck_tile::get_default_stride(M, N, stride_C, is_row_major(CLayout{}));

    // Conditional stride calculation based on QuantMode
@@ -516,8 +531,11 @@ int run_gemm_example_with_layouts(const ck_tile::ArgParser& arg_parser,

    ck_tile::HostTensor<ADataType> a_m_k(
        ck_tile::host_tensor_descriptor(M, K, stride_A, is_row_major(a_layout)));
-    ck_tile::HostTensor<BDataType> b_k_n(
-        ck_tile::host_tensor_descriptor(K, N, stride_B, is_row_major(b_layout)));
+    ck_tile::HostTensor<BDataType> b_k_n(ck_tile::host_tensor_descriptor(
+        (std::is_same_v<BDataType, ck_tile::pk_fp4_raw_t>) ? (K / 2) : K,
+        N,
+        stride_B,
+        is_row_major(b_layout)));
    ck_tile::HostTensor<CDataType> c_m_n_dev_result(
        ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));

@@ -563,13 +581,22 @@ int run_gemm_example_with_layouts(const ck_tile::ArgParser& arg_parser,
            {
                ck_tile::FillUniformDistribution<ck_tile::pk_int4_t>{-5.0f, 5.0f, fill_seed(gen)}(
                    b_k_n);
+                ck_tile::FillUniformDistribution<BQDataType>{-2.0f, 2.0f, fill_seed(gen)}(
+                    *bq_tensor_ptr);
+            }
+            else if constexpr(std::is_same_v<BDataType, ck_tile::pk_fp4_raw_t>)
+            {
+                ck_tile::FillUniformDistribution<BDataType>{-5.0f, 5.0f, fill_seed(gen)}(b_k_n);
+                ck_tile::FillUniformDistribution<BQDataType>{125.f, 130.f, fill_seed(gen)}(
+                    *bq_tensor_ptr);
            }
            else
            {
                ck_tile::FillUniformDistribution<BDataType>{-2.0f, 3.0f, fill_seed(gen)}(b_k_n);
+                ck_tile::FillUniformDistribution<BQDataType>{-2.0f, 2.0f, fill_seed(gen)}(
+                    *bq_tensor_ptr);
            }
-            ck_tile::FillUniformDistribution<BQDataType>{-2.0f, 2.0f, fill_seed(gen)}(
-                *bq_tensor_ptr);
+
            ck_tile::FillUniformDistribution<ADataType>{-5.0f, 5.0f, fill_seed(gen)}(a_m_k);
        }
        else if constexpr(QuantMode == ck_tile::QuantType::AQuantGrouped)
@@ -817,13 +844,23 @@ int run_gemm_example_with_layouts(const ck_tile::ArgParser& arg_parser,
        }
        else if constexpr(QuantMode == ck_tile::QuantType::BQuantGrouped)
        {
-            ck_tile::reference_gemm_quant<ADataType,
-                                          AQDataType,
-                                          BDataType,
-                                          AccDataType,
-                                          CDataType,
-                                          BQuantGroupSize,
-                                          false>(a_m_k, *bq_tensor_ptr, b_k_n, c_m_n_host_ref);
+            if constexpr(std::is_same_v<BDataType, ck_tile::pk_fp4_raw_t>)
+                ck_tile::reference_mxfp4gemm_quant<ADataType,
+                                                   BQDataType,
+                                                   BDataType,
+                                                   AccDataType,
+                                                   CDataType,
+                                                   BQuantGroupSize,
+                                                   false>(
+                    a_m_k, *bq_tensor_ptr, b_k_n, c_m_n_host_ref);
+            else
+                ck_tile::reference_gemm_quant<ADataType,
+                                              AQDataType,
+                                              BDataType,
+                                              AccDataType,
+                                              CDataType,
+                                              BQuantGroupSize,
+                                              false>(a_m_k, *bq_tensor_ptr, b_k_n, c_m_n_host_ref);
        }
        else if constexpr(QuantMode == ck_tile::QuantType::ABQuantGrouped)
        {
@@ -896,16 +933,18 @@ int run_gemm_example_prec_type(const ck_tile::ArgParser& arg_parser)
    using Col = ck_tile::tensor_layout::gemm::ColumnMajor;

    if((QuantMode == ck_tile::QuantType::AQuantGrouped ||
-        QuantMode == ck_tile::QuantType::RowColQuant) &&
+        QuantMode == ck_tile::QuantType::RowColQuant ||
+        std::is_same_v<typename TypeConfig::BDataType, ck_tile::pk_fp4_raw_t>) &&
       GemmConfig::PreshuffleB)
    {
        throw std::runtime_error(
-            "Preshuffling weight matrix is not supported for AQuant or RowColQuant");
+            "Preshuffling weight matrix is not supported for AQuant, RowColQuant or bf16_fp4_gemm");
    }

    if constexpr(std::is_same_v<typename TypeConfig::ADataType, ck_tile::pk_int4_t> ||
                 std::is_same_v<typename TypeConfig::ADataType, ck_tile::fp8_t> ||
-                 std::is_same_v<typename TypeConfig::ADataType, ck_tile::bf8_t>)
+                 std::is_same_v<typename TypeConfig::ADataType, ck_tile::bf8_t> ||
+                 std::is_same_v<typename TypeConfig::ADataType, ck_tile::bf16_t>)
    {
        std::string a_layout = arg_parser.get_str("a_layout");
        std::string b_layout = arg_parser.get_str("b_layout");
--- a/example/test_old_ck_gpu_reference.cpp
+++ b/example/test_old_ck_gpu_reference.cpp
@@ -1,5 +1,5 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.

 // Standalone test program for Old CK GPU references
 // Tests naive_conv_fwd (existing) and future backward ops