Reorganize project folders (#6)

2026-05-23 22:34:36 +00:00 · 2025-04-30 13:46:39 -04:00
commit 1eb2e57380
3952 changed files with 654944 additions and 0 deletions
--- a/example/22_cgemm/CMakeLists.txt
+++ b/example/22_cgemm/CMakeLists.txt
@@ -0,0 +1,18 @@
+add_custom_target(example_cgemm_xdl)
+
+add_example_executable(example_cgemm_xdl_bf16 cgemm_xdl_bf16.cpp)
+add_example_dependencies(example_cgemm_xdl example_cgemm_xdl_bf16)
+
+add_example_executable(example_cgemm_xdl_fp16 cgemm_xdl_fp16.cpp)
+add_example_dependencies(example_cgemm_xdl example_cgemm_xdl_fp16)
+
+add_example_executable(example_cgemm_xdl_fp32 cgemm_xdl_fp32.cpp)
+add_example_dependencies(example_cgemm_xdl example_cgemm_xdl_fp32)
+
+add_example_executable(example_cgemm_xdl_int8 cgemm_xdl_int8.cpp)
+add_example_dependencies(example_cgemm_xdl example_cgemm_xdl_int8)
+
+if(USE_BITINT_EXTENSION_INT4)
+    add_example_executable(example_cgemm_xdl_int4 cgemm_xdl_int4.cpp)
+    add_example_dependencies(example_cgemm_xdl example_cgemm_xdl_int4)
+endif()
--- a/example/22_cgemm/cgemm_xdl_bf16.cpp
+++ b/example/22_cgemm/cgemm_xdl_bf16.cpp
@@ -0,0 +1,132 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+
+#include "cgemm_xdl_common.hpp"
+
+#include "ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp"
+
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+
+using ADataType   = BF16;
+using BDataType   = BF16;
+using CDataType   = BF16;
+using AccDataType = F32;
+
+using ALayout = ck::tensor_layout::gemm::RowMajor;
+using BLayout = ck::tensor_layout::gemm::ColumnMajor;
+using CLayout = ck::tensor_layout::gemm::RowMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+using ReferenceCGemmInstance = ck::tensor_operation::host::
+    ReferenceCGemm<ADataType, BDataType, CDataType, PassThrough, PassThrough, PassThrough>;
+
+// clang-format off
+using DeviceCGemmInstance = ck::tensor_operation::device::DeviceCGemm_4Gemm_Xdl_CShuffle
+    <ALayout,                    // typename ALayout
+     BLayout,                    // typename BLayout
+     CLayout,                    // typename CLayout
+     ADataType,                  // typename ADataType
+     BDataType,                  // typename BDataType
+     CDataType,                  // typename CDataType
+     AccDataType,                // typename GemmAccDataType
+     CDataType,                  // typename CShuffleDataType
+     PassThrough,                // typename AElementwiseOperation
+     PassThrough,                // typename BElementwiseOperation
+     PassThrough,                // typename CElementwiseOperation
+     GemmDefault,                // GemmSpecialization GemmSpec
+     1,                          // index_t NumGemmKPrefetchStage
+     256,                        // index_t BlockSize
+     256,                        // index_t MPerBlock
+     128,                        // index_t NPerBlock
+     32,                         // index_t KPerBlock
+     8,                          // index_t AK1
+     8,                          // index_t BK1
+     32,                         // index_t MPerXDL
+     32,                         // index_t NPerXDL
+     4,                          // index_t MXdlPerWave
+     2,                          // index_t NXdlPerWave
+     S<4, 64, 1>,                // typename ABlockTransferThreadClusterLengths_AK0_M_AK1
+     S<1, 0, 2>,                 // typename ABlockTransferThreadClusterArrangeOrder
+     S<1, 0, 2>,                 // typename ABlockTransferSrcAccessOrder
+     2,                          // index_t ABlockTransferSrcVectorDim
+     8,                          // index_t ABlockTransferSrcScalarPerVector
+     8,                          // index_t ABlockTransferDstScalarPerVector_AK1
+     1,                          // index_t ABlockLdsExtraM
+     S<4, 64, 1>,                // typename BBlockTransferThreadClusterLengths_BK0_N_BK1
+     S<1, 0, 2>,                 // typename BBlockTransferThreadClusterArrangeOrder
+     S<1, 0, 2>,                 // typename BBlockTransferSrcAccessOrder
+     2,                          // index_t BBlockTransferSrcVectorDim
+     8,                          // index_t BBlockTransferSrcScalarPerVector
+     8,                          // index_t BBlockTransferDstScalarPerVector_BK1
+     1,                          // index_t BBlockLdsExtraN
+     1,                          // index_t CShuffleMXdlPerWavePerShuffle
+     1,                          // index_t CShuffleNXdlPerWavePerShuffle
+     S<1, 32, 1, 8>,             // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+     8>;                         // index_t CShuffleBlockTransferScalarPerVector_NPerBlock
+// clang-format on
+
+int main(int argc, char* argv[])
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
+
+    // CGEMM shape
+    ck::index_t M = 3840;
+    ck::index_t N = 4096;
+    ck::index_t K = 416;
+
+    ck::index_t StrideA = 4096;
+    ck::index_t StrideB = 4096;
+    ck::index_t StrideC = 4096;
+
+    if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+    }
+    else if(argc == 10)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+
+        M = std::stoi(argv[4]);
+        N = std::stoi(argv[5]);
+        K = std::stoi(argv[6]);
+
+        StrideA = std::stoi(argv[7]);
+        StrideB = std::stoi(argv[8]);
+        StrideC = std::stoi(argv[9]);
+    }
+    else
+    {
+        std::cout << "arg1: verification (0=no, 1=yes)\n"
+                  << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
+                  << "arg3: run kernel # of times (>1)\n"
+                  << "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n"
+                  << std::endl;
+        exit(0);
+    }
+
+    return !run_cgemm_xdl<ADataType,
+                          BDataType,
+                          CDataType,
+                          ALayout,
+                          BLayout,
+                          CLayout,
+                          PassThrough,
+                          PassThrough,
+                          PassThrough,
+                          DeviceCGemmInstance,
+                          ReferenceCGemmInstance>(
+        M, N, K, StrideA, StrideB, StrideC, do_verification, init_method, time_kernel);
+}
--- a/example/22_cgemm/cgemm_xdl_common.hpp
+++ b/example/22_cgemm/cgemm_xdl_common.hpp
@@ -0,0 +1,254 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/stream_config.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using F16   = ck::half_t;
+using F32   = float;
+using BF16  = ck::bhalf_t;
+using INT8  = std::int8_t;
+using INT32 = std::int32_t;
+#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+using INT4 = ck::int4_t;
+#endif
+
+template <typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation,
+          typename DeviceCGemmInstance,
+          typename ReferenceCGemmInstance,
+          typename KernelADataType = ADataType,
+          typename KernelBDataType = BDataType,
+          typename KernelCDataType = CDataType>
+bool run_cgemm_xdl(ck::index_t M,
+                   ck::index_t N,
+                   ck::index_t K,
+                   ck::index_t StrideA,
+                   ck::index_t StrideB,
+                   ck::index_t StrideC,
+                   bool do_verification,
+                   int init_method,
+                   bool time_kernel)
+{
+#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+    static_assert(sizeof(ck::int4_t) == sizeof(int8_t),
+                  "sizeof ck::int4_t and int8_t is different!");
+    static_assert(sizeof(ADataType) == sizeof(KernelADataType),
+                  "sizeof ADataType and KernelADataType is different!");
+    static_assert(sizeof(BDataType) == sizeof(KernelBDataType),
+                  "sizeof BDataType and KernelBDataType is different!");
+    static_assert(sizeof(CDataType) == sizeof(KernelCDataType),
+                  "sizeof CDataType and KernelCDataType is different!");
+#endif
+
+    auto f_host_tensor_descriptor =
+        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            using namespace ck::literals;
+
+            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+            {
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
+            }
+            else
+            {
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
+            }
+        };
+
+    Tensor<ADataType> a_m_k_real(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
+    Tensor<ADataType> a_m_k_imag(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
+    Tensor<BDataType> b_k_n_real(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+    Tensor<BDataType> b_k_n_imag(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+    Tensor<KernelCDataType> c_m_n_real_device_result(
+        f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+    Tensor<KernelCDataType> c_m_n_imag_device_result(
+        f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+
+    std::cout << "a_m_k_real: " << a_m_k_real.mDesc << std::endl;
+    std::cout << "a_m_k_imag: " << a_m_k_imag.mDesc << std::endl;
+    std::cout << "b_k_n_real: " << b_k_n_real.mDesc << std::endl;
+    std::cout << "b_k_n_imag: " << b_k_n_imag.mDesc << std::endl;
+    std::cout << "c_m_n_real: " << c_m_n_real_device_result.mDesc << std::endl;
+    std::cout << "c_m_n_imag: " << c_m_n_imag_device_result.mDesc << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        a_m_k_real.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+        a_m_k_imag.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+        b_k_n_real.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
+        b_k_n_imag.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
+        break;
+    default:
+        a_m_k_real.GenerateTensorValue(GeneratorTensor_3<ADataType>{-0.5, 0.5});
+        a_m_k_imag.GenerateTensorValue(GeneratorTensor_3<ADataType>{-0.5, 0.5});
+        b_k_n_real.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        b_k_n_imag.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+    }
+
+    auto cgemm = DeviceCGemmInstance{};
+
+    DeviceMem a_m_k_real_device_buf(sizeof(KernelADataType) *
+                                    a_m_k_real.mDesc.GetElementSpaceSize());
+    DeviceMem a_m_k_imag_device_buf(sizeof(KernelADataType) *
+                                    a_m_k_imag.mDesc.GetElementSpaceSize());
+    DeviceMem b_k_n_real_device_buf(sizeof(KernelBDataType) *
+                                    b_k_n_real.mDesc.GetElementSpaceSize());
+    DeviceMem b_k_n_imag_device_buf(sizeof(KernelBDataType) *
+                                    b_k_n_imag.mDesc.GetElementSpaceSize());
+    DeviceMem c_m_n_real_device_buf(sizeof(KernelCDataType) *
+                                    c_m_n_real_device_result.mDesc.GetElementSpaceSize());
+    DeviceMem c_m_n_imag_device_buf(sizeof(KernelCDataType) *
+                                    c_m_n_imag_device_result.mDesc.GetElementSpaceSize());
+    DeviceMem workspace_device_buf(cgemm.GetWorkspaceSize(M, N, K, StrideA, StrideB, StrideC));
+
+#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+    if constexpr(std::is_same_v<ADataType, ck::int4_t>)
+    {
+        Tensor<KernelADataType> a_m_k_real_converted(a_m_k_real);
+        Tensor<KernelADataType> a_m_k_imag_converted(a_m_k_imag);
+        Tensor<KernelBDataType> b_k_n_real_converted(b_k_n_real);
+        Tensor<KernelBDataType> b_k_n_imag_converted(b_k_n_imag);
+
+        a_m_k_real_device_buf.ToDevice(a_m_k_real_converted.mData.data());
+        a_m_k_imag_device_buf.ToDevice(a_m_k_imag_converted.mData.data());
+        b_k_n_real_device_buf.ToDevice(b_k_n_real_converted.mData.data());
+        b_k_n_imag_device_buf.ToDevice(b_k_n_imag_converted.mData.data());
+    }
+    else
+#endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+    {
+        a_m_k_real_device_buf.ToDevice(a_m_k_real.mData.data());
+        a_m_k_imag_device_buf.ToDevice(a_m_k_imag.mData.data());
+        b_k_n_real_device_buf.ToDevice(b_k_n_real.mData.data());
+        b_k_n_imag_device_buf.ToDevice(b_k_n_imag.mData.data());
+    }
+
+    auto a_element_op = AElementwiseOperation{};
+    auto b_element_op = BElementwiseOperation{};
+    auto c_element_op = CElementwiseOperation{};
+
+    // do GEMM
+    auto invoker = cgemm.MakeInvoker();
+    auto argument =
+        cgemm.MakeArgument(static_cast<KernelADataType*>(a_m_k_real_device_buf.GetDeviceBuffer()),
+                           static_cast<KernelADataType*>(a_m_k_imag_device_buf.GetDeviceBuffer()),
+                           static_cast<KernelBDataType*>(b_k_n_real_device_buf.GetDeviceBuffer()),
+                           static_cast<KernelBDataType*>(b_k_n_imag_device_buf.GetDeviceBuffer()),
+                           static_cast<KernelCDataType*>(c_m_n_real_device_buf.GetDeviceBuffer()),
+                           static_cast<KernelCDataType*>(c_m_n_imag_device_buf.GetDeviceBuffer()),
+                           static_cast<KernelCDataType*>(workspace_device_buf.GetDeviceBuffer()),
+                           M,
+                           N,
+                           K,
+                           StrideA,
+                           StrideB,
+                           StrideC,
+                           a_element_op,
+                           b_element_op,
+                           c_element_op);
+
+    if(!cgemm.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device_cgemm with the specified compilation parameters does "
+            "not support this CGEMM problem");
+    }
+
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+
+    std::size_t flop = std::size_t(8) * M * N * K;
+    std::size_t num_btype =
+        std::size_t(2) *
+        (sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N);
+
+    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << cgemm.GetTypeString() << std::endl;
+
+    if(do_verification)
+    {
+        Tensor<CDataType> c_m_n_real_host_result(
+            f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+        Tensor<CDataType> c_m_n_imag_host_result(
+            f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+
+        auto ref_cgemm    = ReferenceCGemmInstance{};
+        auto ref_invoker  = ref_cgemm.MakeInvoker();
+        auto ref_argument = ref_cgemm.MakeArgument(a_m_k_real,
+                                                   a_m_k_imag,
+                                                   b_k_n_real,
+                                                   b_k_n_imag,
+                                                   c_m_n_real_host_result,
+                                                   c_m_n_imag_host_result,
+                                                   a_element_op,
+                                                   b_element_op,
+                                                   c_element_op);
+
+        ref_invoker.Run(ref_argument);
+
+        c_m_n_real_device_buf.FromDevice(c_m_n_real_device_result.mData.data());
+        c_m_n_imag_device_buf.FromDevice(c_m_n_imag_device_result.mData.data());
+
+        bool result = true;
+#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+        if constexpr(std::is_same_v<ADataType, ck::int4_t>)
+        {
+            const Tensor<CDataType> c_m_n_real_device_result_converted(c_m_n_real_device_result);
+            const Tensor<CDataType> c_m_n_imag_device_result_converted(c_m_n_imag_device_result);
+
+            result = ck::utils::check_err(c_m_n_real_device_result_converted,
+                                          c_m_n_real_host_result,
+                                          "Verification error: incorrect results in real part!",
+                                          1e-2f,
+                                          1e-1f);
+            result = result && ck::utils::check_err(
+                                   c_m_n_imag_device_result_converted,
+                                   c_m_n_imag_host_result,
+                                   "Verification error: incorrect results in imaginary part!",
+                                   1e-2f,
+                                   1e-1f);
+        }
+        else
+#endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+        {
+            result = ck::utils::check_err(c_m_n_real_device_result,
+                                          c_m_n_real_host_result,
+                                          "Verification error: incorrect results in real part!",
+                                          1e-2f,
+                                          1e-1f);
+            result = result && ck::utils::check_err(
+                                   c_m_n_imag_device_result,
+                                   c_m_n_imag_host_result,
+                                   "Verification error: incorrect results in imaginary part!",
+                                   1e-2f,
+                                   1e-1f);
+        }
+
+        return result;
+    }
+    return true;
+}
--- a/example/22_cgemm/cgemm_xdl_fp16.cpp
+++ b/example/22_cgemm/cgemm_xdl_fp16.cpp
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+
+#include "cgemm_xdl_common.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+
+using ADataType        = F16;
+using BDataType        = F16;
+using CDataType        = F16;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+
+using ALayout = ck::tensor_layout::gemm::RowMajor;
+using BLayout = ck::tensor_layout::gemm::ColumnMajor;
+using CLayout = ck::tensor_layout::gemm::RowMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+using ReferenceCGemmInstance = ck::tensor_operation::host::
+    ReferenceCGemm<ADataType, BDataType, CDataType, PassThrough, PassThrough, PassThrough>;
+
+// clang-format off
+using DeviceCGemmInstance = ck::tensor_operation::device::DeviceCGemm_4Gemm_Xdl_CShuffle
+    <ALayout,                    // typename ALayout
+     BLayout,                    // typename BLayout
+     CLayout,                    // typename CLayout
+     ADataType,                  // typename ADataType
+     BDataType,                  // typename BDataType
+     CDataType,                  // typename CDataType
+     AccDataType,                // typename GemmAccDataType
+     CShuffleDataType,           // typename CShuffleDataType
+     PassThrough,                // typename AElementwiseOperation
+     PassThrough,                // typename BElementwiseOperation
+     PassThrough,                // typename CElementwiseOperation
+     GemmDefault,                // GemmSpecialization GemmSpec
+     1,                          // index_t NumGemmKPrefetchStage
+     256,                        // index_t BlockSize
+     256,                        // index_t MPerBlock
+     128,                        // index_t NPerBlock
+     32,                         // index_t KPerBlock
+     8,                          // index_t AK1
+     8,                          // index_t BK1
+     32,                         // index_t MPerXDL
+     32,                         // index_t NPerXDL
+     4,                          // index_t MXdlPerWave
+     2,                          // index_t NXdlPerWave
+     S<4, 64, 1>,                // typename ABlockTransferThreadClusterLengths_AK0_M_AK1
+     S<1, 0, 2>,                 // typename ABlockTransferThreadClusterArrangeOrder
+     S<1, 0, 2>,                 // typename ABlockTransferSrcAccessOrder
+     2,                          // index_t ABlockTransferSrcVectorDim
+     8,                          // index_t ABlockTransferSrcScalarPerVector
+     8,                          // index_t ABlockTransferDstScalarPerVector_AK1
+     1,                          // index_t ABlockLdsExtraM
+     S<4, 64, 1>,                // typename BBlockTransferThreadClusterLengths_BK0_N_BK1
+     S<1, 0, 2>,                 // typename BBlockTransferThreadClusterArrangeOrder
+     S<1, 0, 2>,                 // typename BBlockTransferSrcAccessOrder
+     2,                          // index_t BBlockTransferSrcVectorDim
+     8,                          // index_t BBlockTransferSrcScalarPerVector
+     8,                          // index_t BBlockTransferDstScalarPerVector_BK1
+     1,                          // index_t BBlockLdsExtraN
+     1,                          // index_t CShuffleMXdlPerWavePerShuffle
+     1,                          // index_t CShuffleNXdlPerWavePerShuffle
+     S<1, 32, 1, 8>,             // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+     8>;                         // index_t CShuffleBlockTransferScalarPerVector_NPerBlock
+// clang-format on
+
+int main(int argc, char* argv[])
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
+
+    // CGEMM shape
+    ck::index_t M = 3840;
+    ck::index_t N = 4096;
+    ck::index_t K = 4096;
+
+    ck::index_t StrideA = 4096;
+    ck::index_t StrideB = 4096;
+    ck::index_t StrideC = 4096;
+
+    if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+    }
+    else if(argc == 10)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+
+        M = std::stoi(argv[4]);
+        N = std::stoi(argv[5]);
+        K = std::stoi(argv[6]);
+
+        StrideA = std::stoi(argv[7]);
+        StrideB = std::stoi(argv[8]);
+        StrideC = std::stoi(argv[9]);
+    }
+    else
+    {
+        std::cout << "arg1: verification (0=no, 1=yes)\n"
+                  << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
+                  << "arg3: run kernel # of times (>1)\n"
+                  << "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n"
+                  << std::endl;
+        exit(0);
+    }
+
+    return !run_cgemm_xdl<ADataType,
+                          BDataType,
+                          CDataType,
+                          ALayout,
+                          BLayout,
+                          CLayout,
+                          PassThrough,
+                          PassThrough,
+                          PassThrough,
+                          DeviceCGemmInstance,
+                          ReferenceCGemmInstance>(
+        M, N, K, StrideA, StrideB, StrideC, do_verification, init_method, time_kernel);
+}
--- a/example/22_cgemm/cgemm_xdl_fp32.cpp
+++ b/example/22_cgemm/cgemm_xdl_fp32.cpp
@@ -0,0 +1,132 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+
+#include "cgemm_xdl_common.hpp"
+
+#include "ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp"
+
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+
+using ADataType   = F32;
+using BDataType   = F32;
+using CDataType   = F32;
+using AccDataType = F32;
+
+using ALayout = ck::tensor_layout::gemm::RowMajor;
+using BLayout = ck::tensor_layout::gemm::ColumnMajor;
+using CLayout = ck::tensor_layout::gemm::RowMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+using ReferenceCGemmInstance = ck::tensor_operation::host::
+    ReferenceCGemm<ADataType, BDataType, CDataType, PassThrough, PassThrough, PassThrough>;
+
+// clang-format off
+using DeviceCGemmInstance = ck::tensor_operation::device::DeviceCGemm_4Gemm_Xdl_CShuffle
+    <ALayout,                    // typename ALayout
+     BLayout,                    // typename BLayout
+     CLayout,                    // typename CLayout
+     ADataType,                  // typename ADataType
+     BDataType,                  // typename BDataType
+     CDataType,                  // typename CDataType
+     AccDataType,                // typename GemmAccDataType
+     CDataType,                  // typename CShuffleDataType
+     PassThrough,                // typename AElementwiseOperation
+     PassThrough,                // typename BElementwiseOperation
+     PassThrough,                // typename CElementwiseOperation
+     GemmDefault,                // GemmSpecialization GemmSpec
+     1,                          // index_t NumGemmKPrefetchStage
+     256,                        // index_t BlockSize
+     256,                        // index_t MPerBlock
+     128,                        // index_t NPerBlock
+     16,                         // index_t KPerBlock
+     4,                          // index_t AK1
+     4,                          // index_t BK1
+     32,                         // index_t MPerXDL
+     32,                         // index_t NPerXDL
+     4,                          // index_t MXdlPerWave
+     2,                          // index_t NXdlPerWave
+     S<4, 64, 1>,                // typename ABlockTransferThreadClusterLengths_AK0_M_AK1
+     S<1, 0, 2>,                 // typename ABlockTransferThreadClusterArrangeOrder
+     S<1, 0, 2>,                 // typename ABlockTransferSrcAccessOrder
+     2,                          // index_t ABlockTransferSrcVectorDim
+     4,                          // index_t ABlockTransferSrcScalarPerVector
+     4,                          // index_t ABlockTransferDstScalarPerVector_AK1
+     1,                          // index_t ABlockLdsExtraM
+     S<4, 64, 1>,                // typename BBlockTransferThreadClusterLengths_BK0_N_BK1
+     S<1, 0, 2>,                 // typename BBlockTransferThreadClusterArrangeOrder
+     S<1, 0, 2>,                 // typename BBlockTransferSrcAccessOrder
+     2,                          // index_t BBlockTransferSrcVectorDim
+     4,                          // index_t BBlockTransferSrcScalarPerVector
+     4,                          // index_t BBlockTransferDstScalarPerVector_BK1
+     1,                          // index_t BBlockLdsExtraN
+     1,                          // index_t CShuffleMXdlPerWavePerShuffle
+     1,                          // index_t CShuffleNXdlPerWavePerShuffle
+     S<1, 16, 1, 16>,            // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+     4>;                         // index_t CShuffleBlockTransferScalarPerVector_NPerBlock
+// clang-format on
+
+int main(int argc, char* argv[])
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
+
+    // CGEMM shape
+    ck::index_t M = 3840;
+    ck::index_t N = 4096;
+    ck::index_t K = 4096;
+
+    ck::index_t StrideA = 4096;
+    ck::index_t StrideB = 4096;
+    ck::index_t StrideC = 4096;
+
+    if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+    }
+    else if(argc == 10)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+
+        M = std::stoi(argv[4]);
+        N = std::stoi(argv[5]);
+        K = std::stoi(argv[6]);
+
+        StrideA = std::stoi(argv[7]);
+        StrideB = std::stoi(argv[8]);
+        StrideC = std::stoi(argv[9]);
+    }
+    else
+    {
+        std::cout << "arg1: verification (0=no, 1=yes)\n"
+                  << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
+                  << "arg3: run kernel # of times (>1)\n"
+                  << "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n"
+                  << std::endl;
+        exit(0);
+    }
+
+    return !run_cgemm_xdl<ADataType,
+                          BDataType,
+                          CDataType,
+                          ALayout,
+                          BLayout,
+                          CLayout,
+                          PassThrough,
+                          PassThrough,
+                          PassThrough,
+                          DeviceCGemmInstance,
+                          ReferenceCGemmInstance>(
+        M, N, K, StrideA, StrideB, StrideC, do_verification, init_method, time_kernel);
+}
--- a/example/22_cgemm/cgemm_xdl_int4.cpp
+++ b/example/22_cgemm/cgemm_xdl_int4.cpp
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+
+#include "cgemm_xdl_common.hpp"
+
+#include "ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp"
+
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+
+using ADataType        = INT4;
+using BDataType        = INT4;
+using CDataType        = INT4;
+using AccDataType      = INT32;
+using CShuffleDataType = INT32;
+
+using KernelADataType = INT8;
+using KernelBDataType = INT8;
+using KernelCDataType = INT8;
+
+using ALayout = ck::tensor_layout::gemm::RowMajor;
+using BLayout = ck::tensor_layout::gemm::ColumnMajor;
+using CLayout = ck::tensor_layout::gemm::RowMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+using ReferenceCGemmInstance = ck::tensor_operation::host::
+    ReferenceCGemm<ADataType, BDataType, CDataType, PassThrough, PassThrough, PassThrough>;
+
+// clang-format off
+using DeviceCGemmInstance = ck::tensor_operation::device::DeviceCGemm_4Gemm_Xdl_CShuffle
+    <ALayout,                    // typename ALayout
+     BLayout,                    // typename BLayout
+     CLayout,                    // typename CLayout
+     KernelADataType,            // typename ADataType
+     KernelBDataType,            // typename BDataType
+     KernelCDataType,            // typename CDataType
+     AccDataType,                // typename GemmAccDataType
+     CShuffleDataType,           // typename CShuffleDataType
+     PassThrough,                // typename AElementwiseOperation
+     PassThrough,                // typename BElementwiseOperation
+     PassThrough,                // typename CElementwiseOperation
+     GemmDefault,                // GemmSpecialization GemmSpec
+     1,                          // index_t NumGemmKPrefetchStage
+     256,                        // index_t BlockSize
+     256,                        // index_t MPerBlock
+     128,                        // index_t NPerBlock
+     64,                         // index_t KPerBlock
+     16,                         // index_t AK1
+     16,                         // index_t BK1
+     32,                         // index_t MPerXDL
+     32,                         // index_t NPerXDL
+     4,                          // index_t MXdlPerWave
+     2,                          // index_t NXdlPerWave
+     S<4, 64, 1>,                // typename ABlockTransferThreadClusterLengths_AK0_M_AK1
+     S<1, 0, 2>,                 // typename ABlockTransferThreadClusterArrangeOrder
+     S<1, 0, 2>,                 // typename ABlockTransferSrcAccessOrder
+     2,                          // index_t ABlockTransferSrcVectorDim
+     16,                         // index_t ABlockTransferSrcScalarPerVector
+     16,                         // index_t ABlockTransferDstScalarPerVector_AK1
+     1,                          // index_t ABlockLdsExtraM
+     S<4, 64, 1>,                // typename BBlockTransferThreadClusterLengths_BK0_N_BK1
+     S<1, 0, 2>,                 // typename BBlockTransferThreadClusterArrangeOrder
+     S<1, 0, 2>,                 // typename BBlockTransferSrcAccessOrder
+     2,                          // index_t BBlockTransferSrcVectorDim
+     8,                          // index_t BBlockTransferSrcScalarPerVector
+     8,                          // index_t BBlockTransferDstScalarPerVector_BK1
+     1,                          // index_t BBlockLdsExtraN
+     1,                          // index_t CShuffleMXdlPerWavePerShuffle
+     1,                          // index_t CShuffleNXdlPerWavePerShuffle
+     S<1, 64, 1, 4>,             // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+     16>;                        // index_t CShuffleBlockTransferScalarPerVector_NPerBlock
+// clang-format on
+
+int main(int argc, char* argv[])
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = true;
+
+    // CGEMM shape
+    ck::index_t M = 1024;
+    ck::index_t N = 1152;
+    ck::index_t K = 512;
+
+    ck::index_t StrideA = K;
+    ck::index_t StrideB = K;
+    ck::index_t StrideC = N;
+
+    if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+    }
+    else if(argc == 10)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+
+        M = std::stoi(argv[4]);
+        N = std::stoi(argv[5]);
+        K = std::stoi(argv[6]);
+
+        StrideA = std::stoi(argv[7]);
+        StrideB = std::stoi(argv[8]);
+        StrideC = std::stoi(argv[9]);
+    }
+    else
+    {
+        std::cout << "arg1: verification (0=no, 1=yes)\n"
+                  << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
+                  << "arg3: time kernel (0=no, 1=yes)\n"
+                  << "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n"
+                  << std::endl;
+        exit(EXIT_SUCCESS);
+    }
+
+    return !run_cgemm_xdl<ADataType,
+                          BDataType,
+                          CDataType,
+                          ALayout,
+                          BLayout,
+                          CLayout,
+                          PassThrough,
+                          PassThrough,
+                          PassThrough,
+                          DeviceCGemmInstance,
+                          ReferenceCGemmInstance,
+                          KernelADataType,
+                          KernelBDataType,
+                          KernelCDataType>(
+        M, N, K, StrideA, StrideB, StrideC, do_verification, init_method, time_kernel);
+}
--- a/example/22_cgemm/cgemm_xdl_int8.cpp
+++ b/example/22_cgemm/cgemm_xdl_int8.cpp
@@ -0,0 +1,132 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+
+#include "cgemm_xdl_common.hpp"
+
+#include "ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp"
+
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+
+using ADataType   = INT8;
+using BDataType   = INT8;
+using CDataType   = INT8;
+using AccDataType = INT32;
+
+using ALayout = ck::tensor_layout::gemm::RowMajor;
+using BLayout = ck::tensor_layout::gemm::ColumnMajor;
+using CLayout = ck::tensor_layout::gemm::RowMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+using ReferenceCGemmInstance = ck::tensor_operation::host::
+    ReferenceCGemm<ADataType, BDataType, CDataType, PassThrough, PassThrough, PassThrough>;
+
+// clang-format off
+using DeviceCGemmInstance = ck::tensor_operation::device::DeviceCGemm_4Gemm_Xdl_CShuffle
+    <ALayout,                    // typename ALayout
+     BLayout,                    // typename BLayout
+     CLayout,                    // typename CLayout
+     ADataType,                  // typename ADataType
+     BDataType,                  // typename BDataType
+     CDataType,                  // typename CDataType
+     AccDataType,                // typename GemmAccDataType
+     CDataType,                  // typename CShuffleDataType
+     PassThrough,                // typename AElementwiseOperation
+     PassThrough,                // typename BElementwiseOperation
+     PassThrough,                // typename CElementwiseOperation
+     GemmDefault,                // GemmSpecialization GemmSpec
+     1,                          // index_t NumGemmKPrefetchStage
+     256,                        // index_t BlockSize
+     256,                        // index_t MPerBlock
+     128,                        // index_t NPerBlock
+     64,                         // index_t KPerBlock
+     16,                         // index_t AK1
+     16,                         // index_t BK1
+     32,                         // index_t MPerXDL
+     32,                         // index_t NPerXDL
+     4,                          // index_t MXdlPerWave
+     2,                          // index_t NXdlPerWave
+     S<4, 64, 1>,                // typename ABlockTransferThreadClusterLengths_AK0_M_AK1
+     S<1, 0, 2>,                 // typename ABlockTransferThreadClusterArrangeOrder
+     S<1, 0, 2>,                 // typename ABlockTransferSrcAccessOrder
+     2,                          // index_t ABlockTransferSrcVectorDim
+     16,                         // index_t ABlockTransferSrcScalarPerVector
+     16,                         // index_t ABlockTransferDstScalarPerVector_AK1
+     1,                          // index_t ABlockLdsExtraM
+     S<4, 64, 1>,                // typename BBlockTransferThreadClusterLengths_BK0_N_BK1
+     S<1, 0, 2>,                 // typename BBlockTransferThreadClusterArrangeOrder
+     S<1, 0, 2>,                 // typename BBlockTransferSrcAccessOrder
+     2,                          // index_t BBlockTransferSrcVectorDim
+     8,                          // index_t BBlockTransferSrcScalarPerVector
+     8,                          // index_t BBlockTransferDstScalarPerVector_BK1
+     1,                          // index_t BBlockLdsExtraN
+     1,                          // index_t CShuffleMXdlPerWavePerShuffle
+     1,                          // index_t CShuffleNXdlPerWavePerShuffle
+     S<1, 64, 1, 4>,             // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+     16>;                        // index_t CShuffleBlockTransferScalarPerVector_NPerBlock
+// clang-format on
+
+int main(int argc, char* argv[])
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
+
+    // CGEMM shape
+    ck::index_t M = 3840;
+    ck::index_t N = 4096;
+    ck::index_t K = 4096;
+
+    ck::index_t StrideA = 4096;
+    ck::index_t StrideB = 4096;
+    ck::index_t StrideC = 4096;
+
+    if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+    }
+    else if(argc == 10)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+
+        M = std::stoi(argv[4]);
+        N = std::stoi(argv[5]);
+        K = std::stoi(argv[6]);
+
+        StrideA = std::stoi(argv[7]);
+        StrideB = std::stoi(argv[8]);
+        StrideC = std::stoi(argv[9]);
+    }
+    else
+    {
+        std::cout << "arg1: verification (0=no, 1=yes)\n"
+                  << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
+                  << "arg3: run kernel # of times (>1)\n"
+                  << "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n"
+                  << std::endl;
+        exit(0);
+    }
+
+    return !run_cgemm_xdl<ADataType,
+                          BDataType,
+                          CDataType,
+                          ALayout,
+                          BLayout,
+                          CLayout,
+                          PassThrough,
+                          PassThrough,
+                          PassThrough,
+                          DeviceCGemmInstance,
+                          ReferenceCGemmInstance>(
+        M, N, K, StrideA, StrideB, StrideC, do_verification, init_method, time_kernel);
+}