Merge branch 'develop' into moe_gemm_activation

2026-06-29 11:16:59 +00:00 · 2025-04-09 15:15:00 +08:00
parent fbf91ada78 03ce8729fd
commit 4c2abb376a
11 changed files with 134 additions and 51 deletions
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -10,7 +10,7 @@ Composable Kernel User Guide

 The Composable Kernel library provides a programming model for writing performance critical kernels for machine learning workloads across multiple architectures including GPUs and CPUs, through general purpose kernel languages such as `HIP C++ <https://rocm.docs.amd.com/projects/HIP/en/latest/index.html>`_.

-The Composable Kernel repository is located at `https://github.com/ROCm/composable-kernel <https://github.com/ROCm/composable-kernel>`_.
+The Composable Kernel repository is located at `https://github.com/ROCm/composable_kernel <https://github.com/ROCm/composable_kernel>`_.

 .. grid:: 2
  :gutter: 3
--- a/docs/sphinx/requirements.in
+++ b/docs/sphinx/requirements.in
@@ -1,2 +1,2 @@
-rocm-docs-core==1.18.1
+rocm-docs-core==1.18.2
 sphinxcontrib-bibtex==2.6.3
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -199,7 +199,7 @@ requests==2.32.3
    # via
    #   pygithub
    #   sphinx
-rocm-docs-core==1.18.1
+rocm-docs-core==1.18.2
    # via -r requirements.in
 rpds-py==0.22.3
    # via
--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -357,6 +357,12 @@ struct PassThrough
        y = type_convert<half_t>(x);
    }

+    template <>
+    __host__ __device__ void operator()<float, int32_t>(float& y, const int32_t& x) const
+    {
+        y = type_convert<float>(x);
+    }
+
    template <>
    __host__ __device__ void operator()<bhalf_t, bhalf_t>(bhalf_t& y, const bhalf_t& x) const
    {
--- a/include/ck/utility/sequence.hpp
+++ b/include/ck/utility/sequence.hpp
@@ -184,6 +184,25 @@ struct Sequence
    }
 };

+namespace impl {
+// Adapter handed to the compiler builtin __make_integer_seq, which instantiates
+// the given template as <T, 0, 1, ..., N-1>. The index_t specialization below
+// re-exposes that generated pack as a ck Sequence through seq_type.
+template <typename T, T... Ints>
+struct integer_sequence_adapter;
+
+template <index_t... Ints>
+struct integer_sequence_adapter<index_t, Ints...>
+{
+    using seq_type = Sequence<Ints...>;
+};
+} // namespace impl
+
+// make_index_sequence<N> is Sequence<0, 1, ..., N-1>, produced by the builtin.
+template <index_t N>
+using make_index_sequence =
+    typename __make_integer_seq<impl::integer_sequence_adapter, index_t, N>::seq_type;
+
 // merge sequence
 template <typename Seq, typename... Seqs>
 struct sequence_merge
--- a/include/ck/utility/tuple_helper.hpp
+++ b/include/ck/utility/tuple_helper.hpp
@@ -11,11 +11,16 @@

 namespace ck {

+template <typename F, index_t... ids>
+__host__ __device__ constexpr auto generate_tuple_for(F&& f, Sequence<ids...>)
+{
+    return make_tuple(f(Number<ids>{})...);
+}
+
 template <typename F, index_t N>
 __host__ __device__ constexpr auto generate_tuple(F&& f, Number<N>)
 {
-    return unpack([&f](auto&&... xs) { return make_tuple(f(xs)...); },
-                  typename arithmetic_sequence_gen<0, N, 1>::type{});
+    return generate_tuple_for(f, make_index_sequence<N>{});
 }

 template <typename F, index_t N>
--- a/include/ck_tile/core/container/tuple.hpp
+++ b/include/ck_tile/core/container/tuple.hpp
@@ -396,11 +396,16 @@ struct tuple_array_impl<T, 1>
 };
 } // namespace impl

+template <typename F, index_t... ids>
+CK_TILE_HOST_DEVICE constexpr auto generate_tuple_for(F&& f, sequence<ids...>)
+{
+    return make_tuple(f(number<ids>{})...);
+}
+
 template <typename F, index_t N>
 CK_TILE_HOST_DEVICE constexpr auto generate_tuple(F&& f, number<N>)
 {
-    return unpack([&f](auto&&... is) { return make_tuple(f(is)...); },
-                  typename arithmetic_sequence_gen<0, N, 1>::type{});
+    return generate_tuple_for(f, make_index_sequence<N>{});
 }

 template <typename F, index_t N>
--- a/include/ck_tile/core/tensor/tensor_view.hpp
+++ b/include/ck_tile/core/tensor/tensor_view.hpp
@@ -411,18 +411,21 @@ struct null_tensor_view
 };

 template <address_space_enum BufferAddressSpace = address_space_enum::generic,
+          amd_buffer_coherence_enum Coherence   = amd_buffer_coherence_enum::coherence_default,
          typename DataType,
          typename... Ts>
 CK_TILE_HOST_DEVICE constexpr auto make_tensor_view(DataType* p,
                                                    const tensor_descriptor<Ts...>& desc)
 {
-    auto buffer_view = make_buffer_view<BufferAddressSpace>(p, desc.get_element_space_size());
+    auto buffer_view =
+        make_buffer_view<BufferAddressSpace, Coherence>(p, desc.get_element_space_size());

    return tensor_view<decltype(buffer_view), decltype(desc)>{buffer_view, desc};
 }

 template <address_space_enum BufferAddressSpace = address_space_enum::generic,
          memory_operation_enum DstInMemOp      = memory_operation_enum::set,
+          amd_buffer_coherence_enum Coherence   = amd_buffer_coherence_enum::coherence_default,
          typename DataType,
          typename... Lengths,
          typename... Strides,
@@ -441,12 +444,14 @@ make_naive_tensor_view(DataType* p,
                                             number<GuaranteedLastDimensionVectorLength>{},
                                             number<GuaranteedLastDimensionVectorStride>{});

-    auto buffer_view = make_buffer_view<BufferAddressSpace>(p, desc.get_element_space_size());
+    auto buffer_view =
+        make_buffer_view<BufferAddressSpace, Coherence>(p, desc.get_element_space_size());

    return tensor_view<decltype(buffer_view), decltype(desc), DstInMemOp>{buffer_view, desc};
 }

 template <address_space_enum BufferAddressSpace = address_space_enum::generic,
+          amd_buffer_coherence_enum Coherence   = amd_buffer_coherence_enum::coherence_default,
          typename DataType,
          typename... Lengths,
          index_t GuaranteedLastDimensionVectorLength = -1>
@@ -458,7 +463,8 @@ make_naive_tensor_view_packed(DataType* p,
    auto desc =
        make_naive_tensor_descriptor_packed(lengths, number<GuaranteedLastDimensionVectorLength>{});

-    auto buffer_view = make_buffer_view<BufferAddressSpace>(p, desc.get_element_space_size());
+    auto buffer_view =
+        make_buffer_view<BufferAddressSpace, Coherence>(p, desc.get_element_space_size());

    return tensor_view<decltype(buffer_view), decltype(desc)>{buffer_view, desc};
 }
--- a/include/ck_tile/core/tensor/transpose_tile.hpp
+++ b/include/ck_tile/core/tensor/transpose_tile.hpp
@@ -83,9 +83,6 @@ CK_TILE_DEVICE void transpose_tile2d_impl_in_thread(OutTensor& out_tensor,
    constexpr index_t num_vec_in  = vec_length_out;
    constexpr index_t num_vec_out = vec_length_in;

-    using InVec  = array<DataType, vec_length_in>;
-    using OutVec = array<DataType, vec_length_out>;
-
    // SFC
    constexpr auto scalars_per_access_arr = generate_array(
        [&](auto i) { return (i == y_dim_vec_in or i == y_dim_vec_out) ? y_lengths[i] : 1; },
@@ -101,51 +98,84 @@ CK_TILE_DEVICE void transpose_tile2d_impl_in_thread(OutTensor& out_tensor,

    static_assert(num_access > 0, "wrong! num_access should be larger than 0");

-    // in/out vectors to be transposed
-    thread_buffer<InVec, num_vec_in> in_vectors;
-    thread_buffer<OutVec, num_vec_out> out_vectors;
+    if constexpr(num_vec_in == 1 || num_vec_out == 1)
+    {
+        // loop over SFC
+        static_for<0, num_access, 1>{}([&](auto iAccess) {
+            // data index [y0, y1, ...] in the order of input tensor
+            constexpr auto idx_y = SFC_Y::get_index(iAccess);

-    // loop over SFC and do transpose
-    static_for<0, num_access, 1>{}([&](auto iAccess) {
-        // data index [y0, y1, ...] in the order of input tensor
-        constexpr auto idx_y_start = SFC_Y::get_index(iAccess);
+            constexpr index_t in_offset  = y_in_desc.calculate_offset(idx_y);
+            constexpr index_t out_offset = y_out_desc.calculate_offset(idx_y);

-        // get input vectors
-        static_for<0, num_vec_in, 1>{}([&](auto i) {
-            constexpr auto idx_y_in = generate_tuple(
-                [&](auto ii) {
-                    return ii == y_dim_vec_out ? idx_y_start[ii] + i : idx_y_start[ii];
-                },
-                number<NDimY>{});
-
-            constexpr index_t in_offset = y_in_desc.calculate_offset(idx_y_in);
-            static_assert(in_offset % vec_length_in == 0);
-
-            in_vectors(i).template get_as<InVec>()(I0) =
-                in_tensor.get_thread_buffer()
-                    .template get_as<InVec>()[number<in_offset / vec_length_in>{}];
+            if constexpr(vec_length_in == 1)
+            {
+                out_tensor.get_thread_buffer()[number<out_offset>{}] =
+                    in_tensor.get_thread_buffer()[number<in_offset>{}];
+            }
+            else
+            {
+                using Vec = array<DataType, vec_length_in>;
+                out_tensor.get_thread_buffer().template get_as<Vec>(
+                    number<out_offset / vec_length_in>{}) =
+                    in_tensor.get_thread_buffer().template get_as<Vec>(
+                        number<in_offset / vec_length_in>{});
+            }
        });
+    }
+    else
+    {
+        using InVec  = array<DataType, vec_length_in>;
+        using OutVec = array<DataType, vec_length_out>;

-        // transpose
-        transpose_vectors<DataType, num_vec_in, num_vec_out>{}(in_vectors, out_vectors);
+        // in/out vectors to be transposed
+        thread_buffer<InVec, num_vec_in> in_vectors;
+        thread_buffer<OutVec, num_vec_out> out_vectors;

-        // set output vectors
-        static_for<0, num_vec_out, 1>{}([&](auto i) {
-            constexpr auto idx_y_out_tmp = generate_array(
-                [&](auto ii) { return ii == y_dim_vec_in ? idx_y_start[ii] + i : idx_y_start[ii]; },
-                number<NDimY>{});
+        // loop over SFC and do transpose
+        static_for<0, num_access, 1>{}([&](auto iAccess) {
+            // data index [y0, y1, ...] in the order of input tensor
+            constexpr auto idx_y_start = SFC_Y::get_index(iAccess);

-            constexpr auto idx_y_out =
-                container_reorder_given_new2old(idx_y_out_tmp, y_dim_out_to_in);
+            // get input vectors
+            static_for<0, num_vec_in, 1>{}([&](auto i) {
+                constexpr auto idx_y_in = generate_tuple(
+                    [&](auto ii) {
+                        return ii == y_dim_vec_out ? idx_y_start[ii] + i : idx_y_start[ii];
+                    },
+                    number<NDimY>{});

-            constexpr index_t out_offset = y_out_desc.calculate_offset(idx_y_out);
-            static_assert(out_offset % vec_length_out == 0);
+                constexpr index_t in_offset = y_in_desc.calculate_offset(idx_y_in);
+                static_assert(in_offset % vec_length_in == 0);

-            out_tensor.get_thread_buffer().template set_as<OutVec>(
-                number<out_offset / vec_length_out>{},
-                out_vectors[i].template get_as<OutVec>()[I0]);
+                in_vectors(i).template get_as<InVec>()(I0) =
+                    in_tensor.get_thread_buffer()
+                        .template get_as<InVec>()[number<in_offset / vec_length_in>{}];
+            });
+
+            // transpose
+            transpose_vectors<DataType, num_vec_in, num_vec_out>{}(in_vectors, out_vectors);
+
+            // set output vectors
+            static_for<0, num_vec_out, 1>{}([&](auto i) {
+                constexpr auto idx_y_out_tmp = generate_array(
+                    [&](auto ii) {
+                        return ii == y_dim_vec_in ? idx_y_start[ii] + i : idx_y_start[ii];
+                    },
+                    number<NDimY>{});
+
+                constexpr auto idx_y_out =
+                    container_reorder_given_new2old(idx_y_out_tmp, y_dim_out_to_in);
+
+                constexpr index_t out_offset = y_out_desc.calculate_offset(idx_y_out);
+                static_assert(out_offset % vec_length_out == 0);
+
+                out_tensor.get_thread_buffer().template set_as<OutVec>(
+                    number<out_offset / vec_length_out>{},
+                    out_vectors[i].template get_as<OutVec>()[I0]);
+            });
        });
-    });
+    }
 }

 } // namespace detail
--- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_bf16/device_gemm_multiply_multiply_wp_xdl_f8_f8_bf16_mk_mfma_mn.hpp
@@ -100,7 +100,15 @@ using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_
        //##########################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    64,   512,  16,  16,  16,   16,    1,    1,     S<32, 8, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 16>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,   128,   512,  16,  16,  16,   16,    1,    2,     S<32, 8, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,   256,   512,  16,  16,  16,   16,    1,    4,     S<32, 8, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,   256,   512,  16,  16,  16,   16,    1,    4,     S<32, 8, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   128,     32,    16,   512,  16,  16,  16,   16,    1,    1,     S<32, 4, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 4>,     S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   128,     16,    32,   128,  16,  16,  16,   16,    1,    1,     S<8, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,     S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   128,     16,    32,   512,  16,  16,  16,   16,    1,    1,     S<32, 4, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,     S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     32,    64,   512,  16,  16,  16,   16,    1,    2,     S<32, 8, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     64,    64,   512,  16,  16,  32,   32,    1,    1,     S<32, 8, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     64,    16,   512,  16,  16,  16,   16,    1,    1,     S<32, 8, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 64, 1, 4>,     S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>
+
        // clang-format on
        >;

--- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_wp/f8_f8_f16/device_gemm_multiply_multiply_wp_xdl_f8_f8_f16_mk_mfma_mn.hpp
@@ -115,7 +115,13 @@ using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p
        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    64,   256,  16,  16,  16,   16,    1,    1,     S<16, 16, 1>,   S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 16>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,   128,   256,  16,  16,  16,   16,    1,    2,     S<16, 16, 1>,   S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,   256,   256,  16,  16,  16,   16,    1,    4,     S<16, 16, 1>,   S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,   512,   256,  16,  16,  16,   16,    1,    8,     S<16, 16, 1>,   S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,   512,   256,  16,  16,  16,   16,    1,    8,     S<16, 16, 1>,   S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   128,     16,    32,   512,  16,  16,  16,   16,    1,    1,     S<32, 4, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,     S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   128,     16,    32,   128,  16,  16,  16,   16,    1,    1,     S<8, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,     S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   128,     16,    32,   256,  16,  16,  16,   16,    1,    1,     S<16, 8, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,     S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    64,   128,   8,  16,  16,   16,    1,    1,     S<16, 16, 1>,   S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 16>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>
+
        // clang-format on
        >;