Fix v1; use M padding

2026-07-01 20:27:42 +00:00 · 2025-05-26 10:32:26 +00:00
parent 40af523e2c
commit 91eb136937
5 changed files with 72 additions and 37 deletions
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1_mx.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v1_mx.hpp
@@ -253,8 +253,8 @@ struct BlockwiseGemmXdlops_pipeline_v1_mx<BlockGemmPipelineScheduler::Intrawave,
            b_scale_thread_desc.GetElementSpaceSize());

        // Global prefetch 1
-        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
-        b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);
+        a_blockwise_copy.Run(a_grid_desc, a_grid_buf, a_block_desc, a_block_buf);
+        b_blockwise_copy.Run(b_grid_desc, b_grid_buf, b_block_desc, b_block_buf);

        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
        b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
@@ -303,8 +303,8 @@ struct BlockwiseGemmXdlops_pipeline_v1_mx<BlockGemmPipelineScheduler::Intrawave,
            make_multi_index(-NWaves * NRepeat / NXdlPack, KRepeat / KXdlPack, 0));

        // Local prefill 1
-        a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
-        b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);
+        __builtin_amdgcn_s_waitcnt(3952);
+        block_sync_lds();

        // Initialize C
        c_thread_buf.Clear();
@@ -317,13 +317,8 @@ struct BlockwiseGemmXdlops_pipeline_v1_mx<BlockGemmPipelineScheduler::Intrawave,
            do
            {
                // -------------------------------------------------------------------------------------------
-                a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
-                b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);

-                a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
-                b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
-
-                block_sync_lds();
+                // wait previous blockwise copy to finish

                // k indexes mapping to threads for 32x32x64:
                // t0 : |0  --> 15 32 --> 47 | 64 --> 79 96  --> 111 | etc.
@@ -387,6 +382,13 @@ struct BlockwiseGemmXdlops_pipeline_v1_mx<BlockGemmPipelineScheduler::Intrawave,
                    });
                });

+                // load for next k loop
+                block_sync_lds();
+                a_blockwise_copy.Run(a_grid_desc, a_grid_buf, a_block_desc, a_block_buf);
+                b_blockwise_copy.Run(b_grid_desc, b_grid_buf, b_block_desc, b_block_buf);
+                a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+                b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+
                static_for<0, MRepeat / MXdlPack, 1>{}([&](auto m0) {
                    static_for<0, NRepeat / NXdlPack, 1>{}([&](auto n0) {
                        static_for<0, KRepeat / KXdlPack, 1>{}([&](auto k0) {
@@ -507,9 +509,8 @@ struct BlockwiseGemmXdlops_pipeline_v1_mx<BlockGemmPipelineScheduler::Intrawave,
                    b_scale_grid_desc,
                    make_multi_index(-NWaves * NRepeat / NXdlPack, KRepeat / KXdlPack, 0));

+                __builtin_amdgcn_s_waitcnt(3952);
                block_sync_lds();
-                a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
-                b_blockwise_copy.RunWrite(b_block_desc, b_block_buf);

                i += 1;
            } while(i < (num_loop - 1));
@@ -518,7 +519,6 @@ struct BlockwiseGemmXdlops_pipeline_v1_mx<BlockGemmPipelineScheduler::Intrawave,
        // tail
        if constexpr(TailNum == TailNumber::Full)
        {
-            block_sync_lds();
            static_for<0, KRepeat, 1>{}([&](auto k) {
                constexpr auto k_step =
                    k * xdlops_gemm.KPerXdlops * KPack / xdlops_gemm.K1PerXdlops;
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp
@@ -327,13 +327,31 @@ struct DeviceGemmMX_Xdl_CShuffleV3 : public DeviceGemmMX<ALayout,
                       KBatch_cond_choice.value == (arg.KBatch > 1) &&
                       tail_num_choice.value == tail_num)
                    {
-                        const auto kernel = kernel_gemm_xdl_cshuffle_v3_2lds< //
-                            GridwiseGemm,
-                            mainloop_choice.value,
-                            CGlobalMemoryDataOperation,
-                            minimum_occupancy,
-                            tail_num_choice.value>;
-                        Run(kernel);
+                        if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1)
+                        {
+                            const auto kernel = kernel_gemm_xdl_cshuffle_v3< //
+                                GridwiseGemm,
+                                mainloop_choice.value,
+                                CGlobalMemoryDataOperation,
+                                minimum_occupancy,
+                                tail_num_choice.value>;
+                            Run(kernel);
+                        }
+                        else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
+                        {
+
+                            const auto kernel = kernel_gemm_xdl_cshuffle_v3_2lds< //
+                                GridwiseGemm,
+                                mainloop_choice.value,
+                                CGlobalMemoryDataOperation,
+                                minimum_occupancy,
+                                tail_num_choice.value>;
+                            Run(kernel);
+                        }
+                        else
+                        {
+                            static_assert(false, "Unexpected BlkGemmPipelineVer!");
+                        }
                    }
                });
            return ave_time;
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx.hpp
@@ -331,12 +331,28 @@ struct GridwiseGemmMX_xdl_cshuffle_v3
            // pad M, but not K
            const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor(
                a_grid_desc_mraw_kraw,
-                make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)),
+                make_tuple(make_unmerge_transform(make_tuple(K / KPerBlock, AK0Number, AK1Value)),
                           make_right_pad_transform(M, MPad - M)),
                make_tuple(Sequence<1>{}, Sequence<0>{}),
-                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+                make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{}));

-            return a_grid_desc_ak0_m_ak1;
+            const auto a_grid_desc_permuted = transform_tensor_descriptor(
+                a_grid_desc_ak0_m_ak1,
+                make_tuple(make_pass_through_transform(K / KPerBlock),
+                           make_xor_with_modulo_transform(make_tuple(MPad, AK0Number)),
+                           make_pass_through_transform(AK1Value)),
+                make_tuple(Sequence<0>{}, Sequence<2, 1>{}, Sequence<3>{}),
+                make_tuple(Sequence<0>{}, Sequence<2, 1>{}, Sequence<3>{}));
+
+            const auto a_grid_desc = transform_tensor_descriptor(
+                a_grid_desc_permuted,
+                make_tuple(
+                    make_merge_transform_v3_division_mod(make_tuple(K / KPerBlock, AK0Number)),
+                    make_pass_through_transform(MPad),
+                    make_pass_through_transform(AK1Value)),
+                make_tuple(Sequence<0, 1>{}, Sequence<2>{}, Sequence<3>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+            return a_grid_desc;
        }
        else if constexpr(GemmSpec == GemmSpecialization::KPadding ||
                          GemmSpec == GemmSpecialization::NKPadding)
@@ -408,8 +424,9 @@ struct GridwiseGemmMX_xdl_cshuffle_v3
                        GemmSpec != GemmSpecialization::Default),
                      "pk_i4_t does not support padding");
        static_assert(!(is_same_v<remove_cvref_t<ADataType>, f4x2_pk_t> &&
-                        GemmSpec != GemmSpecialization::Default),
-                      "f4x2_pk_t does not support padding");
+                        (GemmSpec != GemmSpecialization::Default &&
+                         GemmSpec != GemmSpecialization::MPadding)),
+                      "f4x2_pk_t does not support K padding");

        if constexpr(GemmSpec == GemmSpecialization::NKPadding ||
                     GemmSpec == GemmSpecialization::MNKPadding)
@@ -1357,6 +1374,10 @@ struct GridwiseGemmMX_xdl_cshuffle_v3
    static_assert(KXdlPack * NXdlPack % scale_pack_size_b == 0,
                  "B scale pack data type too large!");

+    static_assert(is_same_v<AElementwiseOperation, tensor_operation::element_wise::PassThrough> &&
+                      is_same_v<BElementwiseOperation, tensor_operation::element_wise::PassThrough>,
+                  "A/B ElementwiseOperation should be PassThrough as load_to_lds is used!");
+
    template <typename AGridDesc_AK0_M_K1,
              typename AScaleGridDesc_AM_AK,
              typename BGridDesc_BK0_N_K1,
@@ -1394,8 +1415,6 @@ struct GridwiseGemmMX_xdl_cshuffle_v3
        const auto b_scale_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_b_scale_grid, b_scale_grid_desc_bn_ak.GetElementSpaceSize());

-        const AElementwiseOperation a_element_op{};
-        const BElementwiseOperation b_element_op{};
        const CElementwiseOperation c_element_op{};

        // divide block work by [M, N]
@@ -1895,9 +1914,6 @@ struct GridwiseGemmMX_xdl_cshuffle_v3
        const auto b_scale_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_b_scale_grid, b_scale_grid_desc_bn_ak.GetElementSpaceSize());

-        static_assert(
-            is_same_v<AElementwiseOperation, tensor_operation::element_wise::PassThrough> &&
-            is_same_v<BElementwiseOperation, tensor_operation::element_wise::PassThrough>);
        const CElementwiseOperation c_element_op{};

        // divide block work by [M, N]
--- a/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f4_f4_f16/device_gemm_mx_xdl_f4_f4_f16_mk_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f4_f4_f16/device_gemm_mx_xdl_f4_f4_f16_mk_nk_mn.hpp
@@ -29,6 +29,7 @@ using PassThrough = element_wise::PassThrough;

 static constexpr auto GemmDefault    = GemmSpecialization::Default;
 static constexpr auto GemmKPadding   = GemmSpecialization::KPadding;
+static constexpr auto GemmMPadding   = GemmSpecialization::MPadding;
 static constexpr auto GemmMNPadding  = GemmSpecialization::MNPadding;
 static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;

@@ -44,12 +45,12 @@ using device_gemm_mx_xdl_f4_f4_f16_mk_nk_mn_instances = std::tuple<
    //#############################|        |        |        | Type|   Data| Type|   Data|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|           Size|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
    //#############################|        |        |        |     |   Type|     |   Type|      |        |         |   Operation|   Operation|   Operation|              |               |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
    //#############################|        |        |        |     |       |     |       |      |        |         |            |            |            |              |               |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
-      // DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,   F4, E8M0PK,   F4, E8M0PK,  F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,   256,    32,   128,   128,  16,  16,  16,   16,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,     true,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,             16,             16,     true,           2,           2,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v1>,
-      // DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,   F4, E8M0PK,   F4, E8M0PK,  F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,   256,    32,   256,   128,  16,  16,  16,   16,    2,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,     true,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,             16,             16,     true,           2,           2,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v1>,
-      // DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,   F4, E8M0PK,   F4, E8M0PK,  F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,   256,    64,   128,   128,  16,  16,  16,   16,    4,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,     true,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,             16,             16,     true,           2,           2,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v1>,
-      // DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,   F4, E8M0PK,   F4, E8M0PK,  F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,   256,    64,   256,   128,  16,  16,  16,   16,    4,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,     true,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,             16,             16,     true,           2,           2,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v1>,
-      // DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,   F4, E8M0PK,   F4, E8M0PK,  F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,   256,    96,   128,   128,  16,  16,  16,   16,    6,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,     true,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,             16,             16,     true,           2,           2,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v1>,
-      // DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,   F4, E8M0PK,   F4, E8M0PK,  F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,   256,    96,   256,   128,  16,  16,  16,   16,    6,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,     true,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,             16,             16,     true,           2,           2,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v1>,
+      DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,   F4, E8M0PK,   F4, E8M0PK,  F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,   256,    32,   128,   128,  16,  16,  16,   16,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,     true,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,             16,             16,     true,           2,           2,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v1>,
+      DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,   F4, E8M0PK,   F4, E8M0PK,  F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,   256,    32,   256,   128,  16,  16,  16,   16,    2,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,     true,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,             16,             16,     true,           2,           2,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v1>,
+      DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,   F4, E8M0PK,   F4, E8M0PK,  F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,   256,    64,   128,   128,  16,  16,  16,   16,    4,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,     true,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,             16,             16,     true,           2,           2,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v1>,
+      DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,   F4, E8M0PK,   F4, E8M0PK,  F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,   256,    64,   256,   128,  16,  16,  16,   16,    4,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,     true,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,             16,             16,     true,           2,           2,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v1>,
+      DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,   F4, E8M0PK,   F4, E8M0PK,  F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,   256,    96,   128,   128,  16,  16,  16,   16,    6,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,     true,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,             16,             16,     true,           2,           2,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v1>,
+      DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,   F4, E8M0PK,   F4, E8M0PK,  F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,   256,    96,   256,   128,  16,  16,  16,   16,    6,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,     true,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,             16,             16,     true,           2,           2,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v1>,

      DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,   F4, E8M0PK,   F4, E8M0PK,  F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,   256,   256,   256,   128,  16,  16,  16,   16,    8,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,     true,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,             16,             16,     true,           2,           2,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v3>,
      DeviceGemmMX_Xdl_CShuffleV3<       Row,     Col,     Row,   F4, E8M0PK,   F4, E8M0PK,  F16,     F32,      F16, PassThrough, PassThrough, PassThrough,      GemmSpec, ScaleBlockSize,   256,   128,   256,   128,  16,  16,  16,   16,    4,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,     true,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,              2,             16,             16,     true,           2,           2,                   S<1, 32, 1, 8>,               8,  BlkGemmPipeSched,  BlockGemmPipelineVersion::v3>,
--- a/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f4_f4_f16/device_gemm_mx_xdl_f4_f4_f16_mk_nk_mn_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_mx/device_gemm_mx_xdl_f4_f4_f16/device_gemm_mx_xdl_f4_f4_f16_mk_nk_mn_default_instance.cpp
@@ -23,7 +23,7 @@ void add_device_gemm_mx_xdl_f4_f4_f16_mk_nk_mn_default_instances(
                                             PassThrough>>>& instances)
 {
    add_device_operation_instances(
-        instances, device_gemm_mx_xdl_f4_f4_f16_mk_nk_mn_instances<Intrawave, GemmDefault>{});
+        instances, device_gemm_mx_xdl_f4_f4_f16_mk_nk_mn_instances<Intrawave, GemmMPadding>{});
 }

 } // namespace instance