Allow distinct K0/K1 values for A/B block descriptor (#98)

* add gitignore * host tensor: allow generating sequentially increasing value in a given dimension * gridwise gemm v3r1: allow distinct K0/K1 values for A/B block descriptor - remove dangling header include - modify example gemm_xdl accordingly - infer KPack value from M/NPerXdl - device conv2d fwd: update parameters accordingly for the underlying gridwise gemm v3r1 (API for conv2d fwd stays the same for now until we decide to expose individual K0s for activation and weight) * add LDS data dump utility * profiler: reflect API change for distinct K0/K1 for A/B matrices * profiler: add conflict-free LDS write FP16 kernel instances * fix accidental perf regression * address feedback; cosmetic changes * clang-format for new files * format Co-authored-by: Chao Liu <chao.liu2@amd.com>
2026-05-11 08:50:17 +00:00 · 2022-02-28 11:06:18 +08:00
parent e221d11e51
commit 6d4450ef15
14 changed files with 431 additions and 268 deletions
--- a/device_operation/include/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
+++ b/device_operation/include/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
@@ -432,10 +432,11 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
        OutElementwiseOperation,
        MPerBlock,
        NPerBlock,
-        K0PerBlock,
+        K0PerBlock * K1,
+        K1, // AK1
+        K1, // BK1
        MPerXdl,
        NPerXdl,
-        K1,
        MXdlPerWave,
        NXdlPerWave,
        ABlockTransferThreadClusterLengths_K0_M_K1,
--- a/device_operation/include/device_gemm_xdl_c_shuffle.hpp
+++ b/device_operation/include/device_gemm_xdl_c_shuffle.hpp
@@ -29,8 +29,9 @@ template <
    ck::index_t BlockSize,
    ck::index_t MPerBlock,
    ck::index_t NPerBlock,
-    ck::index_t K0PerBlock,
-    ck::index_t K1,
+    ck::index_t KPerBlock,
+    ck::index_t AK1,
+    ck::index_t BK1,
    ck::index_t MPerXDL,
    ck::index_t NPerXDL,
    ck::index_t MXdlPerWave,
@@ -61,13 +62,11 @@ struct DeviceGemmXdl_C_Shuffle
    static constexpr auto I1 = Number<1>{};
    static constexpr auto I2 = Number<2>{};

-    static constexpr auto K1Number = Number<K1>{};
-
    static auto MakeAGridDescriptor_K0_M_K1(index_t M, index_t K, index_t StrideA)
    {
-        assert(K % K1 == 0);
+        assert(K % AK1 == 0);

-        const index_t K0 = K / K1;
+        const index_t K0 = K / AK1;

        const auto a_grid_desc_m_k = [&]() {
            if constexpr(is_same<tensor_layout::gemm::RowMajor, ALayout>::value)
@@ -80,21 +79,20 @@ struct DeviceGemmXdl_C_Shuffle
            }
        }();

-        const auto a_grid_desc_k0_m_k1 =
-            transform_tensor_descriptor(a_grid_desc_m_k,
-                                        make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
-                                                   make_pass_through_transform(M)),
-                                        make_tuple(Sequence<1>{}, Sequence<0>{}),
-                                        make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+        const auto a_grid_desc_k0_m_k1 = transform_tensor_descriptor(
+            a_grid_desc_m_k,
+            make_tuple(make_unmerge_transform(make_tuple(K0, AK1)), make_pass_through_transform(M)),
+            make_tuple(Sequence<1>{}, Sequence<0>{}),
+            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));

        return a_grid_desc_k0_m_k1;
    }

    static auto MakeBGridDescriptor_K0_N_K1(index_t K, index_t N, index_t StrideB)
    {
-        assert(K % K1 == 0);
+        assert(K % BK1 == 0);

-        const index_t K0 = K / K1;
+        const index_t K0 = K / BK1;

        const auto b_grid_desc_k_n = [&]() {
            if constexpr(is_same<tensor_layout::gemm::RowMajor, BLayout>::value)
@@ -107,12 +105,11 @@ struct DeviceGemmXdl_C_Shuffle
            }
        }();

-        const auto b_grid_desc_k0_n_k1 =
-            transform_tensor_descriptor(b_grid_desc_k_n,
-                                        make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)),
-                                                   make_pass_through_transform(N)),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                        make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+        const auto b_grid_desc_k0_n_k1 = transform_tensor_descriptor(
+            b_grid_desc_k_n,
+            make_tuple(make_unmerge_transform(make_tuple(K0, BK1)), make_pass_through_transform(N)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}),
+            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));

        return b_grid_desc_k0_n_k1;
    }
@@ -148,10 +145,11 @@ struct DeviceGemmXdl_C_Shuffle
        CElementwiseOperation,
        MPerBlock,
        NPerBlock,
-        K0PerBlock,
+        KPerBlock,
+        AK1,
+        BK1,
        MPerXDL,
        NPerXDL,
-        K1,
        MXdlPerWave,
        NXdlPerWave,
        ABlockTransferThreadClusterLengths_K0_M_K1,
@@ -461,7 +459,9 @@ struct DeviceGemmXdl_C_Shuffle
            << BlockSize << ", "
            << MPerBlock << ", "
            << NPerBlock << ", "
-            << K0PerBlock
+            << KPerBlock << ", "
+            << AK1 << ", "
+            << BK1
            << ">";
        // clang-format on