gemm/Conv xdlops + dlops quantization (#625)

* Add conv perlayer quantization * Add gemm_dlops quantization * Support int8 for innerproduct * Refine gemm dlops int8 kernel parameter * Support gfx908(MI100) and gfx90a(MI200) * clang-format * Rename example number * Support different layout for d tensor * Add conv dlops perchannel quantization example * Move to example 40 * Extract the common code for different platform (dlops and xdlops) * Move to subfolder. Prepare to add other op of quantization * Refine the quantization instance library * Add conv dl instances and client example * Remove unnecessary type * Add gemm quantization instance * Add external api and client example * Refine num_bytes * Separate different layout to different cpp * Add more xdl instances * Revert "Remove unnecessary type" This reverts commit 820869182f. * Remove CShuffleDataType in dlops. Let acc and CShuffleDataType be the same in xdlops --------- Co-authored-by: zjing14 <zhangjing14@gmail.com>
2026-05-04 05:31:24 +00:00 · 2023-03-16 04:29:40 +08:00
parent a2d5ca8e95
commit 16dc18e0f9
60 changed files with 3186 additions and 717 deletions
--- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
@@ -134,7 +134,8 @@ __global__ void
            const Block2CTileMap block_2_ctile_map,
            const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx1030__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx1030__) || \
+    defined(__gfx90a__) || defined(__gfx908__))
    // offset base pointer for each work-group
    const index_t num_blocks_per_batch =
        __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
@@ -314,9 +315,8 @@ struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
        const auto in_gemmm_gemmk_desc =
            matrix_padder.PadADescriptor_M_K(in_gemmmraw_gemmkraw_desc);

-        const auto M = in_gemmm_gemmk_desc.GetLength(I0);
-        const auto K = in_gemmm_gemmk_desc.GetLength(I1);
-
+        const auto M   = in_gemmm_gemmk_desc.GetLength(I0);
+        const auto K   = in_gemmm_gemmk_desc.GetLength(I1);
        const auto AK0 = K / K1;

        return transform_tensor_descriptor(
@@ -709,7 +709,8 @@ struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
        namespace ctc = tensor_layout::convolution;

        // check device
-        if(!(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx1030"))
+        if(!(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx1030" ||
+             ck::get_device_name() == "gfx90a" || ck::get_device_name() == "gfx908"))
        {
            return false;
        }
@@ -834,6 +835,7 @@ struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
        {
            return false;
        }
+
        // check Gridwise GEMM
        return GridwiseGemm::CheckValidity(
            arg.a_grid_desc_ak0_m_ak1_, arg.b_grid_desc_bk0_n_bk1_, arg.e_grid_desc_m_n_);
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp
@@ -51,7 +51,7 @@ __global__ void
            const Block2CTileMap block_2_ctile_map)
 {
 #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \
-    defined(__gfx1030__))
+    defined(__gfx90a__) || defined(__gfx1030__))

    constexpr index_t shared_block_size =
        GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(ABDataType);
@@ -552,7 +552,7 @@ struct DeviceGemmMultipleD_Dl : public DeviceGemmMultipleD<ALayout,
    static bool IsSupportedArgument(const Argument& arg)
    {
        if(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx908" ||
-           ck::get_device_name() == "gfx1030")
+           ck::get_device_name() == "gfx90a" || ck::get_device_name() == "gfx1030")
        {
            return GridwiseGemm::CheckValidity(
                arg.a_grid_desc_k0_m_k1_, arg.b_grid_desc_k0_n_k1_, arg.e_grid_desc_m_n_);