From 0d0dc5d6461e1d1df358aad05174bde1c146e795 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Thu, 27 Jul 2023 20:29:44 +0200 Subject: [PATCH] Add s_nops after v_dot to avoid hazard (#808) * Add s_nops after v_dot to avoid hazard * Fix builtin for inner_produxt fp16 * Skip inline version to builtin * Add comments regarding isa * Fix comment regarding s_nop [ROCm/composable_kernel commit: 7761e5232c1d1451d17df14f63dfb48285e60c0d] --- include/ck/ck.hpp | 8 ++++++-- .../impl/device_batched_gemm_multiple_d_dl.hpp | 15 +++++++-------- include/ck/utility/inner_product.hpp | 18 +++++++++++++----- script/profile_batched_gemm.sh | 7 ------- test/batched_gemm_multi_d/CMakeLists.txt | 6 ++---- 5 files changed, 28 insertions(+), 26 deletions(-) diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp index 6a1ac09021..aa40955b39 100644 --- a/include/ck/ck.hpp +++ b/include/ck/ck.hpp @@ -118,8 +118,12 @@ // inline asm #define CK_USE_AMD_INLINE_ASM 1 -// inner product (DLOP) -#define CK_USE_AMD_INNER_PRODUCT_INLINE_ASM 1 +// inner product (V_MAC/V_FMAC) +#define CK_USE_AMD_V_MAC_INLINE_ASM 1 + +// V_DOT inline instructions, less efficient since they require adding +// `s_nop`s to avoid hazard +#define CK_USE_AMD_V_DOT_INLINE_ASM 0 // block synchronization only s_wait lgkmcnt(0), not vmcnt(0) #define CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1 diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp index 0df365ceb2..b51c600476 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp @@ -70,10 +70,9 @@ __global__ void const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch, const Block2CTileMap block_2_ctile_map) { -// TODO: Enable for gfx90a after complier fix -#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \ - defined(__gfx940__) || defined(__gfx1030__) || defined(__gfx1100__) || defined(__gfx1101__) || \ - defined(__gfx1102__)) +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \ + defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx1030__) || defined(__gfx1100__) || \ + defined(__gfx1101__) || defined(__gfx1102__)) const index_t num_blocks_per_batch = __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); @@ -650,10 +649,10 @@ struct DeviceBatchedGemmMultipleD_Dl : public DeviceBatchedGemmMultiD __device__ void inner_product(const float& a, const float& b, float& c) { -#if CK_USE_AMD_INNER_PRODUCT_INLINE_ASM && defined(CK_USE_AMD_V_MAC_F32) +#if CK_USE_AMD_V_MAC_INLINE_ASM && defined(CK_USE_AMD_V_MAC_F32) asm volatile("\n \ v_mac_f32 %0, %1, %2 \n \ " : "=v"(c) : "v"(a), "v"(b), "0"(c)); -#elif CK_USE_AMD_INNER_PRODUCT_INLINE_ASM && defined(CK_USE_AMD_V_FMAC_F32) +#elif CK_USE_AMD_V_MAC_INLINE_ASM && defined(CK_USE_AMD_V_FMAC_F32) asm volatile("\n \ v_fmac_f32 %0, %1, %2 \n \ " @@ -76,14 +76,18 @@ template <> __device__ void inner_product(const half2_t& a, const half2_t& b, float& c) { #if defined(CK_USE_AMD_V_DOT2_F32_F16) -#if CK_USE_AMD_INNER_PRODUCT_INLINE_ASM +#if CK_USE_AMD_V_DOT_INLINE_ASM + // Use 3 x s_nop to avoid hazard (mi200 cdna2 isa page 47 + // https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf + // ) s_nop with parameter 2 is equal to 3 x s_nop asm volatile("\n \ v_dot2_f32_f16 %0, %1, %2, %0\n \ + s_nop 2 \n \ " : "=v"(c) : "v"(a), "v"(b), "0"(c)); #else - c = __builtin_amdgcn_sdot2(a, b, c, false); + c = __builtin_amdgcn_fdot2(a, b, c, false); #endif #else const vector_type a_vector{a}; @@ -163,9 +167,13 @@ __device__ void inner_product(const int8x4_t& a, const int8x4_t& b, int32_t& c) { #if defined(CK_USE_AMD_V_DOT4_I32_I8) -#if CK_USE_AMD_INNER_PRODUCT_INLINE_ASM +#if CK_USE_AMD_V_DOT_INLINE_ASM + // Use 3 x s_nop to avoid hazard (mi200 cdna2 isa page 47 + // https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf + // ) s_nop with parameter 2 is equal to 3 x s_nop asm volatile("\n \ v_dot4_i32_i8 %0, %1, %2, %0\n \ + s_nop 2 \n \ " : "=v"(c) : "v"(bit_cast(a)), "v"(bit_cast(b)), "0"(c)); diff --git a/script/profile_batched_gemm.sh b/script/profile_batched_gemm.sh index d19ddd0c65..f90baaed68 100755 --- a/script/profile_batched_gemm.sh +++ b/script/profile_batched_gemm.sh @@ -3,13 +3,6 @@ ## GPU visibility export HIP_VISIBLE_DEVICES=0 DRIVER="../build/bin/ckProfiler" -OP=$1 -DATATYPE=$2 -LAYOUT=$3 -VERIFY=$4 -INIT=$5 -LOG=$6 -TIME=$7 OP=$1 DATATYPE=$2 diff --git a/test/batched_gemm_multi_d/CMakeLists.txt b/test/batched_gemm_multi_d/CMakeLists.txt index f8ceb545c2..102a5b8f27 100644 --- a/test/batched_gemm_multi_d/CMakeLists.txt +++ b/test/batched_gemm_multi_d/CMakeLists.txt @@ -1,7 +1,5 @@ # TODO: Enable for gfx90a after complier fix if(DL_KERNELS) - if(NOT GPU_TARGETS MATCHES "gfx90a") - add_gtest_executable(test_batched_gemm_multi_d test_batched_gemm_multi_d.cpp) - target_link_libraries(test_batched_gemm_multi_d PRIVATE utility device_batched_gemm_multi_d_instance) - endif() + add_gtest_executable(test_batched_gemm_multi_d test_batched_gemm_multi_d.cpp) + target_link_libraries(test_batched_gemm_multi_d PRIVATE utility device_batched_gemm_multi_d_instance) endif()