mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-20 14:59:17 +00:00
[CK_TILE] FMHA avoid unnecessary vmcnt0 (#2715)
* FMHA avoid unnecessary vmcnt0 Squashed commit of the following: commit7bdf6a7eefAuthor: aska-0096 <haocwang@amd.com> Date: Fri Aug 22 03:15:51 2025 +0000 merge develop and solve conflicts commitf21e916a8cMerge:a7dd2a7d10db21053eAuthor: aska-0096 <haocwang@amd.com> Date: Fri Aug 22 03:15:21 2025 +0000 Merge branch 'develop' of https://github.com/ROCm/composable_kernel into vmcnt0issue commita7dd2a7d13Author: Ding, Yi <yi.ding@amd.com> Date: Tue Aug 19 02:17:43 2025 +0000 update bwd commit380aa8f311Author: Kevin Choi <kevin.choi@amd.com> Date: Mon Aug 18 19:36:38 2025 +0000 add restrict to applicable functions commitb85daba2a3Author: Ding, Yi <yi.ding@amd.com> Date: Mon Aug 18 02:07:03 2025 +0000 bwd filter commit75c4b9372fAuthor: Kevin Choi <kevin.choi@amd.com> Date: Sat Aug 16 08:15:23 2025 +0000 remove noinline attr as it causes a lot more s_waitcnt's commit598e3fec41Author: Kevin Choi <kevin.choi@amd.com> Date: Thu Aug 14 12:11:17 2025 +0000 remove innerloop, move restrict parameters to mainloop and add noinline attribute. commit3340408537Author: Kevin Choi <kevin.choi@amd.com> Date: Thu Aug 14 07:06:51 2025 +0000 Create inner lambda with restrict parameters, add restrict to some parameters commit3bc45ecbc7Author: aska-0096 <haocwang@amd.com> Date: Thu Aug 14 03:43:54 2025 +0000 save for debug commitde4db6c4c5Merge:108abf00e68694cb78Author: aska-0096 <haocwang@amd.com> Date: Wed Aug 13 02:15:22 2025 +0000 Merge branch 'wip-async-tr-fa' of https://github.com/ROCm/composable_kernel into wip-async-tr-fa commit108abf00e0Merge:0810799e20f42a92fcAuthor: aska-0096 <haocwang@amd.com> Date: Wed Aug 13 02:14:26 2025 +0000 Merge branch 'develop' of https://github.com/ROCm/composable_kernel into wip-async-tr-fa commit68694cb781Merge:0810799e220288caa2Author: asleepzzz <hanwen.chang@amd.com> Date: Wed Aug 13 00:34:11 2025 +0800 Merge branch 'develop' into wip-async-tr-fa commit0810799e25Author: aska-0096 <haocwang@amd.com> Date: Tue Aug 12 14:25:50 2025 +0000 refactor blockgemm change, isolate to v2; commitfd1eb323afAuthor: aska-0096 <haocwang@amd.com> Date: Tue Aug 12 09:26:13 2025 +0000 clang format commit75f6f6bac4Merge:bcc05eee68e1eb0c1eAuthor: aska-0096 <haocwang@amd.com> Date: Tue Aug 12 09:04:41 2025 +0000 Merge branch 'develop' of https://github.com/ROCm/composable_kernel into wip-async-tr-fa commitbcc05eee62Author: aska-0096 <haocwang@amd.com> Date: Tue Aug 12 08:46:06 2025 +0000 Fix the bug commit96d24497f5Author: aska-0096 <haocwang@amd.com> Date: Tue Aug 12 04:02:41 2025 +0000 fix conflict. disable all v-col instance for fmha fwd commit1716171be4Merge:1c98007904fde1646eAuthor: aska-0096 <haocwang@amd.com> Date: Tue Aug 12 03:52:34 2025 +0000 Merge branch 'develop' of https://github.com/ROCm/composable_kernel into wip-async-tr-fa commit1c98007901Author: aska-0096 <haocwang@amd.com> Date: Tue Aug 12 01:53:31 2025 +0000 clang format commitf43e903b1dMerge:3868ddd70a7badc6ecAuthor: aska-0096 <haocwang@amd.com> Date: Tue Aug 12 01:52:52 2025 +0000 Merge branch 'develop' of https://github.com/ROCm/composable_kernel into wip-async-tr-fa commit3868ddd708Merge:498d234ab191c62967Author: aska-0096 <haocwang@amd.com> Date: Mon Aug 11 15:59:40 2025 +0000 Merge branch 'develop' of https://github.com/ROCm/composable_kernel into wip-async-tr-fa commit498d234ab8Author: aska-0096 <haocwang@amd.com> Date: Mon Aug 11 15:37:37 2025 +0000 change the warp setting for hdim32 fmha fwd commitb86f7786e2Author: aska-0096 <haocwang@amd.com> Date: Mon Aug 11 14:21:09 2025 +0000 tempsave, update the blocksync functions commit7b8052d7caAuthor: aska-0096 <haocwang@amd.com> Date: Sun Aug 10 06:00:51 2025 +0000 fix bug in pki4 commit76cbbb84a2Author: aska-0096 <haocwang@amd.com> Date: Sat Aug 9 03:25:12 2025 +0000 fix bugs in gemm commit8c101ccb88Author: aska-0096 <haocwang@amd.com> Date: Fri Aug 8 18:35:53 2025 +0000 fix bug on non-gfx950 commitefb8549279Author: aska-0096 <haocwang@amd.com> Date: Fri Aug 8 17:53:19 2025 +0000 fix bug commit729e8785fbAuthor: aska-0096 <haocwang@amd.com> Date: Fri Aug 8 15:42:15 2025 +0000 fix bugs commit250dc13c75Author: aska-0096 <haocwang@amd.com> Date: Fri Aug 8 09:31:01 2025 +0000 fix clangformat with 18.1.3 commit106edeecd9Author: aska-0096 <haocwang@amd.com> Date: Fri Aug 8 09:07:40 2025 +0000 remove non-necessary change commit78edd7303bAuthor: aska-0096 <haocwang@amd.com> Date: Fri Aug 8 09:04:02 2025 +0000 bug fix, clang format; commit3b9fb6af38Author: aska-0096 <haocwang@amd.com> Date: Fri Aug 8 08:08:03 2025 +0000 Remove unnecessary changes commit6bb57c2c57Merge:1ecee378dab2602683Author: aska-0096 <haocwang@amd.com> Date: Fri Aug 8 07:50:12 2025 +0000 Merge branch 'develop' of https://github.com/ROCm/composable_kernel into wip-async-tr-fa commit1ecee378d5Author: aska-0096 <haocwang@amd.com> Date: Fri Aug 8 06:19:31 2025 +0000 remove unnecessary files; rename some files commitb4640a9de6Author: aska-0096 <haocwang@amd.com> Date: Fri Aug 8 05:46:18 2025 +0000 merge fa_decode pipeline into fmha_fwd api commitfe63a646a4Author: aska-0096 <haocwang@amd.com> Date: Wed Aug 6 05:58:43 2025 +0000 add __restrict__ to tr load commit414cad667bAuthor: aska-0096 <haocwang@amd.com> Date: Tue Aug 5 07:23:51 2025 +0000 Add XOR fold strategy for hdim<128, but perf dropped; disable it by default; wait further perf debug commit0d12fc944fAuthor: aska-0096 <haocwang@amd.com> Date: Mon Aug 4 10:27:42 2025 +0000 Add v_permlaneb32 for block_reduce. Disable it as it will cause un-coexecutable packed math in FA commit4f31847de1Author: aska-0096 <haocwang@amd.com> Date: Mon Aug 4 10:02:17 2025 +0000 add vmcnt guard before load ktile commit746f4ccb99Author: aska-0096 <haocwang@amd.com> Date: Mon Aug 4 06:49:01 2025 +0000 Load Q through lds, implement xor; commit2d4e73d2b4Author: aska-0096 <haocwang@amd.com> Date: Fri Aug 1 10:44:54 2025 +0000 small refactor commita28b6e67feAuthor: aska-0096 <haocwang@amd.com> Date: Thu Jul 31 10:25:37 2025 +0000 upgrade prefill pipeline; simple iglp; consistent data produce and consume order commit75cba48682Author: aska-0096 <haocwang@amd.com> Date: Thu Jul 31 05:13:27 2025 +0000 enable larger tile size; upgrade xor pattern commit69890afc98Author: aska-0096 <haocwang@amd.com> Date: Wed Jul 30 12:25:33 2025 +0000 remove all lds bankconflict with xor layouts commit8dacc35c4cAuthor: aska-0096 <haocwang@amd.com> Date: Wed Jul 30 03:51:06 2025 +0000 enable prefill overload operator(). commit13bcc913deAuthor: aska-0096 <haocwang@amd.com> Date: Fri Jul 25 07:10:01 2025 +0000 fix the lds alignment caused performance regression commitaf28123cecAuthor: aska-0096 <haocwang@amd.com> Date: Wed Jul 23 09:05:57 2025 +0000 remove unnecessary features commit14e0ab70c6Author: aska-0096 <haocwang@amd.com> Date: Tue Jul 22 08:04:05 2025 +0000 tempsave. asynccopy+trload sanity checked commit1b468bac0bAuthor: aska-0096 <haocwang@amd.com> Date: Mon Jul 21 05:55:55 2025 +0000 tempsave, trload+asyncload done commitafd96d8180Author: aska-0096 <haocwang@amd.com> Date: Fri Jul 18 10:04:34 2025 +0000 compile pass commit5616551115Merge:ae39c84f5095393276Author: aska-0096 <haocwang@amd.com> Date: Fri Jul 18 05:17:27 2025 +0000 Merge branch 'develop' of https://github.com/ROCm/composable_kernel into wip-async-tr-fa commitae39c84f55Author: aska-0096 <haocwang@amd.com> Date: Fri Jul 18 05:16:39 2025 +0000 tempsave commit94b6430489Author: aska-0096 <haocwang@amd.com> Date: Thu Jul 17 10:06:09 2025 +0000 temp save commit7e330553dcMerge:18669925c804f77dceAuthor: aska-0096 <haocwang@amd.com> Date: Thu Jul 17 07:24:32 2025 +0000 Merge branch 'test_copy_fix' of https://github.com/ROCm/composable_kernel into fa_decode_pipeline commit804f77dce5Author: aska-0096 <haocwang@amd.com> Date: Thu Jul 17 03:10:46 2025 +0000 move test_copy into test commit21627d7ca7Author: aska-0096 <haocwang@amd.com> Date: Thu Jul 17 02:41:31 2025 +0000 remove unnecessary output commit287792c44aMerge:a4221db3021fd7e953Author: aska-0096 <haocwang@amd.com> Date: Thu Jul 17 02:26:13 2025 +0000 Merge branch 'test_copy_fix' of https://github.com/ROCm/composable_kernel into test_copy_fix commita4221db304Author: aska-0096 <haocwang@amd.com> Date: Thu Jul 17 02:26:10 2025 +0000 add input validation and bug fix commit21fd7e9538Merge:d6df7bf856e76b8205Author: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Date: Wed Jul 16 11:23:57 2025 -0700 Merge branch 'develop' into test_copy_fix commitd6df7bf851Author: aska-0096 <haocwang@amd.com> Date: Wed Jul 16 08:55:50 2025 +0000 fix vmcnt shift commit40e039e4e4Author: aska-0096 <haocwang@amd.com> Date: Wed Jul 16 08:37:07 2025 +0000 Improve s_waitcnt_imm calculation commitc30f8b709bAuthor: aska-0096 <haocwang@amd.com> Date: Wed Jul 16 05:39:50 2025 +0000 fix the s_waitcnt_imm calculation commitec0a45b29fMerge:e5cc4af806b09f0823Author: aska-0096 <haocwang@amd.com> Date: Wed Jul 16 03:57:57 2025 +0000 Merge branch 'develop' of https://github.com/ROCm/composable_kernel into test_copy_fix commite5cc4af808Author: aska-0096 <haocwang@amd.com> Date: Wed Jul 16 03:54:33 2025 +0000 Add block_sync_lds_direct_load utility commiteea58629cfAuthor: aska-0096 <haocwang@amd.com> Date: Tue Jul 15 09:39:03 2025 +0000 fix async copytest bug commit18669925ccAuthor: aska-0096 <haocwang@amd.com> Date: Thu Jul 10 04:29:33 2025 +0000 temp save, change all instance to 1wave commit18686cfe5bAuthor: aska-0096 <haocwang@amd.com> Date: Tue Jul 8 08:37:20 2025 +0000 tempsave, fmha_decode commit47565f21a5Author: aska-0096 <haocwang@amd.com> Date: Sat Jun 21 15:02:57 2025 +0000 temp save, waiting for debug commite0a634ef97Author: aska-0096 <haocwang@amd.com> Date: Thu Jun 19 05:11:52 2025 +0000 save an example for __bf16 type commit4bd5fd4a3cAuthor: aska-0096 <haocwang@amd.com> Date: Wed Jun 18 07:27:24 2025 +0000 fix bwd code commit69809d9513Author: aska-0096 <haocwang@amd.com> Date: Wed Jun 18 06:37:16 2025 +0000 Fix for fwd/bwd kernel build filter commit d5ec3d0e5768aafed7f77151b2a835e87b9f95ba Author: Ding, Yi <yi.ding@amd.com> Date: Tue Aug 19 08:13:18 2025 +0000 Add restrict to avoid unnecessary vmcnt --------- Co-authored-by: aska-0096 <haocwang@amd.com> * Add comments for c-stype cast * Better comments --------- Co-authored-by: aska-0096 <haocwang@amd.com>
This commit is contained in:
@@ -1833,14 +1833,17 @@ CK_TILE_DEVICE void amd_async_buffer_load(CK_TILE_LDS_ADDR T* smem,
|
||||
if constexpr(oob_conditional_check)
|
||||
v_offset = flag ? v_offset : src_wave_buffer_resource[2];
|
||||
|
||||
llvm_amdgcn_raw_buffer_load_lds(
|
||||
src_wave_buffer_resource,
|
||||
reinterpret_cast<as3_uint32_ptr>(reinterpret_cast<uintptr_t>(smem)),
|
||||
bytes,
|
||||
v_offset,
|
||||
src_wave_addr_offset,
|
||||
/*src_immediate_addr_offset*/ 0,
|
||||
static_cast<index_t>(coherence));
|
||||
#pragma clang diagnostic push
|
||||
#pragma clang diagnostic ignored "-Wold-style-cast"
|
||||
// Use C-style cast to change address space without dropping llvm noalias attribute
|
||||
llvm_amdgcn_raw_buffer_load_lds(src_wave_buffer_resource,
|
||||
(as3_uint32_ptr)(smem),
|
||||
bytes,
|
||||
v_offset,
|
||||
src_wave_addr_offset,
|
||||
/*src_immediate_addr_offset*/ 0,
|
||||
static_cast<index_t>(coherence));
|
||||
#pragma clang diagnostic pop
|
||||
}
|
||||
|
||||
template <index_t N,
|
||||
@@ -2788,23 +2791,26 @@ CK_TILE_DEVICE void amd_buffer_atomic_max(const thread_buffer<T, N>& src_thread_
|
||||
template <typename T, index_t N, address_space_enum BufferAddressSpace>
|
||||
__device__ auto amd_transpose_load_to_vgpr(const T* __restrict__ in_ptr)
|
||||
{
|
||||
#define __LDS_ADDR __attribute__((address_space(3)))
|
||||
|
||||
static_assert(__has_builtin(__builtin_amdgcn_raw_buffer_load_b32),
|
||||
"We need to have the compatible compiler version to build this instruction");
|
||||
|
||||
#pragma clang diagnostic push
|
||||
#pragma clang diagnostic ignored "-Wold-style-cast"
|
||||
// Use C-style cast to change address space without dropping llvm noalias attribute
|
||||
const auto in_ptr_ = (__LDS_ADDR T*)(const_cast<T*>(in_ptr));
|
||||
#pragma clang diagnostic pop
|
||||
if constexpr(std::is_same_v<remove_cvref_t<T>, ck_tile::half_t>)
|
||||
{
|
||||
typedef __attribute__((__vector_size__(4 * sizeof(__fp16)))) __fp16 llvm_fp16x4_t;
|
||||
__attribute__((address_space(3))) llvm_fp16x4_t* lds_ptr =
|
||||
reinterpret_cast<__attribute__((address_space(3))) llvm_fp16x4_t*>(
|
||||
reinterpret_cast<uintptr_t>(in_ptr));
|
||||
auto lds_ptr = reinterpret_cast<__LDS_ADDR llvm_fp16x4_t*>(in_ptr_);
|
||||
return bit_cast<thread_buffer<T, N>>(__builtin_amdgcn_ds_read_tr16_b64_v4f16(lds_ptr));
|
||||
}
|
||||
else if constexpr(std::is_same_v<remove_cvref_t<T>, ck_tile::bf16_t>)
|
||||
{
|
||||
typedef __attribute__((__vector_size__(4 * sizeof(__bf16)))) __bf16 llvm_bf16x4_t;
|
||||
__attribute__((address_space(3))) llvm_bf16x4_t* lds_ptr =
|
||||
reinterpret_cast<__attribute__((address_space(3))) llvm_bf16x4_t*>(
|
||||
reinterpret_cast<uintptr_t>(in_ptr));
|
||||
auto lds_ptr = reinterpret_cast<__LDS_ADDR llvm_bf16x4_t*>(in_ptr_);
|
||||
return bit_cast<thread_buffer<T, N>>(__builtin_amdgcn_ds_read_tr16_b64_v4bf16(lds_ptr));
|
||||
}
|
||||
else if constexpr(std::is_same_v<remove_cvref_t<T>, ck_tile::fp8_t> ||
|
||||
@@ -2812,15 +2818,14 @@ __device__ auto amd_transpose_load_to_vgpr(const T* __restrict__ in_ptr)
|
||||
std::is_same_v<remove_cvref_t<T>, ck_tile::int8_t>)
|
||||
{
|
||||
typedef __attribute__((__vector_size__(2 * sizeof(index_t)))) index_t llvm_i32x2_t;
|
||||
__attribute__((address_space(3))) llvm_i32x2_t* lds_ptr =
|
||||
reinterpret_cast<__attribute__((address_space(3))) llvm_i32x2_t*>(
|
||||
reinterpret_cast<uintptr_t>(in_ptr));
|
||||
auto lds_ptr = reinterpret_cast<__LDS_ADDR llvm_i32x2_t*>(in_ptr_);
|
||||
return bit_cast<thread_buffer<T, N>>(__builtin_amdgcn_ds_read_tr8_b64_v2i32(lds_ptr));
|
||||
}
|
||||
else
|
||||
{
|
||||
static_assert(false, "not implemented");
|
||||
}
|
||||
#undef __LDS_ADDR
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
@@ -1603,14 +1603,17 @@ CK_TILE_DEVICE void amd_async_buffer_load(CK_TILE_LDS_ADDR T* smem,
|
||||
if constexpr(oob_conditional_check)
|
||||
v_offset = flag ? v_offset : src_wave_buffer_resource[2];
|
||||
|
||||
llvm_amdgcn_raw_buffer_load_lds(
|
||||
src_wave_buffer_resource,
|
||||
reinterpret_cast<as3_uint32_ptr>(reinterpret_cast<uintptr_t>(smem)),
|
||||
bytes,
|
||||
v_offset,
|
||||
src_wave_addr_offset,
|
||||
/*src_immediate_addr_offset*/ 0,
|
||||
static_cast<index_t>(coherence));
|
||||
#pragma clang diagnostic push
|
||||
#pragma clang diagnostic ignored "-Wold-style-cast"
|
||||
// Use C-style cast to change address space without dropping llvm noalias attribute
|
||||
llvm_amdgcn_raw_buffer_load_lds(src_wave_buffer_resource,
|
||||
(as3_uint32_ptr)(smem),
|
||||
bytes,
|
||||
v_offset,
|
||||
src_wave_addr_offset,
|
||||
/*src_immediate_addr_offset*/ 0,
|
||||
static_cast<index_t>(coherence));
|
||||
#pragma clang diagnostic pop
|
||||
}
|
||||
|
||||
template <index_t N,
|
||||
@@ -2606,23 +2609,26 @@ CK_TILE_DEVICE void amd_direct_load_global_to_lds(const T* global_base_ptr,
|
||||
template <typename T, index_t N, address_space_enum BufferAddressSpace>
|
||||
__device__ auto amd_transpose_load_to_vgpr(const T* __restrict__ in_ptr)
|
||||
{
|
||||
#define __LDS_ADDR __attribute__((address_space(3)))
|
||||
|
||||
static_assert(__has_builtin(__builtin_amdgcn_raw_buffer_load_b32),
|
||||
"We need to have the compatible compiler version to build this instruction");
|
||||
|
||||
#pragma clang diagnostic push
|
||||
#pragma clang diagnostic ignored "-Wold-style-cast"
|
||||
// Use C-style cast to change address space without dropping llvm noalias attribute
|
||||
const auto in_ptr_ = (__LDS_ADDR T*)(const_cast<T*>(in_ptr));
|
||||
#pragma clang diagnostic pop
|
||||
if constexpr(std::is_same_v<remove_cvref_t<T>, ck_tile::half_t>)
|
||||
{
|
||||
typedef __attribute__((__vector_size__(4 * sizeof(__fp16)))) __fp16 llvm_fp16x4_t;
|
||||
__attribute__((address_space(3))) llvm_fp16x4_t* lds_ptr =
|
||||
reinterpret_cast<__attribute__((address_space(3))) llvm_fp16x4_t*>(
|
||||
reinterpret_cast<uintptr_t>(in_ptr));
|
||||
auto lds_ptr = reinterpret_cast<__LDS_ADDR llvm_fp16x4_t*>(in_ptr_);
|
||||
return bit_cast<thread_buffer<T, N>>(__builtin_amdgcn_ds_read_tr16_b64_v4f16(lds_ptr));
|
||||
}
|
||||
else if constexpr(std::is_same_v<remove_cvref_t<T>, ck_tile::bf16_t>)
|
||||
{
|
||||
typedef __attribute__((__vector_size__(4 * sizeof(__bf16)))) __bf16 llvm_bf16x4_t;
|
||||
__attribute__((address_space(3))) llvm_bf16x4_t* lds_ptr =
|
||||
reinterpret_cast<__attribute__((address_space(3))) llvm_bf16x4_t*>(
|
||||
reinterpret_cast<uintptr_t>(in_ptr));
|
||||
auto lds_ptr = reinterpret_cast<__LDS_ADDR llvm_bf16x4_t*>(in_ptr_);
|
||||
return bit_cast<thread_buffer<T, N>>(__builtin_amdgcn_ds_read_tr16_b64_v4bf16(lds_ptr));
|
||||
}
|
||||
else if constexpr(std::is_same_v<remove_cvref_t<T>, ck_tile::fp8_t> ||
|
||||
@@ -2630,15 +2636,14 @@ __device__ auto amd_transpose_load_to_vgpr(const T* __restrict__ in_ptr)
|
||||
std::is_same_v<remove_cvref_t<T>, ck_tile::int8_t>)
|
||||
{
|
||||
typedef __attribute__((__vector_size__(2 * sizeof(index_t)))) index_t llvm_i32x2_t;
|
||||
__attribute__((address_space(3))) llvm_i32x2_t* lds_ptr =
|
||||
reinterpret_cast<__attribute__((address_space(3))) llvm_i32x2_t*>(
|
||||
reinterpret_cast<uintptr_t>(in_ptr));
|
||||
auto lds_ptr = reinterpret_cast<__LDS_ADDR llvm_i32x2_t*>(in_ptr_);
|
||||
return bit_cast<thread_buffer<T, N>>(__builtin_amdgcn_ds_read_tr8_b64_v2i32(lds_ptr));
|
||||
}
|
||||
else
|
||||
{
|
||||
static_assert(false, "not implemented");
|
||||
}
|
||||
#undef __LDS_ADDR
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
Reference in New Issue
Block a user