mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-16 02:54:21 +00:00
* Initial implementation:
- add new thread group transfer supporting transpose instruction
- refactor AB transfer to switch between thread and wave tiles methods
* Add some comments and remove explicit wave and lane calculations
* Remove compiler option for performance
* fp16 example: use tuned instance
* Missing cleanup
* Integrate wave transfer in existing gemm and batched gemm instances
* Add fast instances
* extend implementation for 8 bit datatypes
packed types not supported
* Address review comments
* Optimize pipeline v1 and re-introduce compiler option
* Disable wave tile approach for b scale gemm
* Fix for clang20
* Avoid code duplication of amd_global_load_transpose_to_vgpr function
[ROCm/composable_kernel commit: 440358c168]
66 lines
1.2 KiB
C++
66 lines
1.2 KiB
C++
// SPDX-License-Identifier: MIT
|
|
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
#pragma once
|
|
|
|
#include "ck/ck.hpp"
|
|
|
|
namespace ck {
|
|
|
|
#if CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM
|
|
#ifdef __gfx12__
|
|
__device__ void llvm_amdgcn_s_wait_dscnt(short cnt) __asm("llvm.amdgcn.s.wait.dscnt");
|
|
#endif
|
|
#endif
|
|
|
|
__device__ void block_sync_lds()
|
|
{
|
|
#if CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM
|
|
#ifdef __gfx12__
|
|
llvm_amdgcn_s_wait_dscnt(0);
|
|
asm volatile("s_barrier_signal -1\n\t"
|
|
"s_barrier_wait -1");
|
|
#else
|
|
// asm volatile("\
|
|
// s_waitcnt lgkmcnt(0) \n \
|
|
// s_barrier \
|
|
// " ::);
|
|
__builtin_amdgcn_s_waitcnt(0xc07f);
|
|
__builtin_amdgcn_s_barrier();
|
|
#endif
|
|
#else
|
|
__syncthreads();
|
|
#endif
|
|
}
|
|
|
|
__device__ void block_sync_lds_direct_load()
|
|
{
|
|
#ifdef __gfx12__
|
|
asm volatile("\
|
|
s_wait_loadcnt 0x0 \n \
|
|
s_wait_dscnt 0x0 \n \
|
|
s_barrier_signal -1 \n \
|
|
s_barrier_wait -1 \
|
|
" ::);
|
|
#else
|
|
asm volatile("\
|
|
s_waitcnt vmcnt(0) \n \
|
|
s_waitcnt lgkmcnt(0) \n \
|
|
s_barrier \
|
|
" ::);
|
|
#endif
|
|
}
|
|
|
|
__device__ void s_nop()
|
|
{
|
|
#if 1
|
|
asm volatile("\
|
|
s_nop 0 \n \
|
|
" ::);
|
|
#else
|
|
__builtin_amdgcn_sched_barrier(0);
|
|
#endif
|
|
}
|
|
|
|
} // namespace ck
|