mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-06-11 00:39:02 +00:00
[CK] add composable kernel support on gfx1250 (#6978) ## Motivation Add composable kernel support on gfx1250. ## Technical Details <!-- Explain the changes along with any relevant GitHub links. --> ## Test Plan <!-- Explain any relevant testing done to verify this PR. --> ## Test Result <!-- Briefly summarize test outcomes. --> ## Submission Checklist - [ ] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests. --------- Co-authored-by: Qun Lin <qlin@amd.com> Co-authored-by: jialuo12_amdeng <jia.luo@amd.com> Co-authored-by: Andriy Roshchenko <andriy.roshchenko@amd.com> Co-authored-by: hsivasun_amdeng <haresh.sivasuntharampillai@amd.com>
88 lines
1.7 KiB
C++
88 lines
1.7 KiB
C++
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
|
// SPDX-License-Identifier: MIT
|
|
|
|
#pragma once
|
|
|
|
#include "ck/ck.hpp"
|
|
|
|
namespace ck {
|
|
|
|
#if CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM
#if defined(__gfx12__)
// Forward declaration whose assembler-level name is set, via the __asm label,
// to "llvm.amdgcn.s.wait.dscnt". It is declared only for gfx12 targets on the
// experimental sync path, where block_sync_lds() calls it with cnt == 0.
// NOTE(review): this relies on the compiler treating the "llvm."-prefixed
// symbol as the corresponding intrinsic rather than as an external function;
// confirm against the toolchain in use.
__device__ void llvm_amdgcn_s_wait_dscnt(short cnt) __asm("llvm.amdgcn.s.wait.dscnt");
#endif
#endif
|
|
|
|
// Workgroup-level synchronization point used around LDS accesses.
//
// When CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM is enabled, an explicit
// counter wait followed by a barrier is emitted, chosen per target:
//   gfx12  : llvm_amdgcn_s_wait_dscnt(0), then barrier signal/wait with id -1
//   gfx11  : s_waitcnt with immediate 0xfc07, then s_barrier
//   others : s_waitcnt with immediate 0xc07f, then s_barrier
// When the macro is disabled, the function is simply __syncthreads().
//
// Takes no arguments and returns nothing. The order "wait, then barrier" is
// kept in every branch; do not reorder the statements.
__device__ void block_sync_lds()
{
#if CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM
#if defined(__gfx12__)
    llvm_amdgcn_s_wait_dscnt(0);
    __builtin_amdgcn_s_barrier_signal(-1);
    __builtin_amdgcn_s_barrier_wait(-1);
#elif defined(__gfx11__)
    // Builtin form of the inline assembly that was used here before:
    //     s_waitcnt lgkmcnt(0)
    //     s_barrier
    // 0xfc07 is the gfx11 immediate standing in for that s_waitcnt
    // (NOTE(review): bit layout taken on trust from the replaced asm; verify
    // against the ISA encoding if this value is ever changed).
    __builtin_amdgcn_s_waitcnt(0xfc07);
    __builtin_amdgcn_s_barrier();
#else
    // Builtin form of the inline assembly that was used here before:
    //     s_waitcnt lgkmcnt(0)
    //     s_barrier
    // 0xc07f is the immediate used for the same wait on the remaining targets;
    // it differs from the gfx11 value above.
    __builtin_amdgcn_s_waitcnt(0xc07f);
    __builtin_amdgcn_s_barrier();
#endif
#else
    __syncthreads();
#endif
}
|
|
|
|
// Synchronization point for kernels that use direct loads into LDS
// (per the function name; the loads themselves are issued elsewhere).
//
// The instruction sequence is selected at compile time:
//   gfx125 : __builtin_amdgcn_s_wait_asynccnt(0), then barrier signal/wait (-1)
//   gfx12  : s_wait_loadcnt 0, s_wait_dscnt 0, s_barrier_signal -1, s_barrier_wait -1
//   others : s_waitcnt vmcnt(0), s_waitcnt lgkmcnt(0), s_barrier
//
// The gfx125 test must come before the gfx12 test so that it takes precedence
// when both macros are defined. Takes no arguments and returns nothing.
__device__ void block_sync_lds_direct_load()
{
#if defined(__gfx125__)
    __builtin_amdgcn_s_wait_asynccnt(0);
    __builtin_amdgcn_s_barrier_signal(-1);
    __builtin_amdgcn_s_barrier_wait(-1);
#elif defined(__gfx12__)
    // Wait on both the load counter and the DS counter, then run the
    // signal/wait barrier pair.
    asm volatile("\
    s_wait_loadcnt 0x0 \n \
    s_wait_dscnt 0x0 \n \
    s_barrier_signal -1 \n \
    s_barrier_wait -1 \
    " ::);
#else
    // Wait on both vmcnt and lgkmcnt before the barrier.
    asm volatile("\
    s_waitcnt vmcnt(0) \n \
    s_waitcnt lgkmcnt(0) \n \
    s_barrier \
    " ::);
#endif
}
|
|
|
|
// Synchronization point for kernels that use asynchronous loads
// (per the function name; the loads themselves are issued elsewhere).
//
//   gfx125 : __builtin_amdgcn_s_wait_asynccnt(0), then __syncthreads()
//   others : delegates to block_sync_lds()
//
// Takes no arguments and returns nothing.
__device__ void block_sync_lds_async_load()
{
#if defined(__gfx125__)
    __builtin_amdgcn_s_wait_asynccnt(0);
    __syncthreads();
#else
    // No dedicated sequence for this target: reuse the regular LDS sync.
    block_sync_lds();
#endif
}
|
|
|
|
// Emits a single "s_nop 0" instruction through volatile inline assembly.
//
// The "#if 1" is a compile-time toggle: the disabled branch would emit
// __builtin_amdgcn_sched_barrier(0) instead. Only the inline-asm branch is
// compiled as the file stands. Takes no arguments and returns nothing.
__device__ void s_nop()
{
#if 1
    asm volatile("\
    s_nop 0 \n \
    " ::);
#else
    __builtin_amdgcn_sched_barrier(0);
#endif
}
|
|
|
|
} // namespace ck
|