mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-15 18:42:06 +00:00
* Add basic support for direct loads from global to LDS
* Clean the code and comments
* Add support for fp16
* Add comments
* Add check for thread cluster lengths
* Align non-direct-load fp16 example
* Small fixes
* Extend IsSupported to check for supported GPU gens
* Build examples only on the supported HW
* Do not throw when instance not supported in 04 example
* Review: Apply review suggestions
* Review: small fix
* Review: small fix
[ROCm/composable_kernel commit: 627054b941]
43 lines
673 B
C++
43 lines
673 B
C++
// SPDX-License-Identifier: MIT
|
|
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
#pragma once
|
|
|
|
#include "ck/ck.hpp"
|
|
|
|
namespace ck {
|
|
|
|
// Block-level barrier for code that communicates through LDS.
//
// When CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM is enabled, the barrier
// is emitted by hand: the wave waits only for the lgkmcnt counter to reach
// zero and then executes s_barrier. As the macro name states, vmcnt is
// deliberately NOT waited on in this path. Otherwise the function simply
// forwards to __syncthreads().
__device__ void block_sync_lds()
{
#if CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM
    // The "memory" clobber makes this statement a compiler-level memory
    // barrier as well: `volatile` alone only keeps the asm from being removed
    // and does not, by contract, stop the compiler from moving ordinary
    // loads/stores across it.
    asm volatile("s_waitcnt lgkmcnt(0)\n"
                 "s_barrier"
                 :
                 :
                 : "memory");
#else
    __syncthreads();
#endif
}
|
|
|
|
// Block-level barrier variant for the direct-load path.
//
// Waits for both the vmcnt and the lgkmcnt counters to reach zero and then
// executes s_barrier.
// NOTE(review): presumably vmcnt must be drained here because direct
// global-to-LDS loads are tracked by the vector-memory counter rather than by
// lgkmcnt -- confirm against the ISA guide for the targeted GPUs.
__device__ void block_sync_lds_direct_load()
{
    // The "memory" clobber makes this statement a compiler-level memory
    // barrier as well: `volatile` alone only keeps the asm from being removed
    // and does not, by contract, stop the compiler from moving ordinary
    // loads/stores across it.
    asm volatile("s_waitcnt vmcnt(0)\n"
                 "s_waitcnt lgkmcnt(0)\n"
                 "s_barrier"
                 :
                 :
                 : "memory");
}
|
|
|
|
// Emits a single `s_nop 0` instruction through inline assembly.
//
// The preprocessor switch is hard-wired to the inline-asm variant; the
// disabled branch keeps the __builtin_amdgcn_sched_barrier(0) alternative
// around for reference.
__device__ void s_nop()
{
#if 1
    // `volatile` keeps the compiler from discarding the otherwise
    // operand-less statement.
    asm volatile("s_nop 0 \n " ::);
#else
    __builtin_amdgcn_sched_barrier(0);
#endif
}
|
|
|
|
} // namespace ck
|