mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-11 08:50:17 +00:00
* Add basic support for direct loads from global to LDS * Clean the code and comments * Add support for fp16 * Add comments * Add check for thread cluster lengths * Align non-direct-load fp16 example * Small fixes * Extend IsSupported to check for supported GPU gens * Build examples only on the supported HW * Do not throw when instance not supported in 04 example * Review: Apply review suggestions * Review: small fix * Review: small fix
43 lines
673 B
C++
43 lines
673 B
C++
// SPDX-License-Identifier: MIT
|
|
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
#pragma once
|
|
|
|
#include "ck/ck.hpp"
|
|
|
|
namespace ck {
|
|
|
|
// Workgroup-level synchronization point.
//
// Which implementation is compiled is selected by
// CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM:
//   - unset or zero: plain __syncthreads().
//   - non-zero:      inline assembly that issues `s_waitcnt lgkmcnt(0)` followed
//                    by `s_barrier`. No vmcnt wait is issued on this path.
//
// NOTE(review): the asm statement declares no outputs, inputs or clobbers;
// confirm that callers do not rely on an explicit "memory" clobber here.
__device__ void block_sync_lds()
{
#if !CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM
    __syncthreads();
#else
    // Same instruction text as a single line-continued literal, spelled as
    // adjacent string literals (concatenated by the compiler).
    asm volatile("s_waitcnt lgkmcnt(0) \n "
                 "s_barrier " ::);
#endif
}
|
|
|
|
// Workgroup-level synchronization point that waits on both counters before
// the barrier.
//
// Issues, in order:
//   1. `s_waitcnt vmcnt(0)`
//   2. `s_waitcnt lgkmcnt(0)`
//   3. `s_barrier`
//
// Unlike block_sync_lds(), there is no __syncthreads() alternative; the
// inline assembly is always emitted.
//
// NOTE(review): judging by the name, this is meant to follow direct
// global-to-LDS loads, which is presumably why vmcnt is waited on as well --
// confirm against the call sites.
__device__ void block_sync_lds_direct_load()
{
    // Same instruction text as a single line-continued literal, spelled as
    // adjacent string literals (concatenated by the compiler).
    asm volatile("s_waitcnt vmcnt(0) \n "
                 "s_waitcnt lgkmcnt(0) \n "
                 "s_barrier " ::);
}
|
|
|
|
// Emits a single `s_nop 0` instruction through inline assembly.
//
// The __builtin_amdgcn_sched_barrier(0) variant is kept in the source but is
// compiled out by the constant preprocessor condition below.
__device__ void s_nop()
{
#if 0
    // Disabled alternative.
    __builtin_amdgcn_sched_barrier(0);
#else
    asm volatile("s_nop 0 \n " ::);
#endif
}
|
|
|
|
} // namespace ck
|