mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-16 10:59:55 +00:00
* initial stream-k implementation with example
* fix unexpected change in err
* improve a little bit performance by reorganize pipeline.
* improve perf a little bit by swizzle block idx
* add profiler
* update example
* fix spelling
* shrink karg for streamk
* support dynamic buffer using memory coherence glc_slc bit from template
* control memory coherence while construct dynamic buffer
* update reduction for streamk(not ready yet)
* Add template parameter to make_dynamic_buffer to support amd_buffer coherence setting
* fix build issue
* fix several bug
* now result is correct, everything works (but has scratch)
* remove scratch by manually reset coordinate
* update device code
* fix a bug in final reduce
* fix something in example
* update async memset
* fix enum as camel case
* modify coherence enum name
* clean code and use atomic streamk by default
* remove unused var
* throw exception if have empty pointer
* fix format
* fix CI warning
* fix type in init
* modify CI error
* filter out on gfx10+
* restore changed example code
---------
Co-authored-by: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
[ROCm/composable_kernel commit: e7dca79d27]
74 lines
1.8 KiB
C++
74 lines
1.8 KiB
C++
#pragma once
|
|
#include <hip/hip_runtime.h>
|
|
#include <stdint.h>
|
|
|
|
namespace ck {
|
|
struct workgroup_barrier
|
|
{
|
|
__device__ workgroup_barrier(uint32_t* ptr) : base_ptr(ptr) {}
|
|
|
|
__device__ uint32_t ld(uint32_t offset)
|
|
{
|
|
#if 0
|
|
float d = llvm_amdgcn_raw_buffer_load_fp32(
|
|
amdgcn_make_buffer_resource(base_ptr),
|
|
0,
|
|
offset,
|
|
AMDGCN_BUFFER_GLC);
|
|
union cvt {
|
|
float f32;
|
|
uint32_t u32;
|
|
};
|
|
cvt x;
|
|
x.f32 = d;
|
|
return x.u32;
|
|
#endif
|
|
return __atomic_load_n(base_ptr + offset, __ATOMIC_RELAXED);
|
|
}
|
|
|
|
__device__ void wait_eq(uint32_t offset, uint32_t value)
|
|
{
|
|
if(threadIdx.x == 0)
|
|
{
|
|
while(ld(offset) != value) {}
|
|
}
|
|
__syncthreads();
|
|
}
|
|
|
|
__device__ void wait_lt(uint32_t offset, uint32_t value)
|
|
{
|
|
if(threadIdx.x == 0)
|
|
{
|
|
while(ld(offset) < value) {}
|
|
}
|
|
__syncthreads();
|
|
}
|
|
|
|
__device__ void wait_set(uint32_t offset, uint32_t compare, uint32_t value)
|
|
{
|
|
if(threadIdx.x == 0)
|
|
{
|
|
while(atomicCAS(base_ptr + offset, compare, value) != compare) {}
|
|
}
|
|
__syncthreads();
|
|
}
|
|
|
|
// enter critical zoon, assume buffer is zero when launch kernel
|
|
__device__ void aquire(uint32_t offset) { wait_set(offset, 0, 1); }
|
|
|
|
// exit critical zoon, assume buffer is zero when launch kernel
|
|
__device__ void release(uint32_t offset) { wait_set(offset, 1, 0); }
|
|
|
|
__device__ void inc(uint32_t offset)
|
|
{
|
|
__syncthreads();
|
|
if(threadIdx.x == 0)
|
|
{
|
|
atomicAdd(base_ptr + offset, 1);
|
|
}
|
|
}
|
|
|
|
uint32_t* base_ptr;
|
|
};
|
|
} // namespace ck
|