mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-04 05:31:24 +00:00
Update AMD buffer coherency (#3403)
* Update AMD buffer coherency [AICK-421] * fixes * fix * fixes * fixes * Add backward compatilibity * fix * fixes * fix * fix * fix * Update grouped_convolution_backward_weight_kernel.hpp
This commit is contained in:
@@ -16,6 +16,7 @@
|
||||
#include "ck_tile/core/utility/bit_cast.hpp"
|
||||
#include "ck_tile/core/utility/functional.hpp"
|
||||
#include "ck_tile/core/utility/ignore.hpp"
|
||||
#include "ck_tile/core/arch/amd_buffer_coherence.hpp"
|
||||
|
||||
// This attribute gives a hint to the compiler that a branch is likely to be taken.
|
||||
// Then, the compiler should remove if possible the associated s_cbranch_execz branch that would
|
||||
@@ -1409,30 +1410,6 @@ CK_TILE_DEVICE void async_buffer_load_fence(index_t cnt = 0)
|
||||
asm volatile("s_waitcnt vmcnt(%0)" : : "n"(cnt) : "memory");
|
||||
}
|
||||
|
||||
// memory coherency bit for buffer store/load instruction
|
||||
// check ISA manual for each GFX target
|
||||
// e.g. for
|
||||
// https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf,
|
||||
// page 67~68
|
||||
enum struct amd_buffer_coherence_enum
|
||||
{
|
||||
coherence_default = 0, // default value
|
||||
glc = 1,
|
||||
slc = 2,
|
||||
glc_slc = 3,
|
||||
// gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
|
||||
// SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system
|
||||
// NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse
|
||||
WAVE_NT0 = 0,
|
||||
WAVE_NT1 = 2,
|
||||
GROUP_NT0 = 1,
|
||||
GROUP_NT1 = 3,
|
||||
DEVICE_NT0 = 16,
|
||||
DEVICE_NT1 = 18,
|
||||
SYSTEM_NT0 = 17,
|
||||
SYSTEM_NT1 = 19,
|
||||
};
|
||||
|
||||
template <index_t N,
|
||||
amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
|
||||
CK_TILE_DEVICE thread_buffer<int8_t, N>
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
#include "ck_tile/core/utility/bit_cast.hpp"
|
||||
#include "ck_tile/core/utility/functional.hpp"
|
||||
#include "ck_tile/core/utility/ignore.hpp"
|
||||
#include "ck_tile/core/arch/amd_buffer_coherence.hpp"
|
||||
|
||||
using as3_uint32_ptr = uint32_t __attribute__((address_space(3)))*;
|
||||
|
||||
@@ -1277,30 +1278,6 @@ CK_TILE_DEVICE void async_buffer_load_fence(index_t cnt = 0)
|
||||
asm volatile("s_waitcnt vmcnt(%0)" : : "n"(cnt) : "memory");
|
||||
}
|
||||
|
||||
// memory coherency bit for buffer store/load instruction
|
||||
// check ISA manual for each GFX target
|
||||
// e.g. for
|
||||
// https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf,
|
||||
// page 67~68
|
||||
enum struct amd_buffer_coherence_enum
|
||||
{
|
||||
coherence_default = 0, // default value
|
||||
glc = 1,
|
||||
slc = 2,
|
||||
glc_slc = 3,
|
||||
// gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
|
||||
// SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system
|
||||
// NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse
|
||||
WAVE_NT0 = 0,
|
||||
WAVE_NT1 = 2,
|
||||
GROUP_NT0 = 1,
|
||||
GROUP_NT1 = 3,
|
||||
DEVICE_NT0 = 16,
|
||||
DEVICE_NT1 = 18,
|
||||
SYSTEM_NT0 = 17,
|
||||
SYSTEM_NT1 = 19,
|
||||
};
|
||||
|
||||
template <index_t N,
|
||||
amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
|
||||
CK_TILE_DEVICE thread_buffer<int8_t, N>
|
||||
|
||||
124
include/ck_tile/core/arch/amd_buffer_coherence.hpp
Normal file
124
include/ck_tile/core/arch/amd_buffer_coherence.hpp
Normal file
@@ -0,0 +1,124 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
// memory coherency bit for buffer store/load instruction
|
||||
// check ISA manual for each GFX target
|
||||
// e.g. for
|
||||
// https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf,
|
||||
// page 67~68
|
||||
enum struct amd_buffer_coherence_enum
|
||||
{
|
||||
coherence_default = 0, // default value
|
||||
#if defined(__gfx12__)
|
||||
// Temporal hint
|
||||
RT = 0, // regular temporal
|
||||
NT = 1, // non temporal
|
||||
HT = 2, // high priority temporal
|
||||
LU = 3, // last use (load op)
|
||||
WB = 3, // same as HT, overrides WR in far cache (store op)
|
||||
NT_RT = 4, // non temporal for near cache, regular for far cache
|
||||
RT_NT = 5, // regular for near cache, non-temporal for far cache
|
||||
NT_HT = 6, // non temporal for near cache, high priority for far cache
|
||||
NT_WB = 7, // non temporal for near cache, WB for far cache
|
||||
// (store op, reserved for load op)
|
||||
// Scope
|
||||
CU = 0,
|
||||
SE = 8,
|
||||
DEVICE = 16,
|
||||
SYSTEM = 24,
|
||||
// Temporal Hint for CU
|
||||
CU_RT = RT | CU,
|
||||
CU_NT = NT | CU,
|
||||
CU_HT = HT | CU,
|
||||
CU_LU = LU | CU,
|
||||
CU_WB = WB | CU,
|
||||
CU_NT_RT = NT_RT | CU,
|
||||
CU_RT_NT = RT_NT | CU,
|
||||
CU_NT_HT = NT_HT | CU,
|
||||
CU_NT_WB = NT_WB | CU,
|
||||
// Temporal Hint for SE
|
||||
SE_RT = RT | SE,
|
||||
SE_NT = NT | SE,
|
||||
SE_HT = HT | SE,
|
||||
SE_LU = LU | SE,
|
||||
SE_WB = WB | SE,
|
||||
SE_NT_RT = NT_RT | SE,
|
||||
SE_RT_NT = RT_NT | SE,
|
||||
SE_NT_HT = NT_HT | SE,
|
||||
SE_NT_WB = NT_WB | SE,
|
||||
// Temporal Hint for DEVICE
|
||||
DEVICE_RT = RT | DEVICE,
|
||||
DEVICE_NT = NT | DEVICE,
|
||||
DEVICE_HT = HT | DEVICE,
|
||||
DEVICE_LU = LU | DEVICE,
|
||||
DEVICE_WB = WB | DEVICE,
|
||||
DEVICE_NT_RT = NT_RT | DEVICE,
|
||||
DEVICE_RT_NT = RT_NT | DEVICE,
|
||||
DEVICE_NT_HT = NT_HT | DEVICE,
|
||||
DEVICE_NT_WB = NT_WB | DEVICE,
|
||||
// Temporal Hint for SYSTEM
|
||||
SYSTEM_RT = RT | SYSTEM,
|
||||
SYSTEM_NT = NT | SYSTEM,
|
||||
SYSTEM_HT = HT | SYSTEM,
|
||||
SYSTEM_LU = LU | SYSTEM,
|
||||
SYSTEM_WB = WB | SYSTEM,
|
||||
SYSTEM_NT_RT = NT_RT | SYSTEM,
|
||||
SYSTEM_RT_NT = RT_NT | SYSTEM,
|
||||
SYSTEM_NT_HT = NT_HT | SYSTEM,
|
||||
SYSTEM_NT_WB = NT_WB | SYSTEM,
|
||||
|
||||
// GFX942 and GFX950 compatiblity
|
||||
GROUP_NT0 = CU_RT,
|
||||
GROUP_NT1 = CU_NT,
|
||||
DEVICE_NT0 = DEVICE_RT,
|
||||
DEVICE_NT1 = DEVICE_NT,
|
||||
SYSTEM_NT0 = SYSTEM_RT,
|
||||
SYSTEM_NT1 = SYSTEM_NT,
|
||||
// Other archs compatiblity
|
||||
glc = DEVICE_NT,
|
||||
slc = SYSTEM_NT,
|
||||
glc_slc = DEVICE_NT | SYSTEM_NT,
|
||||
|
||||
// gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
|
||||
// SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system
|
||||
// NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse
|
||||
#elif defined(__gfx942__) || defined(__gfx950__)
|
||||
|
||||
WAVE = 0,
|
||||
GROUP = 1,
|
||||
DEVICE = 16,
|
||||
SYSTEM = 17,
|
||||
NT0 = 0,
|
||||
NT1 = 2,
|
||||
|
||||
WAVE_NT0 = NT0 | WAVE,
|
||||
WAVE_NT1 = NT1 | WAVE,
|
||||
GROUP_NT0 = NT0 | GROUP,
|
||||
GROUP_NT1 = NT1 | GROUP,
|
||||
DEVICE_NT0 = NT0 | DEVICE,
|
||||
DEVICE_NT1 = NT1 | DEVICE,
|
||||
SYSTEM_NT0 = NT0 | SYSTEM,
|
||||
SYSTEM_NT1 = NT1 | SYSTEM,
|
||||
|
||||
// Other archs compatiblity
|
||||
glc = DEVICE_NT1,
|
||||
slc = SYSTEM_NT1,
|
||||
glc_slc = DEVICE_NT1 | SYSTEM_NT1,
|
||||
#else
|
||||
glc = 1,
|
||||
slc = 2,
|
||||
glc_slc = 3,
|
||||
|
||||
// Other archs compatiblity
|
||||
DEVICE_NT0 = 0,
|
||||
SYSTEM_NT0 = 0,
|
||||
DEVICE_NT1 = glc,
|
||||
SYSTEM_NT1 = slc,
|
||||
#endif
|
||||
};
|
||||
|
||||
} // namespace ck_tile
|
||||
Reference in New Issue
Block a user