Update AMD buffer coherency (#3403)

* Update AMD buffer coherency [AICK-421]

* fixes

* fix

* fixes

* fixes

* Add backward compatilibity

* fix

* fixes

* fix

* fix

* fix

* Update grouped_convolution_backward_weight_kernel.hpp
This commit is contained in:
Bartłomiej Kocot
2025-12-18 10:16:22 +01:00
committed by GitHub
parent 15e81397a4
commit 700b2ec9c0
11 changed files with 268 additions and 98 deletions

View File

@@ -16,6 +16,7 @@
#include "ck_tile/core/utility/bit_cast.hpp"
#include "ck_tile/core/utility/functional.hpp"
#include "ck_tile/core/utility/ignore.hpp"
#include "ck_tile/core/arch/amd_buffer_coherence.hpp"
// This attribute gives a hint to the compiler that a branch is likely to be taken.
// Then, the compiler should remove if possible the associated s_cbranch_execz branch that would
@@ -1409,30 +1410,6 @@ CK_TILE_DEVICE void async_buffer_load_fence(index_t cnt = 0)
asm volatile("s_waitcnt vmcnt(%0)" : : "n"(cnt) : "memory");
}
// memory coherency bit for buffer store/load instruction
// check ISA manual for each GFX target
// e.g. for
// https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf,
// page 67~68
enum struct amd_buffer_coherence_enum
{
coherence_default = 0, // default value
glc = 1,
slc = 2,
glc_slc = 3,
// gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system
// NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse
WAVE_NT0 = 0,
WAVE_NT1 = 2,
GROUP_NT0 = 1,
GROUP_NT1 = 3,
DEVICE_NT0 = 16,
DEVICE_NT1 = 18,
SYSTEM_NT0 = 17,
SYSTEM_NT1 = 19,
};
template <index_t N,
amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
CK_TILE_DEVICE thread_buffer<int8_t, N>

View File

@@ -16,6 +16,7 @@
#include "ck_tile/core/utility/bit_cast.hpp"
#include "ck_tile/core/utility/functional.hpp"
#include "ck_tile/core/utility/ignore.hpp"
#include "ck_tile/core/arch/amd_buffer_coherence.hpp"
using as3_uint32_ptr = uint32_t __attribute__((address_space(3)))*;
@@ -1277,30 +1278,6 @@ CK_TILE_DEVICE void async_buffer_load_fence(index_t cnt = 0)
asm volatile("s_waitcnt vmcnt(%0)" : : "n"(cnt) : "memory");
}
// memory coherency bit for buffer store/load instruction
// check ISA manual for each GFX target
// e.g. for
// https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf,
// page 67~68
enum struct amd_buffer_coherence_enum
{
coherence_default = 0, // default value
glc = 1,
slc = 2,
glc_slc = 3,
// gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system
// NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse
WAVE_NT0 = 0,
WAVE_NT1 = 2,
GROUP_NT0 = 1,
GROUP_NT1 = 3,
DEVICE_NT0 = 16,
DEVICE_NT1 = 18,
SYSTEM_NT0 = 17,
SYSTEM_NT1 = 19,
};
template <index_t N,
amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
CK_TILE_DEVICE thread_buffer<int8_t, N>

View File

@@ -0,0 +1,124 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
namespace ck_tile {
// memory coherency bit for buffer store/load instruction
// check ISA manual for each GFX target
// e.g. for
// https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf,
// page 67~68
enum struct amd_buffer_coherence_enum
{
coherence_default = 0, // default value
#if defined(__gfx12__)
// Temporal hint
RT = 0, // regular temporal
NT = 1, // non temporal
HT = 2, // high priority temporal
LU = 3, // last use (load op)
WB = 3, // same as HT, overrides WR in far cache (store op)
NT_RT = 4, // non temporal for near cache, regular for far cache
RT_NT = 5, // regular for near cache, non-temporal for far cache
NT_HT = 6, // non temporal for near cache, high priority for far cache
NT_WB = 7, // non temporal for near cache, WB for far cache
// (store op, reserved for load op)
// Scope
CU = 0,
SE = 8,
DEVICE = 16,
SYSTEM = 24,
// Temporal Hint for CU
CU_RT = RT | CU,
CU_NT = NT | CU,
CU_HT = HT | CU,
CU_LU = LU | CU,
CU_WB = WB | CU,
CU_NT_RT = NT_RT | CU,
CU_RT_NT = RT_NT | CU,
CU_NT_HT = NT_HT | CU,
CU_NT_WB = NT_WB | CU,
// Temporal Hint for SE
SE_RT = RT | SE,
SE_NT = NT | SE,
SE_HT = HT | SE,
SE_LU = LU | SE,
SE_WB = WB | SE,
SE_NT_RT = NT_RT | SE,
SE_RT_NT = RT_NT | SE,
SE_NT_HT = NT_HT | SE,
SE_NT_WB = NT_WB | SE,
// Temporal Hint for DEVICE
DEVICE_RT = RT | DEVICE,
DEVICE_NT = NT | DEVICE,
DEVICE_HT = HT | DEVICE,
DEVICE_LU = LU | DEVICE,
DEVICE_WB = WB | DEVICE,
DEVICE_NT_RT = NT_RT | DEVICE,
DEVICE_RT_NT = RT_NT | DEVICE,
DEVICE_NT_HT = NT_HT | DEVICE,
DEVICE_NT_WB = NT_WB | DEVICE,
// Temporal Hint for SYSTEM
SYSTEM_RT = RT | SYSTEM,
SYSTEM_NT = NT | SYSTEM,
SYSTEM_HT = HT | SYSTEM,
SYSTEM_LU = LU | SYSTEM,
SYSTEM_WB = WB | SYSTEM,
SYSTEM_NT_RT = NT_RT | SYSTEM,
SYSTEM_RT_NT = RT_NT | SYSTEM,
SYSTEM_NT_HT = NT_HT | SYSTEM,
SYSTEM_NT_WB = NT_WB | SYSTEM,
// GFX942 and GFX950 compatiblity
GROUP_NT0 = CU_RT,
GROUP_NT1 = CU_NT,
DEVICE_NT0 = DEVICE_RT,
DEVICE_NT1 = DEVICE_NT,
SYSTEM_NT0 = SYSTEM_RT,
SYSTEM_NT1 = SYSTEM_NT,
// Other archs compatiblity
glc = DEVICE_NT,
slc = SYSTEM_NT,
glc_slc = DEVICE_NT | SYSTEM_NT,
// gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system
// NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse
#elif defined(__gfx942__) || defined(__gfx950__)
WAVE = 0,
GROUP = 1,
DEVICE = 16,
SYSTEM = 17,
NT0 = 0,
NT1 = 2,
WAVE_NT0 = NT0 | WAVE,
WAVE_NT1 = NT1 | WAVE,
GROUP_NT0 = NT0 | GROUP,
GROUP_NT1 = NT1 | GROUP,
DEVICE_NT0 = NT0 | DEVICE,
DEVICE_NT1 = NT1 | DEVICE,
SYSTEM_NT0 = NT0 | SYSTEM,
SYSTEM_NT1 = NT1 | SYSTEM,
// Other archs compatiblity
glc = DEVICE_NT1,
slc = SYSTEM_NT1,
glc_slc = DEVICE_NT1 | SYSTEM_NT1,
#else
glc = 1,
slc = 2,
glc_slc = 3,
// Other archs compatiblity
DEVICE_NT0 = 0,
SYSTEM_NT0 = 0,
DEVICE_NT1 = glc,
SYSTEM_NT1 = slc,
#endif
};
} // namespace ck_tile