Update AMD buffer coherency (#3403)

* Update AMD buffer coherency [AICK-421]

* fixes

* fix

* fixes

* fixes

* Add backward compatilibity

* fix

* fixes

* fix

* fix

* fix

* Update grouped_convolution_backward_weight_kernel.hpp

[ROCm/composable_kernel commit: 700b2ec9c0]
This commit is contained in:
Bartłomiej Kocot
2025-12-18 10:16:22 +01:00
committed by GitHub
parent 1cc7d01ea8
commit 407bdf7eb0
11 changed files with 268 additions and 98 deletions

View File

@@ -1,7 +1,7 @@
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# SPDX-License-Identifier: MIT
if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx90a")
if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx90a|gfx11|gfx12")
set(EXAMPLE_CONV_COMPILE_OPTIONS)
list(APPEND EXAMPLE_CONV_COMPILE_OPTIONS -mllvm -enable-noalias-to-md-conversion=0)

View File

@@ -21,6 +21,9 @@ struct GroupedConvolutionBackwardWeightTwoStageInvoker
const ck_tile::stream_config& s)
{
using WorkspaceDataType = float;
// Force Vector Size C to 1 for two stage to check main
// two stage use case
constexpr ck_tile::index_t VectorSizeC = 1;
// Implicit GEMM Traits
using GemmShape = ck_tile::TileGemmShape<
@@ -39,7 +42,7 @@ struct GroupedConvolutionBackwardWeightTwoStageInvoker
OutLayout,
ConvConfig::VectorSizeA,
ConvConfig::VectorSizeB,
ConvConfig::VectorSizeC,
VectorSizeC,
ConvConfig::NumGroupsToMerge>;
using TilePartitioner = ck_tile::GemmSpatiallyLocalTilePartitioner<

View File

@@ -3,6 +3,7 @@
#pragma once
#include "data_type.hpp"
#include "ck/utility/amd_buffer_coherence.hpp"
namespace ck {
@@ -286,30 +287,6 @@ llvm_amdgcn_raw_buffer_atomic_max_fp64(double vdata,
int soffset, // dst_wave_addr_offset
int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64");
// memory coherency bit for buffer store/load instruction
// check ISA manual for each GFX target
// e.g. for
// https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf,
// page 67~68
enum struct AmdBufferCoherenceEnum
{
DefaultCoherence = 0, // default value
GLC = 1,
SLC = 2,
GLC_SLC = 3,
// gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system
// NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse
WAVE_NT0 = 0,
WAVE_NT1 = 2,
GROUP_NT0 = 1,
GROUP_NT1 = 3,
DEVICE_NT0 = 8,
DEVICE_NT1 = 10,
SYSTEM_NT0 = 9,
SYSTEM_NT1 = 11,
};
template <index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ typename vector_type<int8_t, N>::type
amd_buffer_load_impl_raw(int32x4_t src_wave_buffer_resource,

View File

@@ -3,6 +3,7 @@
#pragma once
#include "data_type.hpp"
#include "ck/utility/amd_buffer_coherence.hpp"
namespace ck {
@@ -106,30 +107,6 @@ llvm_amdgcn_raw_buffer_atomic_max_fp64(double vdata,
int soffset, // dst_wave_addr_offset
int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64");
// memory coherency bit for buffer store/load instruction
// check ISA manual for each GFX target
// e.g. for
// https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf,
// page 67~68
enum struct AmdBufferCoherenceEnum
{
DefaultCoherence = 0, // default value
GLC = 1,
SLC = 2,
GLC_SLC = 3,
// gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system
// NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse
WAVE_NT0 = 0,
WAVE_NT1 = 2,
GROUP_NT0 = 1,
GROUP_NT1 = 3,
DEVICE_NT0 = 8,
DEVICE_NT1 = 10,
SYSTEM_NT0 = 9,
SYSTEM_NT1 = 11,
};
template <index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ typename vector_type<int8_t, N>::type
amd_buffer_load_impl_raw(__amdgpu_buffer_rsrc_t src_wave_buffer_resource,

View File

@@ -0,0 +1,119 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
namespace ck {
enum struct AmdBufferCoherenceEnum
{
DefaultCoherence = 0, // default value
#if defined(__gfx12__)
// Temporal hint
RT = 0, // regular temporal
NT = 1, // non temporal
HT = 2, // high priority temporal
LU = 3, // last use (load op)
WB = 3, // same as HT, overrides WR in far cache (store op)
NT_RT = 4, // non temporal for near cache, regular for far cache
RT_NT = 5, // regular for near cache, non-temporal for far cache
NT_HT = 6, // non temporal for near cache, high priority for far cache
NT_WB = 7, // non temporal for near cache, WB for far cache
// (store op, reserved for load op)
// Scope
CU = 0,
SE = 8,
DEVICE = 16,
SYSTEM = 24,
// Temporal Hint for CU
CU_RT = RT | CU,
CU_NT = NT | CU,
CU_HT = HT | CU,
CU_LU = LU | CU,
CU_WB = WB | CU,
CU_NT_RT = NT_RT | CU,
CU_RT_NT = RT_NT | CU,
CU_NT_HT = NT_HT | CU,
CU_NT_WB = NT_WB | CU,
// Temporal Hint for SE
SE_RT = RT | SE,
SE_NT = NT | SE,
SE_HT = HT | SE,
SE_LU = LU | SE,
SE_WB = WB | SE,
SE_NT_RT = NT_RT | SE,
SE_RT_NT = RT_NT | SE,
SE_NT_HT = NT_HT | SE,
SE_NT_WB = NT_WB | SE,
// Temporal Hint for DEVICE
DEVICE_RT = RT | DEVICE,
DEVICE_NT = NT | DEVICE,
DEVICE_HT = HT | DEVICE,
DEVICE_LU = LU | DEVICE,
DEVICE_WB = WB | DEVICE,
DEVICE_NT_RT = NT_RT | DEVICE,
DEVICE_RT_NT = RT_NT | DEVICE,
DEVICE_NT_HT = NT_HT | DEVICE,
DEVICE_NT_WB = NT_WB | DEVICE,
// Temporal Hint for SYSTEM
SYSTEM_RT = RT | SYSTEM,
SYSTEM_NT = NT | SYSTEM,
SYSTEM_HT = HT | SYSTEM,
SYSTEM_LU = LU | SYSTEM,
SYSTEM_WB = WB | SYSTEM,
SYSTEM_NT_RT = NT_RT | SYSTEM,
SYSTEM_RT_NT = RT_NT | SYSTEM,
SYSTEM_NT_HT = NT_HT | SYSTEM,
SYSTEM_NT_WB = NT_WB | SYSTEM,
// GFX942 and GFX950 compatiblity
GROUP_NT0 = CU_RT,
GROUP_NT1 = CU_NT,
DEVICE_NT0 = DEVICE_RT,
DEVICE_NT1 = DEVICE_NT,
SYSTEM_NT0 = SYSTEM_RT,
SYSTEM_NT1 = SYSTEM_NT,
// Other archs compatiblity
GLC = DEVICE_NT,
SLC = SYSTEM_NT,
GLC_SLC = DEVICE_NT | SYSTEM_NT,
// gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system
// NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse
#elif defined(__gfx942__) || defined(__gfx950__)
WAVE = 0,
GROUP = 1,
DEVICE = 16,
SYSTEM = 17,
NT0 = 0,
NT1 = 2,
WAVE_NT0 = NT0 | WAVE,
WAVE_NT1 = NT1 | WAVE,
GROUP_NT0 = NT0 | GROUP,
GROUP_NT1 = NT1 | GROUP,
DEVICE_NT0 = NT0 | DEVICE,
DEVICE_NT1 = NT1 | DEVICE,
SYSTEM_NT0 = NT0 | SYSTEM,
SYSTEM_NT1 = NT1 | SYSTEM,
// Other archs compatiblity
GLC = DEVICE_NT1,
SLC = SYSTEM_NT1,
GLC_SLC = DEVICE_NT1 | SYSTEM_NT1,
#else
GLC = 1,
SLC = 2,
GLC_SLC = 3,
// Other archs compatiblity
DEVICE_NT0 = 0,
SYSTEM_NT0 = 0,
DEVICE_NT1 = GLC,
SYSTEM_NT1 = SLC,
#endif
};
} // namespace ck

View File

@@ -9,6 +9,7 @@
#include "ck_tile/core/algorithm/static_encoding_pattern.hpp"
#include "ck_tile/core/arch/amd_buffer_addressing.hpp"
#include "ck_tile/core/arch/amd_buffer_addressing_builtins.hpp"
#include "ck_tile/core/arch/amd_buffer_coherence.hpp"
#include "ck_tile/core/arch/amd_transpose_load_encoding.hpp"
#include "ck_tile/core/arch/arch.hpp"
#include "ck_tile/core/arch/generic_memory_space_atomic.hpp"

View File

@@ -16,6 +16,7 @@
#include "ck_tile/core/utility/bit_cast.hpp"
#include "ck_tile/core/utility/functional.hpp"
#include "ck_tile/core/utility/ignore.hpp"
#include "ck_tile/core/arch/amd_buffer_coherence.hpp"
// This attribute gives a hint to the compiler that a branch is likely to be taken.
// Then, the compiler should remove if possible the associated s_cbranch_execz branch that would
@@ -1409,30 +1410,6 @@ CK_TILE_DEVICE void async_buffer_load_fence(index_t cnt = 0)
asm volatile("s_waitcnt vmcnt(%0)" : : "n"(cnt) : "memory");
}
// memory coherency bit for buffer store/load instruction
// check ISA manual for each GFX target
// e.g. for
// https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf,
// page 67~68
enum struct amd_buffer_coherence_enum
{
coherence_default = 0, // default value
glc = 1,
slc = 2,
glc_slc = 3,
// gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system
// NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse
WAVE_NT0 = 0,
WAVE_NT1 = 2,
GROUP_NT0 = 1,
GROUP_NT1 = 3,
DEVICE_NT0 = 16,
DEVICE_NT1 = 18,
SYSTEM_NT0 = 17,
SYSTEM_NT1 = 19,
};
template <index_t N,
amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
CK_TILE_DEVICE thread_buffer<int8_t, N>

View File

@@ -16,6 +16,7 @@
#include "ck_tile/core/utility/bit_cast.hpp"
#include "ck_tile/core/utility/functional.hpp"
#include "ck_tile/core/utility/ignore.hpp"
#include "ck_tile/core/arch/amd_buffer_coherence.hpp"
using as3_uint32_ptr = uint32_t __attribute__((address_space(3)))*;
@@ -1277,30 +1278,6 @@ CK_TILE_DEVICE void async_buffer_load_fence(index_t cnt = 0)
asm volatile("s_waitcnt vmcnt(%0)" : : "n"(cnt) : "memory");
}
// memory coherency bit for buffer store/load instruction
// check ISA manual for each GFX target
// e.g. for
// https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf,
// page 67~68
enum struct amd_buffer_coherence_enum
{
coherence_default = 0, // default value
glc = 1,
slc = 2,
glc_slc = 3,
// gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system
// NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse
WAVE_NT0 = 0,
WAVE_NT1 = 2,
GROUP_NT0 = 1,
GROUP_NT1 = 3,
DEVICE_NT0 = 16,
DEVICE_NT1 = 18,
SYSTEM_NT0 = 17,
SYSTEM_NT1 = 19,
};
template <index_t N,
amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
CK_TILE_DEVICE thread_buffer<int8_t, N>

View File

@@ -0,0 +1,124 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
namespace ck_tile {
// memory coherency bit for buffer store/load instruction
// check ISA manual for each GFX target
// e.g. for
// https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf,
// page 67~68
enum struct amd_buffer_coherence_enum
{
coherence_default = 0, // default value
#if defined(__gfx12__)
// Temporal hint
RT = 0, // regular temporal
NT = 1, // non temporal
HT = 2, // high priority temporal
LU = 3, // last use (load op)
WB = 3, // same as HT, overrides WR in far cache (store op)
NT_RT = 4, // non temporal for near cache, regular for far cache
RT_NT = 5, // regular for near cache, non-temporal for far cache
NT_HT = 6, // non temporal for near cache, high priority for far cache
NT_WB = 7, // non temporal for near cache, WB for far cache
// (store op, reserved for load op)
// Scope
CU = 0,
SE = 8,
DEVICE = 16,
SYSTEM = 24,
// Temporal Hint for CU
CU_RT = RT | CU,
CU_NT = NT | CU,
CU_HT = HT | CU,
CU_LU = LU | CU,
CU_WB = WB | CU,
CU_NT_RT = NT_RT | CU,
CU_RT_NT = RT_NT | CU,
CU_NT_HT = NT_HT | CU,
CU_NT_WB = NT_WB | CU,
// Temporal Hint for SE
SE_RT = RT | SE,
SE_NT = NT | SE,
SE_HT = HT | SE,
SE_LU = LU | SE,
SE_WB = WB | SE,
SE_NT_RT = NT_RT | SE,
SE_RT_NT = RT_NT | SE,
SE_NT_HT = NT_HT | SE,
SE_NT_WB = NT_WB | SE,
// Temporal Hint for DEVICE
DEVICE_RT = RT | DEVICE,
DEVICE_NT = NT | DEVICE,
DEVICE_HT = HT | DEVICE,
DEVICE_LU = LU | DEVICE,
DEVICE_WB = WB | DEVICE,
DEVICE_NT_RT = NT_RT | DEVICE,
DEVICE_RT_NT = RT_NT | DEVICE,
DEVICE_NT_HT = NT_HT | DEVICE,
DEVICE_NT_WB = NT_WB | DEVICE,
// Temporal Hint for SYSTEM
SYSTEM_RT = RT | SYSTEM,
SYSTEM_NT = NT | SYSTEM,
SYSTEM_HT = HT | SYSTEM,
SYSTEM_LU = LU | SYSTEM,
SYSTEM_WB = WB | SYSTEM,
SYSTEM_NT_RT = NT_RT | SYSTEM,
SYSTEM_RT_NT = RT_NT | SYSTEM,
SYSTEM_NT_HT = NT_HT | SYSTEM,
SYSTEM_NT_WB = NT_WB | SYSTEM,
// GFX942 and GFX950 compatiblity
GROUP_NT0 = CU_RT,
GROUP_NT1 = CU_NT,
DEVICE_NT0 = DEVICE_RT,
DEVICE_NT1 = DEVICE_NT,
SYSTEM_NT0 = SYSTEM_RT,
SYSTEM_NT1 = SYSTEM_NT,
// Other archs compatiblity
glc = DEVICE_NT,
slc = SYSTEM_NT,
glc_slc = DEVICE_NT | SYSTEM_NT,
// gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system
// NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse
#elif defined(__gfx942__) || defined(__gfx950__)
WAVE = 0,
GROUP = 1,
DEVICE = 16,
SYSTEM = 17,
NT0 = 0,
NT1 = 2,
WAVE_NT0 = NT0 | WAVE,
WAVE_NT1 = NT1 | WAVE,
GROUP_NT0 = NT0 | GROUP,
GROUP_NT1 = NT1 | GROUP,
DEVICE_NT0 = NT0 | DEVICE,
DEVICE_NT1 = NT1 | DEVICE,
SYSTEM_NT0 = NT0 | SYSTEM,
SYSTEM_NT1 = NT1 | SYSTEM,
// Other archs compatiblity
glc = DEVICE_NT1,
slc = SYSTEM_NT1,
glc_slc = DEVICE_NT1 | SYSTEM_NT1,
#else
glc = 1,
slc = 2,
glc_slc = 3,
// Other archs compatiblity
DEVICE_NT0 = 0,
SYSTEM_NT0 = 0,
DEVICE_NT1 = glc,
SYSTEM_NT1 = slc,
#endif
};
} // namespace ck_tile

View File

@@ -31,7 +31,9 @@ struct BaseFlatmmPipelineAGmemBGmemCRegV1
ck_tile::ignore = K;
if(M <= 416)
{
#if defined(__gfx942__) || defined(__gfx950__)
return ck_tile::amd_buffer_coherence_enum::WAVE_NT1;
#endif
}
return ck_tile::amd_buffer_coherence_enum::coherence_default;
}

View File

@@ -545,6 +545,13 @@ struct GroupedConvolutionBackwardWeightKernel
return false;
}
#if defined(__gfx11__)
if constexpr(EpiloguePipeline::MemoryOperation != ck_tile::memory_operation_enum::set)
{
return false;
}
#endif
if constexpr(EpiloguePipeline_::MemoryOperation == memory_operation_enum::atomic_add)
{
if(kargs.k_batch == 1)
@@ -971,6 +978,12 @@ struct GroupedConvolutionBackwardWeightKernel
CK_TILE_DEVICE void operator()(GroupedConvBwdWeightKernelArgsSpecialized& kargs) const
{
#if defined(__gfx11__)
if constexpr(EpiloguePipeline::MemoryOperation != ck_tile::memory_operation_enum::set)
{
return;
}
#endif
if constexpr(GroupedConvTraitsType_::ExplicitGemm)
{
CallExplicitGemm(kargs);