From 407bdf7eb0395757123aca007e4543606de7038f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Thu, 18 Dec 2025 10:16:22 +0100 Subject: [PATCH] Update AMD buffer coherency (#3403) * Update AMD buffer coherency [AICK-421] * fixes * fix * fixes * fixes * Add backward compatilibity * fix * fixes * fix * fix * fix * Update grouped_convolution_backward_weight_kernel.hpp [ROCm/composable_kernel commit: 700b2ec9c02da8d367ebe8a223a6dbf16622db09] --- .../20_grouped_convolution/CMakeLists.txt | 2 +- ...tion_backward_weight_two_stage_invoker.hpp | 5 +- include/ck/utility/amd_buffer_addressing.hpp | 25 +--- .../amd_buffer_addressing_builtins.hpp | 25 +--- include/ck/utility/amd_buffer_coherence.hpp | 119 +++++++++++++++++ include/ck_tile/core.hpp | 1 + .../core/arch/amd_buffer_addressing.hpp | 25 +--- .../arch/amd_buffer_addressing_builtins.hpp | 25 +--- .../core/arch/amd_buffer_coherence.hpp | 124 ++++++++++++++++++ .../flatmm_pipeline_agmem_bgmem_creg_v1.hpp | 2 + ...ped_convolution_backward_weight_kernel.hpp | 13 ++ 11 files changed, 268 insertions(+), 98 deletions(-) create mode 100644 include/ck/utility/amd_buffer_coherence.hpp create mode 100644 include/ck_tile/core/arch/amd_buffer_coherence.hpp diff --git a/example/ck_tile/20_grouped_convolution/CMakeLists.txt b/example/ck_tile/20_grouped_convolution/CMakeLists.txt index 7fcca37bd9..090aae482b 100644 --- a/example/ck_tile/20_grouped_convolution/CMakeLists.txt +++ b/example/ck_tile/20_grouped_convolution/CMakeLists.txt @@ -1,7 +1,7 @@ # Copyright (c) Advanced Micro Devices, Inc., or its affiliates. # SPDX-License-Identifier: MIT -if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx90a") +if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx90a|gfx11|gfx12") set(EXAMPLE_CONV_COMPILE_OPTIONS) list(APPEND EXAMPLE_CONV_COMPILE_OPTIONS -mllvm -enable-noalias-to-md-conversion=0) diff --git a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_two_stage_invoker.hpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_two_stage_invoker.hpp index 9221746560..ad5e8ae70f 100644 --- a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_two_stage_invoker.hpp +++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_two_stage_invoker.hpp @@ -21,6 +21,9 @@ struct GroupedConvolutionBackwardWeightTwoStageInvoker const ck_tile::stream_config& s) { using WorkspaceDataType = float; + // Force Vector Size C to 1 for two stage to check main + // two stage use case + constexpr ck_tile::index_t VectorSizeC = 1; // Implicit GEMM Traits using GemmShape = ck_tile::TileGemmShape< @@ -39,7 +42,7 @@ struct GroupedConvolutionBackwardWeightTwoStageInvoker OutLayout, ConvConfig::VectorSizeA, ConvConfig::VectorSizeB, - ConvConfig::VectorSizeC, + VectorSizeC, ConvConfig::NumGroupsToMerge>; using TilePartitioner = ck_tile::GemmSpatiallyLocalTilePartitioner< diff --git a/include/ck/utility/amd_buffer_addressing.hpp b/include/ck/utility/amd_buffer_addressing.hpp index f9404e00b7..3843c0c301 100644 --- a/include/ck/utility/amd_buffer_addressing.hpp +++ b/include/ck/utility/amd_buffer_addressing.hpp @@ -3,6 +3,7 @@ #pragma once #include "data_type.hpp" +#include "ck/utility/amd_buffer_coherence.hpp" namespace ck { @@ -286,30 +287,6 @@ llvm_amdgcn_raw_buffer_atomic_max_fp64(double vdata, int soffset, // dst_wave_addr_offset int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64"); -// memory coherency bit for buffer store/load instruction -// check ISA manual for each GFX target -// e.g. for -// https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf, -// page 67~68 -enum struct AmdBufferCoherenceEnum -{ - DefaultCoherence = 0, // default value - GLC = 1, - SLC = 2, - GLC_SLC = 3, - // gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 - // SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system - // NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse - WAVE_NT0 = 0, - WAVE_NT1 = 2, - GROUP_NT0 = 1, - GROUP_NT1 = 3, - DEVICE_NT0 = 8, - DEVICE_NT1 = 10, - SYSTEM_NT0 = 9, - SYSTEM_NT1 = 11, -}; - template __device__ typename vector_type::type amd_buffer_load_impl_raw(int32x4_t src_wave_buffer_resource, diff --git a/include/ck/utility/amd_buffer_addressing_builtins.hpp b/include/ck/utility/amd_buffer_addressing_builtins.hpp index cddb8b7e5c..f4ea9c80f1 100644 --- a/include/ck/utility/amd_buffer_addressing_builtins.hpp +++ b/include/ck/utility/amd_buffer_addressing_builtins.hpp @@ -3,6 +3,7 @@ #pragma once #include "data_type.hpp" +#include "ck/utility/amd_buffer_coherence.hpp" namespace ck { @@ -106,30 +107,6 @@ llvm_amdgcn_raw_buffer_atomic_max_fp64(double vdata, int soffset, // dst_wave_addr_offset int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64"); -// memory coherency bit for buffer store/load instruction -// check ISA manual for each GFX target -// e.g. for -// https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf, -// page 67~68 -enum struct AmdBufferCoherenceEnum -{ - DefaultCoherence = 0, // default value - GLC = 1, - SLC = 2, - GLC_SLC = 3, - // gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 - // SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system - // NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse - WAVE_NT0 = 0, - WAVE_NT1 = 2, - GROUP_NT0 = 1, - GROUP_NT1 = 3, - DEVICE_NT0 = 8, - DEVICE_NT1 = 10, - SYSTEM_NT0 = 9, - SYSTEM_NT1 = 11, -}; - template __device__ typename vector_type::type amd_buffer_load_impl_raw(__amdgpu_buffer_rsrc_t src_wave_buffer_resource, diff --git a/include/ck/utility/amd_buffer_coherence.hpp b/include/ck/utility/amd_buffer_coherence.hpp new file mode 100644 index 0000000000..361f6d7b41 --- /dev/null +++ b/include/ck/utility/amd_buffer_coherence.hpp @@ -0,0 +1,119 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +namespace ck { + +enum struct AmdBufferCoherenceEnum +{ + DefaultCoherence = 0, // default value +#if defined(__gfx12__) + // Temporal hint + RT = 0, // regular temporal + NT = 1, // non temporal + HT = 2, // high priority temporal + LU = 3, // last use (load op) + WB = 3, // same as HT, overrides WR in far cache (store op) + NT_RT = 4, // non temporal for near cache, regular for far cache + RT_NT = 5, // regular for near cache, non-temporal for far cache + NT_HT = 6, // non temporal for near cache, high priority for far cache + NT_WB = 7, // non temporal for near cache, WB for far cache + // (store op, reserved for load op) + // Scope + CU = 0, + SE = 8, + DEVICE = 16, + SYSTEM = 24, + // Temporal Hint for CU + CU_RT = RT | CU, + CU_NT = NT | CU, + CU_HT = HT | CU, + CU_LU = LU | CU, + CU_WB = WB | CU, + CU_NT_RT = NT_RT | CU, + CU_RT_NT = RT_NT | CU, + CU_NT_HT = NT_HT | CU, + CU_NT_WB = NT_WB | CU, + // Temporal Hint for SE + SE_RT = RT | SE, + SE_NT = NT | SE, + SE_HT = HT | SE, + SE_LU = LU | SE, + SE_WB = WB | SE, + SE_NT_RT = NT_RT | SE, + SE_RT_NT = RT_NT | SE, + SE_NT_HT = NT_HT | SE, + SE_NT_WB = NT_WB | SE, + // Temporal Hint for DEVICE + DEVICE_RT = RT | DEVICE, + DEVICE_NT = NT | DEVICE, + DEVICE_HT = HT | DEVICE, + DEVICE_LU = LU | DEVICE, + DEVICE_WB = WB | DEVICE, + DEVICE_NT_RT = NT_RT | DEVICE, + DEVICE_RT_NT = RT_NT | DEVICE, + DEVICE_NT_HT = NT_HT | DEVICE, + DEVICE_NT_WB = NT_WB | DEVICE, + // Temporal Hint for SYSTEM + SYSTEM_RT = RT | SYSTEM, + SYSTEM_NT = NT | SYSTEM, + SYSTEM_HT = HT | SYSTEM, + SYSTEM_LU = LU | SYSTEM, + SYSTEM_WB = WB | SYSTEM, + SYSTEM_NT_RT = NT_RT | SYSTEM, + SYSTEM_RT_NT = RT_NT | SYSTEM, + SYSTEM_NT_HT = NT_HT | SYSTEM, + SYSTEM_NT_WB = NT_WB | SYSTEM, + + // GFX942 and GFX950 compatiblity + GROUP_NT0 = CU_RT, + GROUP_NT1 = CU_NT, + DEVICE_NT0 = DEVICE_RT, + DEVICE_NT1 = DEVICE_NT, + SYSTEM_NT0 = SYSTEM_RT, + SYSTEM_NT1 = SYSTEM_NT, + // Other archs compatiblity + GLC = DEVICE_NT, + SLC = SYSTEM_NT, + GLC_SLC = DEVICE_NT | SYSTEM_NT, + +// gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 +// SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system +// NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse +#elif defined(__gfx942__) || defined(__gfx950__) + + WAVE = 0, + GROUP = 1, + DEVICE = 16, + SYSTEM = 17, + NT0 = 0, + NT1 = 2, + + WAVE_NT0 = NT0 | WAVE, + WAVE_NT1 = NT1 | WAVE, + GROUP_NT0 = NT0 | GROUP, + GROUP_NT1 = NT1 | GROUP, + DEVICE_NT0 = NT0 | DEVICE, + DEVICE_NT1 = NT1 | DEVICE, + SYSTEM_NT0 = NT0 | SYSTEM, + SYSTEM_NT1 = NT1 | SYSTEM, + + // Other archs compatiblity + GLC = DEVICE_NT1, + SLC = SYSTEM_NT1, + GLC_SLC = DEVICE_NT1 | SYSTEM_NT1, +#else + GLC = 1, + SLC = 2, + GLC_SLC = 3, + + // Other archs compatiblity + DEVICE_NT0 = 0, + SYSTEM_NT0 = 0, + DEVICE_NT1 = GLC, + SYSTEM_NT1 = SLC, +#endif +}; + +} // namespace ck diff --git a/include/ck_tile/core.hpp b/include/ck_tile/core.hpp index d28d29a0ef..01e1d00b59 100644 --- a/include/ck_tile/core.hpp +++ b/include/ck_tile/core.hpp @@ -9,6 +9,7 @@ #include "ck_tile/core/algorithm/static_encoding_pattern.hpp" #include "ck_tile/core/arch/amd_buffer_addressing.hpp" #include "ck_tile/core/arch/amd_buffer_addressing_builtins.hpp" +#include "ck_tile/core/arch/amd_buffer_coherence.hpp" #include "ck_tile/core/arch/amd_transpose_load_encoding.hpp" #include "ck_tile/core/arch/arch.hpp" #include "ck_tile/core/arch/generic_memory_space_atomic.hpp" diff --git a/include/ck_tile/core/arch/amd_buffer_addressing.hpp b/include/ck_tile/core/arch/amd_buffer_addressing.hpp index 9f79bdbee6..7af2f558ad 100644 --- a/include/ck_tile/core/arch/amd_buffer_addressing.hpp +++ b/include/ck_tile/core/arch/amd_buffer_addressing.hpp @@ -16,6 +16,7 @@ #include "ck_tile/core/utility/bit_cast.hpp" #include "ck_tile/core/utility/functional.hpp" #include "ck_tile/core/utility/ignore.hpp" +#include "ck_tile/core/arch/amd_buffer_coherence.hpp" // This attribute gives a hint to the compiler that a branch is likely to be taken. // Then, the compiler should remove if possible the associated s_cbranch_execz branch that would @@ -1409,30 +1410,6 @@ CK_TILE_DEVICE void async_buffer_load_fence(index_t cnt = 0) asm volatile("s_waitcnt vmcnt(%0)" : : "n"(cnt) : "memory"); } -// memory coherency bit for buffer store/load instruction -// check ISA manual for each GFX target -// e.g. for -// https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf, -// page 67~68 -enum struct amd_buffer_coherence_enum -{ - coherence_default = 0, // default value - glc = 1, - slc = 2, - glc_slc = 3, - // gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 - // SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system - // NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse - WAVE_NT0 = 0, - WAVE_NT1 = 2, - GROUP_NT0 = 1, - GROUP_NT1 = 3, - DEVICE_NT0 = 16, - DEVICE_NT1 = 18, - SYSTEM_NT0 = 17, - SYSTEM_NT1 = 19, -}; - template CK_TILE_DEVICE thread_buffer diff --git a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp index 4627b249d6..562b246ac3 100644 --- a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp +++ b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp @@ -16,6 +16,7 @@ #include "ck_tile/core/utility/bit_cast.hpp" #include "ck_tile/core/utility/functional.hpp" #include "ck_tile/core/utility/ignore.hpp" +#include "ck_tile/core/arch/amd_buffer_coherence.hpp" using as3_uint32_ptr = uint32_t __attribute__((address_space(3)))*; @@ -1277,30 +1278,6 @@ CK_TILE_DEVICE void async_buffer_load_fence(index_t cnt = 0) asm volatile("s_waitcnt vmcnt(%0)" : : "n"(cnt) : "memory"); } -// memory coherency bit for buffer store/load instruction -// check ISA manual for each GFX target -// e.g. for -// https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf, -// page 67~68 -enum struct amd_buffer_coherence_enum -{ - coherence_default = 0, // default value - glc = 1, - slc = 2, - glc_slc = 3, - // gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 - // SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system - // NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse - WAVE_NT0 = 0, - WAVE_NT1 = 2, - GROUP_NT0 = 1, - GROUP_NT1 = 3, - DEVICE_NT0 = 16, - DEVICE_NT1 = 18, - SYSTEM_NT0 = 17, - SYSTEM_NT1 = 19, -}; - template CK_TILE_DEVICE thread_buffer diff --git a/include/ck_tile/core/arch/amd_buffer_coherence.hpp b/include/ck_tile/core/arch/amd_buffer_coherence.hpp new file mode 100644 index 0000000000..2e38c77291 --- /dev/null +++ b/include/ck_tile/core/arch/amd_buffer_coherence.hpp @@ -0,0 +1,124 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +namespace ck_tile { + +// memory coherency bit for buffer store/load instruction +// check ISA manual for each GFX target +// e.g. for +// https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf, +// page 67~68 +enum struct amd_buffer_coherence_enum +{ + coherence_default = 0, // default value +#if defined(__gfx12__) + // Temporal hint + RT = 0, // regular temporal + NT = 1, // non temporal + HT = 2, // high priority temporal + LU = 3, // last use (load op) + WB = 3, // same as HT, overrides WR in far cache (store op) + NT_RT = 4, // non temporal for near cache, regular for far cache + RT_NT = 5, // regular for near cache, non-temporal for far cache + NT_HT = 6, // non temporal for near cache, high priority for far cache + NT_WB = 7, // non temporal for near cache, WB for far cache + // (store op, reserved for load op) + // Scope + CU = 0, + SE = 8, + DEVICE = 16, + SYSTEM = 24, + // Temporal Hint for CU + CU_RT = RT | CU, + CU_NT = NT | CU, + CU_HT = HT | CU, + CU_LU = LU | CU, + CU_WB = WB | CU, + CU_NT_RT = NT_RT | CU, + CU_RT_NT = RT_NT | CU, + CU_NT_HT = NT_HT | CU, + CU_NT_WB = NT_WB | CU, + // Temporal Hint for SE + SE_RT = RT | SE, + SE_NT = NT | SE, + SE_HT = HT | SE, + SE_LU = LU | SE, + SE_WB = WB | SE, + SE_NT_RT = NT_RT | SE, + SE_RT_NT = RT_NT | SE, + SE_NT_HT = NT_HT | SE, + SE_NT_WB = NT_WB | SE, + // Temporal Hint for DEVICE + DEVICE_RT = RT | DEVICE, + DEVICE_NT = NT | DEVICE, + DEVICE_HT = HT | DEVICE, + DEVICE_LU = LU | DEVICE, + DEVICE_WB = WB | DEVICE, + DEVICE_NT_RT = NT_RT | DEVICE, + DEVICE_RT_NT = RT_NT | DEVICE, + DEVICE_NT_HT = NT_HT | DEVICE, + DEVICE_NT_WB = NT_WB | DEVICE, + // Temporal Hint for SYSTEM + SYSTEM_RT = RT | SYSTEM, + SYSTEM_NT = NT | SYSTEM, + SYSTEM_HT = HT | SYSTEM, + SYSTEM_LU = LU | SYSTEM, + SYSTEM_WB = WB | SYSTEM, + SYSTEM_NT_RT = NT_RT | SYSTEM, + SYSTEM_RT_NT = RT_NT | SYSTEM, + SYSTEM_NT_HT = NT_HT | SYSTEM, + SYSTEM_NT_WB = NT_WB | SYSTEM, + + // GFX942 and GFX950 compatiblity + GROUP_NT0 = CU_RT, + GROUP_NT1 = CU_NT, + DEVICE_NT0 = DEVICE_RT, + DEVICE_NT1 = DEVICE_NT, + SYSTEM_NT0 = SYSTEM_RT, + SYSTEM_NT1 = SYSTEM_NT, + // Other archs compatiblity + glc = DEVICE_NT, + slc = SYSTEM_NT, + glc_slc = DEVICE_NT | SYSTEM_NT, + +// gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 +// SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system +// NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse +#elif defined(__gfx942__) || defined(__gfx950__) + + WAVE = 0, + GROUP = 1, + DEVICE = 16, + SYSTEM = 17, + NT0 = 0, + NT1 = 2, + + WAVE_NT0 = NT0 | WAVE, + WAVE_NT1 = NT1 | WAVE, + GROUP_NT0 = NT0 | GROUP, + GROUP_NT1 = NT1 | GROUP, + DEVICE_NT0 = NT0 | DEVICE, + DEVICE_NT1 = NT1 | DEVICE, + SYSTEM_NT0 = NT0 | SYSTEM, + SYSTEM_NT1 = NT1 | SYSTEM, + + // Other archs compatiblity + glc = DEVICE_NT1, + slc = SYSTEM_NT1, + glc_slc = DEVICE_NT1 | SYSTEM_NT1, +#else + glc = 1, + slc = 2, + glc_slc = 3, + + // Other archs compatiblity + DEVICE_NT0 = 0, + SYSTEM_NT0 = 0, + DEVICE_NT1 = glc, + SYSTEM_NT1 = slc, +#endif +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp index e4f186dead..8ad99e9399 100644 --- a/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp +++ b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp @@ -31,7 +31,9 @@ struct BaseFlatmmPipelineAGmemBGmemCRegV1 ck_tile::ignore = K; if(M <= 416) { +#if defined(__gfx942__) || defined(__gfx950__) return ck_tile::amd_buffer_coherence_enum::WAVE_NT1; +#endif } return ck_tile::amd_buffer_coherence_enum::coherence_default; } diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp index 2e80ff64c1..1004ed81b1 100644 --- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp +++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp @@ -545,6 +545,13 @@ struct GroupedConvolutionBackwardWeightKernel return false; } +#if defined(__gfx11__) + if constexpr(EpiloguePipeline::MemoryOperation != ck_tile::memory_operation_enum::set) + { + return false; + } +#endif + if constexpr(EpiloguePipeline_::MemoryOperation == memory_operation_enum::atomic_add) { if(kargs.k_batch == 1) @@ -971,6 +978,12 @@ struct GroupedConvolutionBackwardWeightKernel CK_TILE_DEVICE void operator()(GroupedConvBwdWeightKernelArgsSpecialized& kargs) const { +#if defined(__gfx11__) + if constexpr(EpiloguePipeline::MemoryOperation != ck_tile::memory_operation_enum::set) + { + return; + } +#endif if constexpr(GroupedConvTraitsType_::ExplicitGemm) { CallExplicitGemm(kargs);