Update AMD buffer coherency (#3403)

* Update AMD buffer coherency [AICK-421] * fixes * fix * fixes * fixes * Add backward compatilibity * fix * fixes * fix * fix * fix * Update grouped_convolution_backward_weight_kernel.hpp [ROCm/composable_kernel commit: 700b2ec9c0]
2026-07-19 02:01:01 +00:00 · 2025-12-18 10:16:22 +01:00
parent 1cc7d01ea8
commit 407bdf7eb0
11 changed files with 268 additions and 98 deletions
--- a/example/ck_tile/20_grouped_convolution/CMakeLists.txt
+++ b/example/ck_tile/20_grouped_convolution/CMakeLists.txt
@@ -1,7 +1,7 @@
 # Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 # SPDX-License-Identifier: MIT

-if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx90a")
+if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx90a|gfx11|gfx12")
  set(EXAMPLE_CONV_COMPILE_OPTIONS)
  list(APPEND EXAMPLE_CONV_COMPILE_OPTIONS -mllvm -enable-noalias-to-md-conversion=0)

--- a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_two_stage_invoker.hpp
+++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_two_stage_invoker.hpp
@@ -21,6 +21,9 @@ struct GroupedConvolutionBackwardWeightTwoStageInvoker
                                                 const ck_tile::stream_config& s)
    {
        using WorkspaceDataType = float;
+        // Force Vector Size C to 1 for two stage to check main
+        // two stage use case
+        constexpr ck_tile::index_t VectorSizeC = 1;

        // Implicit GEMM Traits
        using GemmShape = ck_tile::TileGemmShape<
@@ -39,7 +42,7 @@ struct GroupedConvolutionBackwardWeightTwoStageInvoker
                                                                 OutLayout,
                                                                 ConvConfig::VectorSizeA,
                                                                 ConvConfig::VectorSizeB,
-                                                                 ConvConfig::VectorSizeC,
+                                                                 VectorSizeC,
                                                                 ConvConfig::NumGroupsToMerge>;

        using TilePartitioner = ck_tile::GemmSpatiallyLocalTilePartitioner<
--- a/include/ck/utility/amd_buffer_addressing.hpp
+++ b/include/ck/utility/amd_buffer_addressing.hpp
@@ -3,6 +3,7 @@

 #pragma once
 #include "data_type.hpp"
+#include "ck/utility/amd_buffer_coherence.hpp"

 namespace ck {

@@ -286,30 +287,6 @@ llvm_amdgcn_raw_buffer_atomic_max_fp64(double vdata,
                                       int soffset,    // dst_wave_addr_offset
                                       int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64");

-// memory coherency bit for buffer store/load instruction
-// check ISA manual for each GFX target
-// e.g. for
-// https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf,
-// page 67~68
-enum struct AmdBufferCoherenceEnum
-{
-    DefaultCoherence = 0, // default value
-    GLC              = 1,
-    SLC              = 2,
-    GLC_SLC          = 3,
-    // gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
-    // SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system
-    // NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse
-    WAVE_NT0   = 0,
-    WAVE_NT1   = 2,
-    GROUP_NT0  = 1,
-    GROUP_NT1  = 3,
-    DEVICE_NT0 = 8,
-    DEVICE_NT1 = 10,
-    SYSTEM_NT0 = 9,
-    SYSTEM_NT1 = 11,
-};
-
 template <index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
 __device__ typename vector_type<int8_t, N>::type
 amd_buffer_load_impl_raw(int32x4_t src_wave_buffer_resource,
--- a/include/ck/utility/amd_buffer_addressing_builtins.hpp
+++ b/include/ck/utility/amd_buffer_addressing_builtins.hpp
@@ -3,6 +3,7 @@

 #pragma once
 #include "data_type.hpp"
+#include "ck/utility/amd_buffer_coherence.hpp"

 namespace ck {

@@ -106,30 +107,6 @@ llvm_amdgcn_raw_buffer_atomic_max_fp64(double vdata,
                                       int soffset,    // dst_wave_addr_offset
                                       int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64");

-// memory coherency bit for buffer store/load instruction
-// check ISA manual for each GFX target
-// e.g. for
-// https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf,
-// page 67~68
-enum struct AmdBufferCoherenceEnum
-{
-    DefaultCoherence = 0, // default value
-    GLC              = 1,
-    SLC              = 2,
-    GLC_SLC          = 3,
-    // gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
-    // SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system
-    // NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse
-    WAVE_NT0   = 0,
-    WAVE_NT1   = 2,
-    GROUP_NT0  = 1,
-    GROUP_NT1  = 3,
-    DEVICE_NT0 = 8,
-    DEVICE_NT1 = 10,
-    SYSTEM_NT0 = 9,
-    SYSTEM_NT1 = 11,
-};
-
 template <index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
 __device__ typename vector_type<int8_t, N>::type
 amd_buffer_load_impl_raw(__amdgpu_buffer_rsrc_t src_wave_buffer_resource,
--- a/include/ck/utility/amd_buffer_coherence.hpp
+++ b/include/ck/utility/amd_buffer_coherence.hpp
@@ -0,0 +1,119 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+namespace ck {
+
+enum struct AmdBufferCoherenceEnum
+{
+    DefaultCoherence = 0, // default value
+#if defined(__gfx12__)
+    // Temporal hint
+    RT    = 0, // regular temporal
+    NT    = 1, // non temporal
+    HT    = 2, // high priority temporal
+    LU    = 3, // last use (load op)
+    WB    = 3, // same as HT, overrides WR in far cache (store op)
+    NT_RT = 4, // non temporal for near cache, regular for far cache
+    RT_NT = 5, // regular for near cache, non-temporal for far cache
+    NT_HT = 6, // non temporal for near cache, high priority for far cache
+    NT_WB = 7, // non temporal for near cache, WB for far cache
+               // (store op, reserved for load op)
+    // Scope
+    CU     = 0,
+    SE     = 8,
+    DEVICE = 16,
+    SYSTEM = 24,
+    // Temporal Hint for CU
+    CU_RT    = RT | CU,
+    CU_NT    = NT | CU,
+    CU_HT    = HT | CU,
+    CU_LU    = LU | CU,
+    CU_WB    = WB | CU,
+    CU_NT_RT = NT_RT | CU,
+    CU_RT_NT = RT_NT | CU,
+    CU_NT_HT = NT_HT | CU,
+    CU_NT_WB = NT_WB | CU,
+    // Temporal Hint for SE
+    SE_RT    = RT | SE,
+    SE_NT    = NT | SE,
+    SE_HT    = HT | SE,
+    SE_LU    = LU | SE,
+    SE_WB    = WB | SE,
+    SE_NT_RT = NT_RT | SE,
+    SE_RT_NT = RT_NT | SE,
+    SE_NT_HT = NT_HT | SE,
+    SE_NT_WB = NT_WB | SE,
+    // Temporal Hint for DEVICE
+    DEVICE_RT    = RT | DEVICE,
+    DEVICE_NT    = NT | DEVICE,
+    DEVICE_HT    = HT | DEVICE,
+    DEVICE_LU    = LU | DEVICE,
+    DEVICE_WB    = WB | DEVICE,
+    DEVICE_NT_RT = NT_RT | DEVICE,
+    DEVICE_RT_NT = RT_NT | DEVICE,
+    DEVICE_NT_HT = NT_HT | DEVICE,
+    DEVICE_NT_WB = NT_WB | DEVICE,
+    // Temporal Hint for SYSTEM
+    SYSTEM_RT    = RT | SYSTEM,
+    SYSTEM_NT    = NT | SYSTEM,
+    SYSTEM_HT    = HT | SYSTEM,
+    SYSTEM_LU    = LU | SYSTEM,
+    SYSTEM_WB    = WB | SYSTEM,
+    SYSTEM_NT_RT = NT_RT | SYSTEM,
+    SYSTEM_RT_NT = RT_NT | SYSTEM,
+    SYSTEM_NT_HT = NT_HT | SYSTEM,
+    SYSTEM_NT_WB = NT_WB | SYSTEM,
+
+    // GFX942 and GFX950 compatiblity
+    GROUP_NT0  = CU_RT,
+    GROUP_NT1  = CU_NT,
+    DEVICE_NT0 = DEVICE_RT,
+    DEVICE_NT1 = DEVICE_NT,
+    SYSTEM_NT0 = SYSTEM_RT,
+    SYSTEM_NT1 = SYSTEM_NT,
+    // Other archs compatiblity
+    GLC     = DEVICE_NT,
+    SLC     = SYSTEM_NT,
+    GLC_SLC = DEVICE_NT | SYSTEM_NT,
+
+// gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
+// SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system
+// NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse
+#elif defined(__gfx942__) || defined(__gfx950__)
+
+    WAVE   = 0,
+    GROUP  = 1,
+    DEVICE = 16,
+    SYSTEM = 17,
+    NT0    = 0,
+    NT1    = 2,
+
+    WAVE_NT0   = NT0 | WAVE,
+    WAVE_NT1   = NT1 | WAVE,
+    GROUP_NT0  = NT0 | GROUP,
+    GROUP_NT1  = NT1 | GROUP,
+    DEVICE_NT0 = NT0 | DEVICE,
+    DEVICE_NT1 = NT1 | DEVICE,
+    SYSTEM_NT0 = NT0 | SYSTEM,
+    SYSTEM_NT1 = NT1 | SYSTEM,
+
+    // Other archs compatiblity
+    GLC     = DEVICE_NT1,
+    SLC     = SYSTEM_NT1,
+    GLC_SLC = DEVICE_NT1 | SYSTEM_NT1,
+#else
+    GLC     = 1,
+    SLC     = 2,
+    GLC_SLC = 3,
+
+    // Other archs compatiblity
+    DEVICE_NT0 = 0,
+    SYSTEM_NT0 = 0,
+    DEVICE_NT1 = GLC,
+    SYSTEM_NT1 = SLC,
+#endif
+};
+
+} // namespace ck
--- a/include/ck_tile/core.hpp
+++ b/include/ck_tile/core.hpp
@@ -9,6 +9,7 @@
 #include "ck_tile/core/algorithm/static_encoding_pattern.hpp"
 #include "ck_tile/core/arch/amd_buffer_addressing.hpp"
 #include "ck_tile/core/arch/amd_buffer_addressing_builtins.hpp"
+#include "ck_tile/core/arch/amd_buffer_coherence.hpp"
 #include "ck_tile/core/arch/amd_transpose_load_encoding.hpp"
 #include "ck_tile/core/arch/arch.hpp"
 #include "ck_tile/core/arch/generic_memory_space_atomic.hpp"
--- a/include/ck_tile/core/arch/amd_buffer_addressing.hpp
+++ b/include/ck_tile/core/arch/amd_buffer_addressing.hpp
@@ -16,6 +16,7 @@
 #include "ck_tile/core/utility/bit_cast.hpp"
 #include "ck_tile/core/utility/functional.hpp"
 #include "ck_tile/core/utility/ignore.hpp"
+#include "ck_tile/core/arch/amd_buffer_coherence.hpp"

 // This attribute gives a hint to the compiler that a branch is likely to be taken.
 // Then, the compiler should remove if possible the associated s_cbranch_execz branch that would
@@ -1409,30 +1410,6 @@ CK_TILE_DEVICE void async_buffer_load_fence(index_t cnt = 0)
    asm volatile("s_waitcnt vmcnt(%0)" : : "n"(cnt) : "memory");
 }

-// memory coherency bit for buffer store/load instruction
-// check ISA manual for each GFX target
-// e.g. for
-// https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf,
-// page 67~68
-enum struct amd_buffer_coherence_enum
-{
-    coherence_default = 0, // default value
-    glc               = 1,
-    slc               = 2,
-    glc_slc           = 3,
-    // gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
-    // SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system
-    // NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse
-    WAVE_NT0   = 0,
-    WAVE_NT1   = 2,
-    GROUP_NT0  = 1,
-    GROUP_NT1  = 3,
-    DEVICE_NT0 = 16,
-    DEVICE_NT1 = 18,
-    SYSTEM_NT0 = 17,
-    SYSTEM_NT1 = 19,
-};
-
 template <index_t N,
          amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
 CK_TILE_DEVICE thread_buffer<int8_t, N>
--- a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp
+++ b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp
@@ -16,6 +16,7 @@
 #include "ck_tile/core/utility/bit_cast.hpp"
 #include "ck_tile/core/utility/functional.hpp"
 #include "ck_tile/core/utility/ignore.hpp"
+#include "ck_tile/core/arch/amd_buffer_coherence.hpp"

 using as3_uint32_ptr = uint32_t __attribute__((address_space(3)))*;

@@ -1277,30 +1278,6 @@ CK_TILE_DEVICE void async_buffer_load_fence(index_t cnt = 0)
    asm volatile("s_waitcnt vmcnt(%0)" : : "n"(cnt) : "memory");
 }

-// memory coherency bit for buffer store/load instruction
-// check ISA manual for each GFX target
-// e.g. for
-// https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf,
-// page 67~68
-enum struct amd_buffer_coherence_enum
-{
-    coherence_default = 0, // default value
-    glc               = 1,
-    slc               = 2,
-    glc_slc           = 3,
-    // gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
-    // SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system
-    // NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse
-    WAVE_NT0   = 0,
-    WAVE_NT1   = 2,
-    GROUP_NT0  = 1,
-    GROUP_NT1  = 3,
-    DEVICE_NT0 = 16,
-    DEVICE_NT1 = 18,
-    SYSTEM_NT0 = 17,
-    SYSTEM_NT1 = 19,
-};
-
 template <index_t N,
          amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
 CK_TILE_DEVICE thread_buffer<int8_t, N>
--- a/include/ck_tile/core/arch/amd_buffer_coherence.hpp
+++ b/include/ck_tile/core/arch/amd_buffer_coherence.hpp
@@ -0,0 +1,124 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+namespace ck_tile {
+
+// memory coherency bit for buffer store/load instruction
+// check ISA manual for each GFX target
+// e.g. for
+// https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf,
+// page 67~68
+enum struct amd_buffer_coherence_enum
+{
+    coherence_default = 0, // default value
+#if defined(__gfx12__)
+    // Temporal hint
+    RT    = 0, // regular temporal
+    NT    = 1, // non temporal
+    HT    = 2, // high priority temporal
+    LU    = 3, // last use (load op)
+    WB    = 3, // same as HT, overrides WR in far cache (store op)
+    NT_RT = 4, // non temporal for near cache, regular for far cache
+    RT_NT = 5, // regular for near cache, non-temporal for far cache
+    NT_HT = 6, // non temporal for near cache, high priority for far cache
+    NT_WB = 7, // non temporal for near cache, WB for far cache
+               // (store op, reserved for load op)
+    // Scope
+    CU     = 0,
+    SE     = 8,
+    DEVICE = 16,
+    SYSTEM = 24,
+    // Temporal Hint for CU
+    CU_RT    = RT | CU,
+    CU_NT    = NT | CU,
+    CU_HT    = HT | CU,
+    CU_LU    = LU | CU,
+    CU_WB    = WB | CU,
+    CU_NT_RT = NT_RT | CU,
+    CU_RT_NT = RT_NT | CU,
+    CU_NT_HT = NT_HT | CU,
+    CU_NT_WB = NT_WB | CU,
+    // Temporal Hint for SE
+    SE_RT    = RT | SE,
+    SE_NT    = NT | SE,
+    SE_HT    = HT | SE,
+    SE_LU    = LU | SE,
+    SE_WB    = WB | SE,
+    SE_NT_RT = NT_RT | SE,
+    SE_RT_NT = RT_NT | SE,
+    SE_NT_HT = NT_HT | SE,
+    SE_NT_WB = NT_WB | SE,
+    // Temporal Hint for DEVICE
+    DEVICE_RT    = RT | DEVICE,
+    DEVICE_NT    = NT | DEVICE,
+    DEVICE_HT    = HT | DEVICE,
+    DEVICE_LU    = LU | DEVICE,
+    DEVICE_WB    = WB | DEVICE,
+    DEVICE_NT_RT = NT_RT | DEVICE,
+    DEVICE_RT_NT = RT_NT | DEVICE,
+    DEVICE_NT_HT = NT_HT | DEVICE,
+    DEVICE_NT_WB = NT_WB | DEVICE,
+    // Temporal Hint for SYSTEM
+    SYSTEM_RT    = RT | SYSTEM,
+    SYSTEM_NT    = NT | SYSTEM,
+    SYSTEM_HT    = HT | SYSTEM,
+    SYSTEM_LU    = LU | SYSTEM,
+    SYSTEM_WB    = WB | SYSTEM,
+    SYSTEM_NT_RT = NT_RT | SYSTEM,
+    SYSTEM_RT_NT = RT_NT | SYSTEM,
+    SYSTEM_NT_HT = NT_HT | SYSTEM,
+    SYSTEM_NT_WB = NT_WB | SYSTEM,
+
+    // GFX942 and GFX950 compatiblity
+    GROUP_NT0  = CU_RT,
+    GROUP_NT1  = CU_NT,
+    DEVICE_NT0 = DEVICE_RT,
+    DEVICE_NT1 = DEVICE_NT,
+    SYSTEM_NT0 = SYSTEM_RT,
+    SYSTEM_NT1 = SYSTEM_NT,
+    // Other archs compatiblity
+    glc     = DEVICE_NT,
+    slc     = SYSTEM_NT,
+    glc_slc = DEVICE_NT | SYSTEM_NT,
+
+// gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
+// SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system
+// NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse
+#elif defined(__gfx942__) || defined(__gfx950__)
+
+    WAVE   = 0,
+    GROUP  = 1,
+    DEVICE = 16,
+    SYSTEM = 17,
+    NT0    = 0,
+    NT1    = 2,
+
+    WAVE_NT0   = NT0 | WAVE,
+    WAVE_NT1   = NT1 | WAVE,
+    GROUP_NT0  = NT0 | GROUP,
+    GROUP_NT1  = NT1 | GROUP,
+    DEVICE_NT0 = NT0 | DEVICE,
+    DEVICE_NT1 = NT1 | DEVICE,
+    SYSTEM_NT0 = NT0 | SYSTEM,
+    SYSTEM_NT1 = NT1 | SYSTEM,
+
+    // Other archs compatiblity
+    glc     = DEVICE_NT1,
+    slc     = SYSTEM_NT1,
+    glc_slc = DEVICE_NT1 | SYSTEM_NT1,
+#else
+    glc     = 1,
+    slc     = 2,
+    glc_slc = 3,
+
+    // Other archs compatiblity
+    DEVICE_NT0 = 0,
+    SYSTEM_NT0 = 0,
+    DEVICE_NT1 = glc,
+    SYSTEM_NT1 = slc,
+#endif
+};
+
+} // namespace ck_tile
--- a/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp
+++ b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp
@@ -31,7 +31,9 @@ struct BaseFlatmmPipelineAGmemBGmemCRegV1
        ck_tile::ignore = K;
        if(M <= 416)
        {
+#if defined(__gfx942__) || defined(__gfx950__)
            return ck_tile::amd_buffer_coherence_enum::WAVE_NT1;
+#endif
        }
        return ck_tile::amd_buffer_coherence_enum::coherence_default;
    }
--- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp
+++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp
@@ -545,6 +545,13 @@ struct GroupedConvolutionBackwardWeightKernel
            return false;
        }

+#if defined(__gfx11__)
+        if constexpr(EpiloguePipeline::MemoryOperation != ck_tile::memory_operation_enum::set)
+        {
+            return false;
+        }
+#endif
+
        if constexpr(EpiloguePipeline_::MemoryOperation == memory_operation_enum::atomic_add)
        {
            if(kargs.k_batch == 1)
@@ -971,6 +978,12 @@ struct GroupedConvolutionBackwardWeightKernel

    CK_TILE_DEVICE void operator()(GroupedConvBwdWeightKernelArgsSpecialized& kargs) const
    {
+#if defined(__gfx11__)
+        if constexpr(EpiloguePipeline::MemoryOperation != ck_tile::memory_operation_enum::set)
+        {
+            return;
+        }
+#endif
        if constexpr(GroupedConvTraitsType_::ExplicitGemm)
        {
            CallExplicitGemm(kargs);