From 407bdf7eb0395757123aca007e4543606de7038f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= <barkocot@amd.com>
Date: Thu, 18 Dec 2025 10:16:22 +0100
Subject: [PATCH] Update AMD buffer coherency (#3403)

* Update AMD buffer coherency [AICK-421]

* fixes

* fix

* fixes

* fixes

* Add backward compatilibity

* fix

* fixes

* fix

* fix

* fix

* Update grouped_convolution_backward_weight_kernel.hpp

[ROCm/composable_kernel commit: 700b2ec9c02da8d367ebe8a223a6dbf16622db09]
---
 .../20_grouped_convolution/CMakeLists.txt     |   2 +-
 ...tion_backward_weight_two_stage_invoker.hpp |   5 +-
 include/ck/utility/amd_buffer_addressing.hpp  |  25 +---
 .../amd_buffer_addressing_builtins.hpp        |  25 +---
 include/ck/utility/amd_buffer_coherence.hpp   | 119 +++++++++++++++++
 include/ck_tile/core.hpp                      |   1 +
 .../core/arch/amd_buffer_addressing.hpp       |  25 +---
 .../arch/amd_buffer_addressing_builtins.hpp   |  25 +---
 .../core/arch/amd_buffer_coherence.hpp        | 124 ++++++++++++++++++
 .../flatmm_pipeline_agmem_bgmem_creg_v1.hpp   |   2 +
 ...ped_convolution_backward_weight_kernel.hpp |  13 ++
 11 files changed, 268 insertions(+), 98 deletions(-)
 create mode 100644 include/ck/utility/amd_buffer_coherence.hpp
 create mode 100644 include/ck_tile/core/arch/amd_buffer_coherence.hpp

diff --git a/example/ck_tile/20_grouped_convolution/CMakeLists.txt b/example/ck_tile/20_grouped_convolution/CMakeLists.txt
index 7fcca37bd9..090aae482b 100644
--- a/example/ck_tile/20_grouped_convolution/CMakeLists.txt
+++ b/example/ck_tile/20_grouped_convolution/CMakeLists.txt
@@ -1,7 +1,7 @@
 # Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 # SPDX-License-Identifier: MIT
 
-if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx90a")
+if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx90a|gfx11|gfx12")
   set(EXAMPLE_CONV_COMPILE_OPTIONS)
   list(APPEND EXAMPLE_CONV_COMPILE_OPTIONS -mllvm -enable-noalias-to-md-conversion=0)
 
diff --git a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_two_stage_invoker.hpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_two_stage_invoker.hpp
index 9221746560..ad5e8ae70f 100644
--- a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_two_stage_invoker.hpp
+++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_two_stage_invoker.hpp
@@ -21,6 +21,9 @@ struct GroupedConvolutionBackwardWeightTwoStageInvoker
                                                  const ck_tile::stream_config& s)
     {
         using WorkspaceDataType = float;
+        // Force Vector Size C to 1 for two stage to check main
+        // two stage use case
+        constexpr ck_tile::index_t VectorSizeC = 1;
 
         // Implicit GEMM Traits
         using GemmShape = ck_tile::TileGemmShape<
@@ -39,7 +42,7 @@ struct GroupedConvolutionBackwardWeightTwoStageInvoker
                                                                  OutLayout,
                                                                  ConvConfig::VectorSizeA,
                                                                  ConvConfig::VectorSizeB,
-                                                                 ConvConfig::VectorSizeC,
+                                                                 VectorSizeC,
                                                                  ConvConfig::NumGroupsToMerge>;
 
         using TilePartitioner = ck_tile::GemmSpatiallyLocalTilePartitioner<
diff --git a/include/ck/utility/amd_buffer_addressing.hpp b/include/ck/utility/amd_buffer_addressing.hpp
index f9404e00b7..3843c0c301 100644
--- a/include/ck/utility/amd_buffer_addressing.hpp
+++ b/include/ck/utility/amd_buffer_addressing.hpp
@@ -3,6 +3,7 @@
 
 #pragma once
 #include "data_type.hpp"
+#include "ck/utility/amd_buffer_coherence.hpp"
 
 namespace ck {
 
@@ -286,30 +287,6 @@ llvm_amdgcn_raw_buffer_atomic_max_fp64(double vdata,
                                        int soffset,    // dst_wave_addr_offset
                                        int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64");
 
-// memory coherency bit for buffer store/load instruction
-// check ISA manual for each GFX target
-// e.g. for
-// https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf,
-// page 67~68
-enum struct AmdBufferCoherenceEnum
-{
-    DefaultCoherence = 0, // default value
-    GLC              = 1,
-    SLC              = 2,
-    GLC_SLC          = 3,
-    // gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
-    // SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system
-    // NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse
-    WAVE_NT0   = 0,
-    WAVE_NT1   = 2,
-    GROUP_NT0  = 1,
-    GROUP_NT1  = 3,
-    DEVICE_NT0 = 8,
-    DEVICE_NT1 = 10,
-    SYSTEM_NT0 = 9,
-    SYSTEM_NT1 = 11,
-};
-
 template <index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
 __device__ typename vector_type<int8_t, N>::type
 amd_buffer_load_impl_raw(int32x4_t src_wave_buffer_resource,
diff --git a/include/ck/utility/amd_buffer_addressing_builtins.hpp b/include/ck/utility/amd_buffer_addressing_builtins.hpp
index cddb8b7e5c..f4ea9c80f1 100644
--- a/include/ck/utility/amd_buffer_addressing_builtins.hpp
+++ b/include/ck/utility/amd_buffer_addressing_builtins.hpp
@@ -3,6 +3,7 @@
 
 #pragma once
 #include "data_type.hpp"
+#include "ck/utility/amd_buffer_coherence.hpp"
 
 namespace ck {
 
@@ -106,30 +107,6 @@ llvm_amdgcn_raw_buffer_atomic_max_fp64(double vdata,
                                        int soffset,    // dst_wave_addr_offset
                                        int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64");
 
-// memory coherency bit for buffer store/load instruction
-// check ISA manual for each GFX target
-// e.g. for
-// https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf,
-// page 67~68
-enum struct AmdBufferCoherenceEnum
-{
-    DefaultCoherence = 0, // default value
-    GLC              = 1,
-    SLC              = 2,
-    GLC_SLC          = 3,
-    // gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
-    // SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system
-    // NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse
-    WAVE_NT0   = 0,
-    WAVE_NT1   = 2,
-    GROUP_NT0  = 1,
-    GROUP_NT1  = 3,
-    DEVICE_NT0 = 8,
-    DEVICE_NT1 = 10,
-    SYSTEM_NT0 = 9,
-    SYSTEM_NT1 = 11,
-};
-
 template <index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
 __device__ typename vector_type<int8_t, N>::type
 amd_buffer_load_impl_raw(__amdgpu_buffer_rsrc_t src_wave_buffer_resource,
diff --git a/include/ck/utility/amd_buffer_coherence.hpp b/include/ck/utility/amd_buffer_coherence.hpp
new file mode 100644
index 0000000000..361f6d7b41
--- /dev/null
+++ b/include/ck/utility/amd_buffer_coherence.hpp
@@ -0,0 +1,119 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+namespace ck {
+
+enum struct AmdBufferCoherenceEnum
+{
+    DefaultCoherence = 0, // default value
+#if defined(__gfx12__)
+    // Temporal hint
+    RT    = 0, // regular temporal
+    NT    = 1, // non temporal
+    HT    = 2, // high priority temporal
+    LU    = 3, // last use (load op)
+    WB    = 3, // same as HT, overrides WR in far cache (store op)
+    NT_RT = 4, // non temporal for near cache, regular for far cache
+    RT_NT = 5, // regular for near cache, non-temporal for far cache
+    NT_HT = 6, // non temporal for near cache, high priority for far cache
+    NT_WB = 7, // non temporal for near cache, WB for far cache
+               // (store op, reserved for load op)
+    // Scope
+    CU     = 0,
+    SE     = 8,
+    DEVICE = 16,
+    SYSTEM = 24,
+    // Temporal Hint for CU
+    CU_RT    = RT | CU,
+    CU_NT    = NT | CU,
+    CU_HT    = HT | CU,
+    CU_LU    = LU | CU,
+    CU_WB    = WB | CU,
+    CU_NT_RT = NT_RT | CU,
+    CU_RT_NT = RT_NT | CU,
+    CU_NT_HT = NT_HT | CU,
+    CU_NT_WB = NT_WB | CU,
+    // Temporal Hint for SE
+    SE_RT    = RT | SE,
+    SE_NT    = NT | SE,
+    SE_HT    = HT | SE,
+    SE_LU    = LU | SE,
+    SE_WB    = WB | SE,
+    SE_NT_RT = NT_RT | SE,
+    SE_RT_NT = RT_NT | SE,
+    SE_NT_HT = NT_HT | SE,
+    SE_NT_WB = NT_WB | SE,
+    // Temporal Hint for DEVICE
+    DEVICE_RT    = RT | DEVICE,
+    DEVICE_NT    = NT | DEVICE,
+    DEVICE_HT    = HT | DEVICE,
+    DEVICE_LU    = LU | DEVICE,
+    DEVICE_WB    = WB | DEVICE,
+    DEVICE_NT_RT = NT_RT | DEVICE,
+    DEVICE_RT_NT = RT_NT | DEVICE,
+    DEVICE_NT_HT = NT_HT | DEVICE,
+    DEVICE_NT_WB = NT_WB | DEVICE,
+    // Temporal Hint for SYSTEM
+    SYSTEM_RT    = RT | SYSTEM,
+    SYSTEM_NT    = NT | SYSTEM,
+    SYSTEM_HT    = HT | SYSTEM,
+    SYSTEM_LU    = LU | SYSTEM,
+    SYSTEM_WB    = WB | SYSTEM,
+    SYSTEM_NT_RT = NT_RT | SYSTEM,
+    SYSTEM_RT_NT = RT_NT | SYSTEM,
+    SYSTEM_NT_HT = NT_HT | SYSTEM,
+    SYSTEM_NT_WB = NT_WB | SYSTEM,
+
+    // GFX942 and GFX950 compatiblity
+    GROUP_NT0  = CU_RT,
+    GROUP_NT1  = CU_NT,
+    DEVICE_NT0 = DEVICE_RT,
+    DEVICE_NT1 = DEVICE_NT,
+    SYSTEM_NT0 = SYSTEM_RT,
+    SYSTEM_NT1 = SYSTEM_NT,
+    // Other archs compatiblity
+    GLC     = DEVICE_NT,
+    SLC     = SYSTEM_NT,
+    GLC_SLC = DEVICE_NT | SYSTEM_NT,
+
+// gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
+// SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system
+// NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse
+#elif defined(__gfx942__) || defined(__gfx950__)
+
+    WAVE   = 0,
+    GROUP  = 1,
+    DEVICE = 16,
+    SYSTEM = 17,
+    NT0    = 0,
+    NT1    = 2,
+
+    WAVE_NT0   = NT0 | WAVE,
+    WAVE_NT1   = NT1 | WAVE,
+    GROUP_NT0  = NT0 | GROUP,
+    GROUP_NT1  = NT1 | GROUP,
+    DEVICE_NT0 = NT0 | DEVICE,
+    DEVICE_NT1 = NT1 | DEVICE,
+    SYSTEM_NT0 = NT0 | SYSTEM,
+    SYSTEM_NT1 = NT1 | SYSTEM,
+
+    // Other archs compatiblity
+    GLC     = DEVICE_NT1,
+    SLC     = SYSTEM_NT1,
+    GLC_SLC = DEVICE_NT1 | SYSTEM_NT1,
+#else
+    GLC     = 1,
+    SLC     = 2,
+    GLC_SLC = 3,
+
+    // Other archs compatiblity
+    DEVICE_NT0 = 0,
+    SYSTEM_NT0 = 0,
+    DEVICE_NT1 = GLC,
+    SYSTEM_NT1 = SLC,
+#endif
+};
+
+} // namespace ck
diff --git a/include/ck_tile/core.hpp b/include/ck_tile/core.hpp
index d28d29a0ef..01e1d00b59 100644
--- a/include/ck_tile/core.hpp
+++ b/include/ck_tile/core.hpp
@@ -9,6 +9,7 @@
 #include "ck_tile/core/algorithm/static_encoding_pattern.hpp"
 #include "ck_tile/core/arch/amd_buffer_addressing.hpp"
 #include "ck_tile/core/arch/amd_buffer_addressing_builtins.hpp"
+#include "ck_tile/core/arch/amd_buffer_coherence.hpp"
 #include "ck_tile/core/arch/amd_transpose_load_encoding.hpp"
 #include "ck_tile/core/arch/arch.hpp"
 #include "ck_tile/core/arch/generic_memory_space_atomic.hpp"
diff --git a/include/ck_tile/core/arch/amd_buffer_addressing.hpp b/include/ck_tile/core/arch/amd_buffer_addressing.hpp
index 9f79bdbee6..7af2f558ad 100644
--- a/include/ck_tile/core/arch/amd_buffer_addressing.hpp
+++ b/include/ck_tile/core/arch/amd_buffer_addressing.hpp
@@ -16,6 +16,7 @@
 #include "ck_tile/core/utility/bit_cast.hpp"
 #include "ck_tile/core/utility/functional.hpp"
 #include "ck_tile/core/utility/ignore.hpp"
+#include "ck_tile/core/arch/amd_buffer_coherence.hpp"
 
 // This attribute gives a hint to the compiler that a branch is likely to be taken.
 // Then, the compiler should remove if possible the associated s_cbranch_execz branch that would
@@ -1409,30 +1410,6 @@ CK_TILE_DEVICE void async_buffer_load_fence(index_t cnt = 0)
     asm volatile("s_waitcnt vmcnt(%0)" : : "n"(cnt) : "memory");
 }
 
-// memory coherency bit for buffer store/load instruction
-// check ISA manual for each GFX target
-// e.g. for
-// https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf,
-// page 67~68
-enum struct amd_buffer_coherence_enum
-{
-    coherence_default = 0, // default value
-    glc               = 1,
-    slc               = 2,
-    glc_slc           = 3,
-    // gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
-    // SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system
-    // NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse
-    WAVE_NT0   = 0,
-    WAVE_NT1   = 2,
-    GROUP_NT0  = 1,
-    GROUP_NT1  = 3,
-    DEVICE_NT0 = 16,
-    DEVICE_NT1 = 18,
-    SYSTEM_NT0 = 17,
-    SYSTEM_NT1 = 19,
-};
-
 template <index_t N,
           amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
 CK_TILE_DEVICE thread_buffer<int8_t, N>
diff --git a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp
index 4627b249d6..562b246ac3 100644
--- a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp
+++ b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp
@@ -16,6 +16,7 @@
 #include "ck_tile/core/utility/bit_cast.hpp"
 #include "ck_tile/core/utility/functional.hpp"
 #include "ck_tile/core/utility/ignore.hpp"
+#include "ck_tile/core/arch/amd_buffer_coherence.hpp"
 
 using as3_uint32_ptr = uint32_t __attribute__((address_space(3)))*;
 
@@ -1277,30 +1278,6 @@ CK_TILE_DEVICE void async_buffer_load_fence(index_t cnt = 0)
     asm volatile("s_waitcnt vmcnt(%0)" : : "n"(cnt) : "memory");
 }
 
-// memory coherency bit for buffer store/load instruction
-// check ISA manual for each GFX target
-// e.g. for
-// https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf,
-// page 67~68
-enum struct amd_buffer_coherence_enum
-{
-    coherence_default = 0, // default value
-    glc               = 1,
-    slc               = 2,
-    glc_slc           = 3,
-    // gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
-    // SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system
-    // NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse
-    WAVE_NT0   = 0,
-    WAVE_NT1   = 2,
-    GROUP_NT0  = 1,
-    GROUP_NT1  = 3,
-    DEVICE_NT0 = 16,
-    DEVICE_NT1 = 18,
-    SYSTEM_NT0 = 17,
-    SYSTEM_NT1 = 19,
-};
-
 template <index_t N,
           amd_buffer_coherence_enum coherence = amd_buffer_coherence_enum::coherence_default>
 CK_TILE_DEVICE thread_buffer<int8_t, N>
diff --git a/include/ck_tile/core/arch/amd_buffer_coherence.hpp b/include/ck_tile/core/arch/amd_buffer_coherence.hpp
new file mode 100644
index 0000000000..2e38c77291
--- /dev/null
+++ b/include/ck_tile/core/arch/amd_buffer_coherence.hpp
@@ -0,0 +1,124 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+namespace ck_tile {
+
+// memory coherency bit for buffer store/load instruction
+// check ISA manual for each GFX target
+// e.g. for
+// https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf,
+// page 67~68
+enum struct amd_buffer_coherence_enum
+{
+    coherence_default = 0, // default value
+#if defined(__gfx12__)
+    // Temporal hint
+    RT    = 0, // regular temporal
+    NT    = 1, // non temporal
+    HT    = 2, // high priority temporal
+    LU    = 3, // last use (load op)
+    WB    = 3, // same as HT, overrides WR in far cache (store op)
+    NT_RT = 4, // non temporal for near cache, regular for far cache
+    RT_NT = 5, // regular for near cache, non-temporal for far cache
+    NT_HT = 6, // non temporal for near cache, high priority for far cache
+    NT_WB = 7, // non temporal for near cache, WB for far cache
+               // (store op, reserved for load op)
+    // Scope
+    CU     = 0,
+    SE     = 8,
+    DEVICE = 16,
+    SYSTEM = 24,
+    // Temporal Hint for CU
+    CU_RT    = RT | CU,
+    CU_NT    = NT | CU,
+    CU_HT    = HT | CU,
+    CU_LU    = LU | CU,
+    CU_WB    = WB | CU,
+    CU_NT_RT = NT_RT | CU,
+    CU_RT_NT = RT_NT | CU,
+    CU_NT_HT = NT_HT | CU,
+    CU_NT_WB = NT_WB | CU,
+    // Temporal Hint for SE
+    SE_RT    = RT | SE,
+    SE_NT    = NT | SE,
+    SE_HT    = HT | SE,
+    SE_LU    = LU | SE,
+    SE_WB    = WB | SE,
+    SE_NT_RT = NT_RT | SE,
+    SE_RT_NT = RT_NT | SE,
+    SE_NT_HT = NT_HT | SE,
+    SE_NT_WB = NT_WB | SE,
+    // Temporal Hint for DEVICE
+    DEVICE_RT    = RT | DEVICE,
+    DEVICE_NT    = NT | DEVICE,
+    DEVICE_HT    = HT | DEVICE,
+    DEVICE_LU    = LU | DEVICE,
+    DEVICE_WB    = WB | DEVICE,
+    DEVICE_NT_RT = NT_RT | DEVICE,
+    DEVICE_RT_NT = RT_NT | DEVICE,
+    DEVICE_NT_HT = NT_HT | DEVICE,
+    DEVICE_NT_WB = NT_WB | DEVICE,
+    // Temporal Hint for SYSTEM
+    SYSTEM_RT    = RT | SYSTEM,
+    SYSTEM_NT    = NT | SYSTEM,
+    SYSTEM_HT    = HT | SYSTEM,
+    SYSTEM_LU    = LU | SYSTEM,
+    SYSTEM_WB    = WB | SYSTEM,
+    SYSTEM_NT_RT = NT_RT | SYSTEM,
+    SYSTEM_RT_NT = RT_NT | SYSTEM,
+    SYSTEM_NT_HT = NT_HT | SYSTEM,
+    SYSTEM_NT_WB = NT_WB | SYSTEM,
+
+    // GFX942 and GFX950 compatiblity
+    GROUP_NT0  = CU_RT,
+    GROUP_NT1  = CU_NT,
+    DEVICE_NT0 = DEVICE_RT,
+    DEVICE_NT1 = DEVICE_NT,
+    SYSTEM_NT0 = SYSTEM_RT,
+    SYSTEM_NT1 = SYSTEM_NT,
+    // Other archs compatiblity
+    glc     = DEVICE_NT,
+    slc     = SYSTEM_NT,
+    glc_slc = DEVICE_NT | SYSTEM_NT,
+
+// gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
+// SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system
+// NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse
+#elif defined(__gfx942__) || defined(__gfx950__)
+
+    WAVE   = 0,
+    GROUP  = 1,
+    DEVICE = 16,
+    SYSTEM = 17,
+    NT0    = 0,
+    NT1    = 2,
+
+    WAVE_NT0   = NT0 | WAVE,
+    WAVE_NT1   = NT1 | WAVE,
+    GROUP_NT0  = NT0 | GROUP,
+    GROUP_NT1  = NT1 | GROUP,
+    DEVICE_NT0 = NT0 | DEVICE,
+    DEVICE_NT1 = NT1 | DEVICE,
+    SYSTEM_NT0 = NT0 | SYSTEM,
+    SYSTEM_NT1 = NT1 | SYSTEM,
+
+    // Other archs compatiblity
+    glc     = DEVICE_NT1,
+    slc     = SYSTEM_NT1,
+    glc_slc = DEVICE_NT1 | SYSTEM_NT1,
+#else
+    glc     = 1,
+    slc     = 2,
+    glc_slc = 3,
+
+    // Other archs compatiblity
+    DEVICE_NT0 = 0,
+    SYSTEM_NT0 = 0,
+    DEVICE_NT1 = glc,
+    SYSTEM_NT1 = slc,
+#endif
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp
index e4f186dead..8ad99e9399 100644
--- a/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp
+++ b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp
@@ -31,7 +31,9 @@ struct BaseFlatmmPipelineAGmemBGmemCRegV1
         ck_tile::ignore = K;
         if(M <= 416)
         {
+#if defined(__gfx942__) || defined(__gfx950__)
             return ck_tile::amd_buffer_coherence_enum::WAVE_NT1;
+#endif
         }
         return ck_tile::amd_buffer_coherence_enum::coherence_default;
     }
diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp
index 2e80ff64c1..1004ed81b1 100644
--- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp
+++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp
@@ -545,6 +545,13 @@ struct GroupedConvolutionBackwardWeightKernel
             return false;
         }
 
+#if defined(__gfx11__)
+        if constexpr(EpiloguePipeline::MemoryOperation != ck_tile::memory_operation_enum::set)
+        {
+            return false;
+        }
+#endif
+
         if constexpr(EpiloguePipeline_::MemoryOperation == memory_operation_enum::atomic_add)
         {
             if(kargs.k_batch == 1)
@@ -971,6 +978,12 @@ struct GroupedConvolutionBackwardWeightKernel
 
     CK_TILE_DEVICE void operator()(GroupedConvBwdWeightKernelArgsSpecialized& kargs) const
     {
+#if defined(__gfx11__)
+        if constexpr(EpiloguePipeline::MemoryOperation != ck_tile::memory_operation_enum::set)
+        {
+            return;
+        }
+#endif
         if constexpr(GroupedConvTraitsType_::ExplicitGemm)
         {
             CallExplicitGemm(kargs);