From 226dcf33c82288e8651ca5ae337bb904ed96b724 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <github-actions[bot]@users.noreply.github.com>
Date: Fri, 9 May 2025 16:06:45 +0000
Subject: [PATCH] Merge commit '6b1a339b6faca7e423fdbce67a40a8fca7445abd' into
 develop

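---
Notes (not part of the commit message; text between the "---" marker and the
first "diff --git" line is discarded by `git am` / `git apply`):
  * example/65_gemm_multiply_multiply/CMakeLists.txt: append gfx950 next to
    gfx942 in gpu_list, the list checked by `gpu IN_LIST gpu_list`.
  * example/65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp: in `case 4`,
    fill b0_e_n_k with GeneratorTensor_2<B0DataType> instead of
    GeneratorTensor_2<A0DataType>.
  * device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp: in two
    `__global__` functions, replace the explicit gfx908 / gfx90a / gfx940 /
    gfx941 / gfx942 macro list in the preprocessor guard with a single
    defined(__gfx9__) check.
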
 example/65_gemm_multiply_multiply/CMakeLists.txt            | 2 +-
 example/65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp   | 2 +-
 ...ice_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp | 6 ++----
 3 files changed, 4 insertions(+), 6 deletions(-)
diff --git a/example/65_gemm_multiply_multiply/CMakeLists.txt b/example/65_gemm_multiply_multiply/CMakeLists.txt
index 5d2a097576..8d51d43c65 100644
--- a/example/65_gemm_multiply_multiply/CMakeLists.txt
+++ b/example/65_gemm_multiply_multiply/CMakeLists.txt
@@ -7,7 +7,7 @@ add_example_executable(example_gemm_multiply_multiply_xdl_int8 gemm_multiply_mul
 add_example_executable(example_moe_gemm1_xdl_fp8 moe_gemm1_xdl_fp8.cpp)
 add_example_executable(example_moe_gemm2_xdl_fp8 moe_gemm2_xdl_fp8.cpp)
 
-list(APPEND gpu_list gfx942)
+list(APPEND gpu_list gfx942 gfx950)
 set(target 0)
 foreach(gpu IN LISTS GPU_TARGETS)
     if(gpu IN_LIST gpu_list AND target EQUAL 0)
diff --git a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp
index b9621cc9b3..3745e3d0af 100644
--- a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp
+++ b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp
@@ -281,7 +281,7 @@ int main(int argc, char* argv[])
         break;
     case 4:
         a0_t_k_k.GenerateTensorValue(GeneratorTensor_1<A0DataType>{});
-        b0_e_n_k.GenerateTensorValue(GeneratorTensor_2<A0DataType>{-2, 2});
+        b0_e_n_k.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-2, 2});
         d0_t_n.GenerateTensorValue(GeneratorTensor_1<D0DataType>{});
         d1_e_n.GenerateTensorValue(GeneratorTensor_1<D1DataType>{});
         d2_e_n.GenerateTensorValue(GeneratorTensor_1<D2DataType>{});
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
index 3028cd7cbc..41f596d160 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
@@ -179,8 +179,7 @@ __global__ void
         const ComputePtrOffsetOfN compute_ptr_offset_of_n,
         const index_t num_k_per_block)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
     // offset base pointer for each work-group
     const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z);
     const index_t n_idx = __builtin_amdgcn_readfirstlane(blockIdx.y / karg.KBatch);
@@ -251,8 +250,7 @@ __global__ void
         const ComputePtrOffsetOfN compute_ptr_offset_of_n,
         const index_t num_k_per_block)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
-    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
     const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z);
     const index_t n_idx = __builtin_amdgcn_readfirstlane(blockIdx.y / karg.KBatch);
     const index_t k_idx =