From 8c72adabeb1f8e0ef2e94972bcc500722c7bbb4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Wed, 14 Jan 2026 21:37:12 +0100 Subject: [PATCH] Disable ActiveWorkgroupsPerCU for different arch in wmma kernels (#3566) [ROCm/composable_kernel commit: a346cfa9607b6b334f99c8e32318cb29b81203dd] --- .../impl/device_batched_gemm_multiple_d_wmma_cshuffle_v3.hpp | 4 ++++ ...ice_grouped_conv_bwd_weight_two_stage_wmma_cshuffle_v3.hpp | 4 ++++ .../impl/device_grouped_conv_bwd_weight_wmma_cshuffle_v3.hpp | 4 ++++ 3 files changed, 12 insertions(+) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_wmma_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_wmma_cshuffle_v3.hpp index 2a1a210398..126d107725 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_wmma_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_wmma_cshuffle_v3.hpp @@ -314,6 +314,10 @@ struct DeviceBatchedGemmMultiD_Wmma_CShuffleV3 { ActiveWorkgroupsPerCU() { + if(!ck::is_gfx11_supported() && !ck::is_gfx12_supported()) + { + return; + } constexpr int dynamic_smem_size = 0; int max_occupancy = 0; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_wmma_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_wmma_cshuffle_v3.hpp index 843705692b..f9b2ff0596 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_wmma_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_wmma_cshuffle_v3.hpp @@ -466,6 +466,10 @@ struct DeviceGroupedConvBwdWeightTwoStage_Wmma_CShuffleV3 { ActiveWorkgroupsPerCU() { + if(!ck::is_gfx11_supported() && !ck::is_gfx12_supported()) + { + return; + } constexpr int dynamic_smem_size = 0; constexpr index_t minimum_occupancy = BlkGemmPipeSched == BlockGemmPipelineScheduler::Intrawave ? 1 : 2; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle_v3.hpp index c070d8d9e9..3f8093afe1 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle_v3.hpp @@ -415,6 +415,10 @@ struct DeviceGroupedConvBwdWeight_Wmma_CShuffleV3 { ActiveWorkgroupsPerCU() { + if(!ck::is_gfx11_supported() && !ck::is_gfx12_supported()) + { + return; + } constexpr int dynamic_smem_size = 0; constexpr index_t minimum_occupancy = BlkGemmPipeSched == BlockGemmPipelineScheduler::Intrawave ? 1 : 2;