From 04c756ea9303468d9fd101e204297fb2093f91ca Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Wed, 25 Sep 2024 13:45:38 -0700 Subject: [PATCH] Fix compilation errors with Clang20.0. (#1533) * fix clang20 compilation errors for gfx90a * fix clang20 compilation errors for gfx11 targets [ROCm/composable_kernel commit: 42e6dceaccda48ec99eff93e181df49abef69c11] --- .../block/blockwise_gemm_pipeline_xdlops.hpp | 4 +- .../ck/tensor_operation/gpu/warp/dpp_gemm.hpp | 18 +++--- .../tensor_operation/gpu/warp/wmma_gemm.hpp | 12 ++-- .../tensor_operation/gpu/warp/xdlops_gemm.hpp | 58 +++++++++---------- ...block_fmha_bwd_pipeline_default_policy.hpp | 10 ++-- 5 files changed, 51 insertions(+), 51 deletions(-) diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops.hpp index 1121cc4550..438d7d8ac3 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops.hpp @@ -406,7 +406,7 @@ struct BlockwiseGemmXdlops_pipeline_v4 } template <> - __device__ static constexpr auto TailScheduler<1>() + __device__ constexpr auto TailScheduler<1>() { // schedule constexpr auto num_ds_read_inst = @@ -433,7 +433,7 @@ struct BlockwiseGemmXdlops_pipeline_v4 } template <> - __device__ static constexpr auto TailScheduler<2>() + __device__ constexpr auto TailScheduler<2>() { // schedule constexpr auto num_ds_read_inst = diff --git a/include/ck/tensor_operation/gpu/warp/dpp_gemm.hpp b/include/ck/tensor_operation/gpu/warp/dpp_gemm.hpp index a184431648..409bb9f674 100644 --- a/include/ck/tensor_operation/gpu/warp/dpp_gemm.hpp +++ b/include/ck/tensor_operation/gpu/warp/dpp_gemm.hpp @@ -324,55 +324,55 @@ struct DppSelector static constexpr auto GetDpp(); template <> - static constexpr auto GetDpp() + constexpr auto GetDpp() { return DppInstr::dpp8_f16_8x32x2; } template <> - static constexpr auto GetDpp() + constexpr auto GetDpp() { return DppInstr::dpp8_f16_8x16x2; } template <> - static constexpr auto GetDpp() + constexpr auto GetDpp() { return DppInstr::dpp8_f16_16x16x2; } template <> - static constexpr auto GetDpp() + constexpr auto GetDpp() { return DppInstr::dpp8_f16_32x8x2; } template <> - static constexpr auto GetDpp() + constexpr auto GetDpp() { return DppInstr::dpp8_f16_1x32x2; } template <> - static constexpr auto GetDpp() + constexpr auto GetDpp() { return DppInstr::dpp8_f16_2x32x2; } template <> - static constexpr auto GetDpp() + constexpr auto GetDpp() { return DppInstr::dpp8_f16_2x16x2; } template <> - static constexpr auto GetDpp() + constexpr auto GetDpp() { return DppInstr::dpp8_f16_4x16x2; } template <> - static constexpr auto GetDpp() + constexpr auto GetDpp() { return DppInstr::dpp8_f16_4x32x2; } diff --git a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp index 9a9ebf5595..b435a2a129 100644 --- a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp +++ b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp @@ -415,7 +415,7 @@ struct WmmaSelector static constexpr auto GetWmma(); template <> - static constexpr auto GetWmma() + constexpr auto GetWmma() { #ifdef __gfx12__ return WmmaInstr::wmma_f32_16x16x16_f16_gfx12; @@ -425,7 +425,7 @@ struct WmmaSelector } template <> - static constexpr auto GetWmma() + constexpr auto GetWmma() { #ifdef __gfx12__ return WmmaInstr::wmma_f32_16x16x16_bf16_gfx12; @@ -435,19 +435,19 @@ struct WmmaSelector } template <> - static constexpr auto GetWmma() + constexpr auto GetWmma() { return WmmaInstr::wmma_f16_16x16x16_f16; } template <> - static constexpr auto GetWmma() + constexpr auto GetWmma() { return WmmaInstr::wmma_bf16_16x16x16_bf16; } template <> - static constexpr auto GetWmma() + constexpr auto GetWmma() { #ifdef __gfx12__ return WmmaInstr::wmma_i32_16x16x16_iu8_gfx12; @@ -458,7 +458,7 @@ struct WmmaSelector #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 template <> - static constexpr auto GetWmma() + constexpr auto GetWmma() { return WmmaInstr::wmma_i32_16x16x16_iu4; } diff --git a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp index 835075b7f2..24fac91e22 100644 --- a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp +++ b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp @@ -651,97 +651,97 @@ struct MfmaSelector static constexpr auto GetMfma(); template <> - static constexpr auto GetMfma() + constexpr auto GetMfma() { return MfmaInstr::mfma_f64_16x16x4f64; } template <> - static constexpr auto GetMfma() + constexpr auto GetMfma() { return MfmaInstr::mfma_f32_32x32x1xf32; } template <> - static constexpr auto GetMfma() + constexpr auto GetMfma() { return MfmaInstr::mfma_f32_32x32x1xf32; } template <> - static constexpr auto GetMfma() + constexpr auto GetMfma() { return MfmaInstr::mfma_f32_16x16x1xf32; } template <> - static constexpr auto GetMfma() + constexpr auto GetMfma() { return MfmaInstr::mfma_f32_4x4x1xf32; } template <> - static constexpr auto GetMfma() + constexpr auto GetMfma() { return MfmaInstr::mfma_f32_4x4x1xf32; } template <> - static constexpr auto GetMfma() + constexpr auto GetMfma() { return MfmaInstr::mfma_f32_32x32x2xf32; } template <> - static constexpr auto GetMfma() + constexpr auto GetMfma() { return MfmaInstr::mfma_f32_16x16x4xf32; } template <> - static constexpr auto GetMfma() + constexpr auto GetMfma() { return MfmaInstr::mfma_f32_32x32x4f16; } template <> - static constexpr auto GetMfma() + constexpr auto GetMfma() { return MfmaInstr::mfma_f32_32x32x4f16; } template <> - static constexpr auto GetMfma() + constexpr auto GetMfma() { return MfmaInstr::mfma_f32_32x32x8f16; } template <> - static constexpr auto GetMfma() + constexpr auto GetMfma() { return MfmaInstr::mfma_f32_16x16x16f16; } template <> - static constexpr auto GetMfma() + constexpr auto GetMfma() { return MfmaInstr::mfma_f32_16x16x4f16; } template <> - static constexpr auto GetMfma() + constexpr auto GetMfma() { return MfmaInstr::mfma_f32_4x4x4f16; } template <> - static constexpr auto GetMfma() + constexpr auto GetMfma() { return MfmaInstr::mfma_f32_4x4x4f16; } template <> - static constexpr auto GetMfma() + constexpr auto GetMfma() { #if defined(CK_USE_AMD_MFMA_BF16_1K_OP) return MfmaInstr::mfma_f32_32x32x8bf16_1k; @@ -751,7 +751,7 @@ struct MfmaSelector } template <> - static constexpr auto GetMfma() + constexpr auto GetMfma() { #if defined(CK_USE_AMD_MFMA_BF16_1K_OP) return MfmaInstr::mfma_f32_16x16x16bf16_1k; @@ -762,72 +762,72 @@ struct MfmaSelector #if defined(CK_USE_AMD_MFMA_GFX940) template <> - static constexpr auto GetMfma() + constexpr auto GetMfma() { return MfmaInstr::mfma_i32_32x32x16i8; } template <> - static constexpr auto GetMfma() + constexpr auto GetMfma() { return MfmaInstr::mfma_i32_16x16x32i8; } #else template <> - static constexpr auto GetMfma() + constexpr auto GetMfma() { return MfmaInstr::mfma_i32_32x32x8i8; } template <> - static constexpr auto GetMfma() + constexpr auto GetMfma() { return MfmaInstr::mfma_i32_16x16x16i8; } #endif template <> - static constexpr auto GetMfma() + constexpr auto GetMfma() { return MfmaInstr::mfma_f32_32x32x16f8f8; } template <> - static constexpr auto GetMfma() + constexpr auto GetMfma() { return MfmaInstr::mfma_f32_16x16x32f8f8; } template <> - static constexpr auto GetMfma() + constexpr auto GetMfma() { return MfmaInstr::mfma_f32_32x32x16bf8bf8; } template <> - static constexpr auto GetMfma() + constexpr auto GetMfma() { return MfmaInstr::mfma_f32_16x16x32bf8bf8; } template <> - static constexpr auto GetMfma() + constexpr auto GetMfma() { return MfmaInstr::mfma_f32_32x32x16f8bf8; } template <> - static constexpr auto GetMfma() + constexpr auto GetMfma() { return MfmaInstr::mfma_f32_16x16x32f8bf8; } template <> - static constexpr auto GetMfma() + constexpr auto GetMfma() { return MfmaInstr::mfma_f32_32x32x16bf8f8; } template <> - static constexpr auto GetMfma() + constexpr auto GetMfma() { return MfmaInstr::mfma_f32_16x16x32bf8f8; } diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_default_policy.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_default_policy.hpp index 9e1ab81125..8647a7d25a 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_default_policy.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_pipeline_default_policy.hpp @@ -1727,7 +1727,7 @@ struct BlockFmhaBwdPipelineDefaultPolicy } template <> - CK_TILE_DEVICE static constexpr void GemmStagedScheduler<0>() + CK_TILE_DEVICE constexpr void GemmStagedScheduler<0>() { // Mem: Q, LSE, OGrad, D global load, OGrad^T LDS load // Comp: Q x K @@ -1759,7 +1759,7 @@ struct BlockFmhaBwdPipelineDefaultPolicy } template <> - CK_TILE_DEVICE static constexpr void GemmStagedScheduler<1>() + CK_TILE_DEVICE constexpr void GemmStagedScheduler<1>() { // Mem: Q^T LDS load // Comp: OGrad x V @@ -1777,7 +1777,7 @@ struct BlockFmhaBwdPipelineDefaultPolicy } template <> - CK_TILE_DEVICE static constexpr void GemmStagedScheduler<2>() + CK_TILE_DEVICE constexpr void GemmStagedScheduler<2>() { // Mem: Q, QT, LSE, OGrad, OGradT, D, LDS store // Comp: PT x OGrad @@ -1796,7 +1796,7 @@ struct BlockFmhaBwdPipelineDefaultPolicy } template <> - CK_TILE_DEVICE static constexpr void GemmStagedScheduler<3>() + CK_TILE_DEVICE constexpr void GemmStagedScheduler<3>() { // Mem: SGradT LDS store, SGrad, Q, LSE LDS load. // Comp: SGradT x QT @@ -1830,7 +1830,7 @@ struct BlockFmhaBwdPipelineDefaultPolicy } template <> - CK_TILE_DEVICE static constexpr void GemmStagedScheduler<4>() + CK_TILE_DEVICE constexpr void GemmStagedScheduler<4>() { // Mem: SGrad, OGrad, D LDS load. // Comp: SGrad x KT