From 31b9e40b3d11c7e2ec70c43f039d27fa3d9451ce Mon Sep 17 00:00:00 2001 From: carlushuang Date: Thu, 6 Mar 2025 12:01:25 +0800 Subject: [PATCH] [CK_TILE][HOTFIX] WA for address space by disable it completely (#1947) * port all moe changes from ck_moe_gemm branch * refine codes in the pr * fix tail odd * fix clang format * fix clang format2 * make hot loop scheduler compatible with 16x16 and 32x32 * clang format * fix per token quant * rename moe example * clang format * WA for address space by disable it completely * hot fix moe gemm2 --------- Co-authored-by: coderfeli Co-authored-by: feli [ROCm/composable_kernel commit: c12fb0a624e4d56d4438d1241e5d05a2cbfba9e4] --- include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp | 4 ++-- include/ck_tile/core/config.hpp | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp index d0e06a6c53..5337fd5e2c 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp @@ -1492,7 +1492,7 @@ struct GridwiseMoeGemm using CDEBlockTransferCluster = CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock; const auto EGlobalMemoryDataOperation = CGlobalMemoryDataOperation; - constexpr index_t scatter_weight_idx = 1; + constexpr index_t scatter_weight_idx = IsInputGemm ? 1 : 3; // hack fix felix auto cde_block_copy_lds_and_global = ThreadGroupTensorSliceTransfer_v7r3_scatter< ThisThreadBlock, decltype(container_concat(make_tuple(CShuffleDataType{}), DsDataType{})), @@ -2000,7 +2000,7 @@ struct GridwiseMoeGemm using CDEBlockTransferCluster = CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock; const auto EGlobalMemoryDataOperation = CGlobalMemoryDataOperation; - constexpr index_t scatter_weight_idx = 1; + constexpr index_t scatter_weight_idx = IsInputGemm ? 1 : 3; // hack fix felix auto cde_block_copy_lds_and_global = ThreadGroupTensorSliceTransfer_v7r3_scatter< ThisThreadBlock, decltype(container_concat(make_tuple(CShuffleDataType{}), DsDataType{})), diff --git a/include/ck_tile/core/config.hpp b/include/ck_tile/core/config.hpp index 7ccac5bd5b..aaaf4d4259 100644 --- a/include/ck_tile/core/config.hpp +++ b/include/ck_tile/core/config.hpp @@ -50,7 +50,8 @@ CK_TILE_DECLARE_ENV_VAR_BOOL(CK_TILE_LOGGING) // implementing the "memory address space" attribute // https://llvm.org/docs/AMDGPUUsage.html#amdgpu-address-spaces-table -#ifdef __HIPCC__ +// WA for https://github.com/ROCm/composable_kernel/issues/1946 +#if 0 #define CK_TILE_GENERIC_ADDR __attribute__((address_space(0))) #define CK_TILE_GLOBAL_ADDR __attribute__((address_space(1))) #define CK_TILE_LDS_ADDR __attribute__((address_space(3)))