diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp index d0e06a6c53..5337fd5e2c 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp @@ -1492,7 +1492,7 @@ struct GridwiseMoeGemm using CDEBlockTransferCluster = CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock; const auto EGlobalMemoryDataOperation = CGlobalMemoryDataOperation; - constexpr index_t scatter_weight_idx = 1; + constexpr index_t scatter_weight_idx = IsInputGemm ? 1 : 3; // hack fix felix auto cde_block_copy_lds_and_global = ThreadGroupTensorSliceTransfer_v7r3_scatter< ThisThreadBlock, decltype(container_concat(make_tuple(CShuffleDataType{}), DsDataType{})), @@ -2000,7 +2000,7 @@ struct GridwiseMoeGemm using CDEBlockTransferCluster = CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock; const auto EGlobalMemoryDataOperation = CGlobalMemoryDataOperation; - constexpr index_t scatter_weight_idx = 1; + constexpr index_t scatter_weight_idx = IsInputGemm ? 1 : 3; // hack fix felix auto cde_block_copy_lds_and_global = ThreadGroupTensorSliceTransfer_v7r3_scatter< ThisThreadBlock, decltype(container_concat(make_tuple(CShuffleDataType{}), DsDataType{})), diff --git a/include/ck_tile/core/config.hpp b/include/ck_tile/core/config.hpp index 7ccac5bd5b..aaaf4d4259 100644 --- a/include/ck_tile/core/config.hpp +++ b/include/ck_tile/core/config.hpp @@ -50,7 +50,8 @@ CK_TILE_DECLARE_ENV_VAR_BOOL(CK_TILE_LOGGING) // implementing the "memory address space" attribute // https://llvm.org/docs/AMDGPUUsage.html#amdgpu-address-spaces-table -#ifdef __HIPCC__ +// WA for https://github.com/ROCm/composable_kernel/issues/1946 +#if 0 #define CK_TILE_GENERIC_ADDR __attribute__((address_space(0))) #define CK_TILE_GLOBAL_ADDR __attribute__((address_space(1))) #define CK_TILE_LDS_ADDR __attribute__((address_space(3)))