[CK_TILE][HOTFIX] WA for address space by disable it completely (#1947)

* port all moe changes from ck_moe_gemm branch

* refine codes in the pr

* fix tail odd

* fix clang format

* fix clang format2

* make hot loop scheduler compatible with 16x16 and 32x32

* clang format

* fix per token quant

* rename moe example

* clang format

* WA for address space by disable it completely

* hot fix moe gemm2

---------

Co-authored-by: coderfeli <coderfeli@163.com>
Co-authored-by: feli <felix.li@amd.com>

[ROCm/composable_kernel commit: c12fb0a624]
This commit is contained in:
carlushuang
2025-03-06 12:01:25 +08:00
committed by GitHub
parent 9d24409070
commit 31b9e40b3d
2 changed files with 4 additions and 3 deletions

View File

@@ -1492,7 +1492,7 @@ struct GridwiseMoeGemm
using CDEBlockTransferCluster =
CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock;
const auto EGlobalMemoryDataOperation = CGlobalMemoryDataOperation;
constexpr index_t scatter_weight_idx = 1;
constexpr index_t scatter_weight_idx = IsInputGemm ? 1 : 3; // hack fix felix
auto cde_block_copy_lds_and_global = ThreadGroupTensorSliceTransfer_v7r3_scatter<
ThisThreadBlock,
decltype(container_concat(make_tuple(CShuffleDataType{}), DsDataType{})),
@@ -2000,7 +2000,7 @@ struct GridwiseMoeGemm
using CDEBlockTransferCluster =
CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock;
const auto EGlobalMemoryDataOperation = CGlobalMemoryDataOperation;
constexpr index_t scatter_weight_idx = 1;
constexpr index_t scatter_weight_idx = IsInputGemm ? 1 : 3; // hack fix felix
auto cde_block_copy_lds_and_global = ThreadGroupTensorSliceTransfer_v7r3_scatter<
ThisThreadBlock,
decltype(container_concat(make_tuple(CShuffleDataType{}), DsDataType{})),

View File

@@ -50,7 +50,8 @@ CK_TILE_DECLARE_ENV_VAR_BOOL(CK_TILE_LOGGING)
// implementing the "memory address space" attribute
// https://llvm.org/docs/AMDGPUUsage.html#amdgpu-address-spaces-table
#ifdef __HIPCC__
// WA for https://github.com/ROCm/composable_kernel/issues/1946
#if 0
#define CK_TILE_GENERIC_ADDR __attribute__((address_space(0)))
#define CK_TILE_GLOBAL_ADDR __attribute__((address_space(1)))
#define CK_TILE_LDS_ADDR __attribute__((address_space(3)))