Moe gemm activation (#2026)

* fix useless code and remove usless oob

* clang format

* fix coredump in e2e test

* fix2

* fix clang format

* fix output oob

* impl int64 but result not correct

* int64 index ok now

* input output all ok

* fix uint32

* revert v1 test

* use uint32

* mork to support 13w tokens

* moe sorting fix moebuf

* fix merge

* update moe api fix aiter build

* fix buid

* fuse silu

* silu ok

* acale ok

* add silu

* change code

* gemm2 ok

* gufusion compatible ok, fix warnings

* gu fusion for m32 m64 ok

* support bf16 cshuffle

* i4 gemm2 ok

* i4 gemm2 ok and i4 gemm1 build

* 16x16 run ok

* change flops; change cshuffle dtype

* fuse gelu silu act in moe gemm1

* fp8 with act ready

* int4 act ready

* remove useless changes

* remove useless code change

* fix clang format

* add the arch limit of int4 moe gemm

* fuse moe activation

* fix fp8 16x16

* fix no quant case

* fix bugs

* fix fp8 gufusion bug

* remove useless comments

* refine activation code & complete moe example

* fix int8 bugs

* merge tkw1

---------

Co-authored-by: coderfeli <coderfeli@163.com>
Co-authored-by: feli <felix.li@amd.com>
Co-authored-by: illsilin <Illia.Silin@amd.com>
Co-authored-by: root <root@hjbog-srdc-51.amd.com>
Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com>
This commit is contained in:
lalala-sh
2025-04-23 10:35:34 +08:00
committed by GitHub
parent 94662b02d0
commit 39ba03f25d
19 changed files with 1975 additions and 496 deletions

View File

@@ -65,9 +65,12 @@ template <typename ALayout,
typename CDEShuffleBlockTransferScalarPerVectors,
BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave,
BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1,
index_t ActivationOP = 0,
bool NSwizzle = false,
bool IsInputGemm = true,
bool MulRoutedWeight = true,
bool PerTokenQuant = true,
typename IndexType = index_t,
typename ComputeTypeA = CDataType,
typename ComputeTypeB = ComputeTypeA,
typename LDSTypeA = ComputeTypeA,
@@ -132,7 +135,12 @@ struct DeviceMoeGemm : public DeviceGemmMultipleDSplitKBPreShuffle<ALayout,
CDEShuffleBlockTransferScalarPerVectors,
BlkGemmPipeSched,
BlkGemmPipelineVer,
ActivationOP,
NSwizzle,
IsInputGemm,
MulRoutedWeight,
PerTokenQuant,
IndexType,
ComputeTypeA,
ComputeTypeB,
LDSTypeA,
@@ -247,10 +255,10 @@ struct DeviceMoeGemm : public DeviceGemmMultipleDSplitKBPreShuffle<ALayout,
constexpr auto estimated_reg_a = MPerBlock * KPerBlock * sizeof(ADataType) / BlockSize /
4 * (1 + GridwiseGemm::NWave);
constexpr auto estimated_reg_b =
NPerBlock * KPerBlock * sizeof(BDataType) / BlockSize / 4 * (2);
constexpr auto estimated_reg_c =
MPerBlock * NPerBlock * sizeof(GemmAccDataType) / BlockSize / 4;
constexpr auto estimated_reg_b = NPerBlock * KPerBlock * sizeof(BDataType) / BlockSize /
4 * (2) * (IsInputGemm ? 2 : 1);
constexpr auto estimated_reg_c = MPerBlock * NPerBlock * sizeof(GemmAccDataType) /
BlockSize / 4 * (IsInputGemm ? 2 : 1);
constexpr auto estimated_reg_total =
estimated_reg_a + estimated_reg_b + estimated_reg_c;
@@ -270,8 +278,6 @@ struct DeviceMoeGemm : public DeviceGemmMultipleDSplitKBPreShuffle<ALayout,
true,
MemoryDataOp,
minimum_occupancy,
IsInputGemm,
MulRoutedWeight,
TailNumber::Odd>;
RunKernel(kernel);
}
@@ -281,8 +287,6 @@ struct DeviceMoeGemm : public DeviceGemmMultipleDSplitKBPreShuffle<ALayout,
true,
MemoryDataOp,
minimum_occupancy,
IsInputGemm,
MulRoutedWeight,
TailNumber::Even>;
RunKernel(kernel);
}
@@ -297,8 +301,6 @@ struct DeviceMoeGemm : public DeviceGemmMultipleDSplitKBPreShuffle<ALayout,
true,
MemoryDataOp,
minimum_occupancy,
IsInputGemm,
MulRoutedWeight,
TailNumber::Odd>;
RunKernel(kernel);
}
@@ -308,8 +310,6 @@ struct DeviceMoeGemm : public DeviceGemmMultipleDSplitKBPreShuffle<ALayout,
true,
MemoryDataOp,
minimum_occupancy,
IsInputGemm,
MulRoutedWeight,
TailNumber::Even>;
RunKernel(kernel);
}
@@ -329,8 +329,6 @@ struct DeviceMoeGemm : public DeviceGemmMultipleDSplitKBPreShuffle<ALayout,
true,
InMemoryDataOperationEnum::Set,
minimum_occupancy,
IsInputGemm,
MulRoutedWeight,
TailNumber::Odd>;
RunKernel(kernel);
}