generalized bpreshuffle pipeline optimization

This commit is contained in:
aska-0096
2025-04-27 11:50:30 +00:00
parent 49338edb1b
commit bc9c819aa4
2 changed files with 148 additions and 11 deletions

View File

@@ -132,6 +132,7 @@ static constexpr ck::index_t CShuffleMLane = BLOCKSIZE / CShuffleNLane;
// Number of A0DataType elements that fit in 16 bytes.
// NOTE(review): presumably the per-access vector length along K for the A tensor — confirm.
static constexpr ck::index_t AK1 = 16 / sizeof(A0DataType);
// Number of B0DataType elements that fit in 16 bytes (same 16-byte budget as AK1).
static constexpr ck::index_t BK1 = 16 / sizeof(B0DataType);
// NOTE(review): EVec and D0Vec..D2Vec look like per-tensor vector widths (in elements)
// for the E output and the D0..D2 auxiliary tensors — confirm against the kernel instantiation.
static constexpr ck::index_t EVec = 2;
// TODO: Epilogue performance issue: AtomicAdd loses 15~20% performance compared with Set.
static constexpr ck::index_t D0Vec = 1;
static constexpr ck::index_t D1Vec = 1;
static constexpr ck::index_t D2Vec = 1;