[CK_TILE] Blockwise GEMM pipeline v6 - port of v5 from old CK (#2955)

* First checkpoint

* Second checkpoint - hot loop scheduler

* Third checkpoint - init main operator

* Fourth checkpoint - main loop ready

* Fifth checkpoint - main loop fix

* Sixth checkpoint - ReadWritecompFunc

* Seventh checkpoint - Tail finished

* [CK_TILE] Blockwise gemm pipeline v5 complete

* Working

* Working fixes 2

* Rename v5 to v77 temporarily

* Data type adjustment

* Data type adjustment 2

* [CK_TILE] Blockwise Gemm pipeline v5 add tests

* [CK_TILE] Fix calculation error

* TEMP: check pipeline

* Fix name to V6

* naming and documentation changes

* WIP dump

* Try fixing v1

* Failing tests v5

* Debugging

* Changes v2

* F16 tests working great

* Working BlockwiseGemmPipelineV5 as V6

* Cleanup and format

* Merging changes part1

* [CK_TILE] Blockwise Gemm Pipeline Comp V5/V6

* Remove commented code

* Fix gfx950 build issues

* Fix file formatting

* Review changes, more concat info, add bf16 bf8 tests

* Fix formatting

* Add bf16 and bf8 tests

---------

Co-authored-by: Adam Osewski <Adam.Osewski@amd.com>
This commit is contained in:
aledudek
2025-10-13 13:57:37 +02:00
committed by GitHub
parent 3021604213
commit 634634f5c0
8 changed files with 924 additions and 12 deletions

View File

@@ -16,8 +16,9 @@
#define CK_TILE_PIPELINE_MEMORY 2
#define CK_TILE_PIPELINE_COMPUTE_V4 3
#define CK_TILE_PIPELINE_COMPUTE_V5 4
#define CK_TILE_PIPELINE_PRESHUFFLE_V1 5
#define CK_TILE_PIPELINE_PRESHUFFLE_V2 6
#define CK_TILE_PIPELINE_COMPUTE_V6 5
#define CK_TILE_PIPELINE_PRESHUFFLE_V1 6
#define CK_TILE_PIPELINE_PRESHUFFLE_V2 7
template <typename PrecType, ck_tile::index_t M_Warp_Tile>
constexpr ck_tile::index_t get_k_warp_tile()
@@ -251,9 +252,29 @@ struct GemmConfigComputeV5 : public GemmConfigBase
static constexpr ck_tile::index_t N_Warp_Tile = 32;
static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile<PrecType, M_Warp_Tile>();
static constexpr bool DoubleSmemBuffer = false;
static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_COMPUTE_V5;
static constexpr ck_tile::index_t NumWaNumWaveGroups = 2;
static constexpr bool DoubleSmemBuffer = false;
static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_COMPUTE_V5;
static constexpr ck_tile::index_t NumWaveGroups = 2;
};
template <typename PrecType>
struct GemmConfigComputeV6 : public GemmConfigBase
{
static constexpr ck_tile::index_t M_Tile = 256;
static constexpr ck_tile::index_t N_Tile = 256;
static constexpr ck_tile::index_t K_Tile = 32;
static constexpr ck_tile::index_t M_Warp = 2;
static constexpr ck_tile::index_t N_Warp = 2;
static constexpr ck_tile::index_t K_Warp = 1;
static constexpr ck_tile::index_t M_Warp_Tile = 32;
static constexpr ck_tile::index_t N_Warp_Tile = 32;
static constexpr ck_tile::index_t K_Warp_Tile = 16;
static constexpr bool DoubleSmemBuffer = false;
static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_COMPUTE_V6;
static constexpr ck_tile::index_t NumWaveGroups = 1;
};
template <typename PrecType>
@@ -484,6 +505,15 @@ struct PipelineTypeTraits<CK_TILE_PIPELINE_COMPUTE_V5>
using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV5<PipelineProblem>;
};
template <>
struct PipelineTypeTraits<CK_TILE_PIPELINE_COMPUTE_V6>
{
template <typename PipelineProblem>
using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV6<PipelineProblem>;
template <typename PipelineProblem>
using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV6<PipelineProblem>;
};
template <>
struct PipelineTypeTraits<CK_TILE_PIPELINE_PRESHUFFLE_V1>
{