From f1aec610f1e30347673f8b310c911ca7f14a005e Mon Sep 17 00:00:00 2001 From: kyle-256 Date: Tue, 9 Sep 2025 01:25:57 +0800 Subject: [PATCH] [CK_TILE] Implement Row/Col quant grouped gemm (#2786) * Add cshuffle epilogue test * add the poc implementation to the epilogue and tests * refactor cshuffle epilogue * WIP: adding tensor/tile usage to scale_tile * fix usage of tile_elementwise_inout * add gemm_quant_kernel for generalizing gemm quant kernel * Add problem specific to different quants, add QuantType to Traits * Add quant_type to quant_kernel template parameters * Create aq/bq_block_windows and views depending on QuantType * Use tile windows as inputs in cshuffle epilogue * Fix some issues in epilogue * initial new example code for new general gemm quant kernel test * Fix issues in kernel * Add verification check for rowcol Quantmode * use AccDataType instead of AQ in pipeline * fix aquant preshuffle * fix formatting * some cleanup * remove gemm_aquant_basic.cpp * remove gemm_aquant_kernel.hpp * fix tests for the renamed quant kernel * fix formatting * clean example files * fix some merge conflicts * fix preshufflequant rename issue * updating * fix some templates after merging with develop * fix test preshuffle parameter * fix formatting * updating kernels * change update user * test username * update quant_grouped_gemm example * update example * Unify bquant kernel to the common quant kernel * remove bquant kernel also from common header * fix formatting * clean up commented code * update grouped_gemm_quant example * fix formatting config hpp * fix merge mistake * Non-const for movable windows * fix formatting * update tileloop pipleline * Fix grammar in README Co-authored-by: spolifroni-amd * Remove #include and clean up example * fix strides * Add some descriptions for move_windows * fix tensor print bug * update quant_grouped_gemm example * remove useless code * cleanup code * clean up code & format code * fix compile & running bug in grouped_gemm example --------- Co-authored-by: Sami Remes Co-authored-by: Mohsen Saffari Co-authored-by: liyingli Co-authored-by: kyle-256 Co-authored-by: spolifroni-amd [ROCm/composable_kernel commit: 4eb415829e7a60ad6c7e65a59f9c02e290d1a0a7] --- .../ck_tile/17_grouped_gemm/CMakeLists.txt | 1 + example/ck_tile/17_grouped_gemm/README.md | 2 + .../ck_tile/17_grouped_gemm/grouped_gemm.cpp | 30 ++ .../17_grouped_gemm/quant_grouped_gemm.cpp | 136 ++++++ .../17_grouped_gemm/quant_grouped_gemm.hpp | 157 +++++++ .../quant_run_grouped_gemm_example.inc | 443 ++++++++++++++++++ .../run_grouped_gemm_example.inc | 2 +- .../ck_tile/core/tensor/tensor_descriptor.hpp | 12 +- include/ck_tile/ops/gemm_group_quant.hpp | 1 + .../kernel/gemm_quant_kernel.hpp | 11 +- .../kernel/grouped_gemm_quant_kernel.hpp | 433 +++++++++++++++++ .../pipeline/tile_gemm_quant_traits.hpp | 10 +- 12 files changed, 1225 insertions(+), 13 deletions(-) create mode 100644 example/ck_tile/17_grouped_gemm/quant_grouped_gemm.cpp create mode 100644 example/ck_tile/17_grouped_gemm/quant_grouped_gemm.hpp create mode 100644 example/ck_tile/17_grouped_gemm/quant_run_grouped_gemm_example.inc create mode 100644 include/ck_tile/ops/gemm_group_quant/kernel/grouped_gemm_quant_kernel.hpp diff --git a/example/ck_tile/17_grouped_gemm/CMakeLists.txt b/example/ck_tile/17_grouped_gemm/CMakeLists.txt index cf47dc60f1..8e8026d88d 100644 --- a/example/ck_tile/17_grouped_gemm/CMakeLists.txt +++ b/example/ck_tile/17_grouped_gemm/CMakeLists.txt @@ -1,2 +1,3 @@ add_executable(tile_example_grouped_gemm EXCLUDE_FROM_ALL grouped_gemm.cpp) +add_executable(tile_example_quant_grouped_gemm EXCLUDE_FROM_ALL quant_grouped_gemm.cpp) add_executable(tile_example_grouped_gemm_preshuffle EXCLUDE_FROM_ALL grouped_gemm_preshuffle.cpp) diff --git a/example/ck_tile/17_grouped_gemm/README.md b/example/ck_tile/17_grouped_gemm/README.md index 9b8950ea2c..94481fa7b7 100644 --- a/example/ck_tile/17_grouped_gemm/README.md +++ b/example/ck_tile/17_grouped_gemm/README.md @@ -175,6 +175,8 @@ mkdir build && cd build make tile_example_grouped_gemm -j # The preshuffle example make tile_example_grouped_gemm_preshuffle -j +# The quant grouped gemm fp8 example +make tile_example_quant_grouped_gemm -j ``` This will result in an executable `build/bin/tile_example_grouped_gemm` diff --git a/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp b/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp index 221543c0af..ca46b638e6 100644 --- a/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp +++ b/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp @@ -321,6 +321,36 @@ int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int a throw std::runtime_error("Unsupported data layout configuration for A and B tensors!"); } } + +template