Rename InterleavedPKTypeLoader to ConverterLoader, and load_int4_tile to load_and_convert_tile

2026-07-01 04:07:56 +00:00 · 2025-11-27 08:35:18 +00:00
parent 3a094e2f8b
commit cfa11f2d1f
12 changed files with 41 additions and 35 deletions
--- a/include/ck_tile/ops/common/load_and_convert_tile.hpp
+++ b/include/ck_tile/ops/common/load_and_convert_tile.hpp
@@ -9,7 +9,7 @@
 namespace ck_tile {

 template <typename DstDataType, index_t UnaryOpSize>
-struct InterleavedPKTypeLoader
+struct ConverterLoader
 {
    template <typename WarpWindow, typename WarpTile>
    CK_TILE_DEVICE static void load_interleaved_pk_type(WarpTile& dst, const WarpWindow& src)
@@ -34,12 +34,12 @@ template <typename SrcDataType,
          bool LoadTranspose = false,
          typename WarpTile,
          typename WarpWindow>
-CK_TILE_DEVICE void load_int4_tile(WarpTile& dst, const WarpWindow& src)
+CK_TILE_DEVICE void load_and_convert_tile(WarpTile& dst, const WarpWindow& src)
 {
    if constexpr(std::is_same_v<SrcDataType, pk_int4_t>)
    {
        static_assert(!LoadTranspose, "LoadTranspose not supported with pk_int4_t");
-        InterleavedPKTypeLoader<DstDataType, UnaryOpSize>::load_interleaved_pk_type(dst, src);
+        ConverterLoader<DstDataType, UnaryOpSize>::load_interleaved_pk_type(dst, src);
    }
    else if constexpr(LoadTranspose)
    {
--- a/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp
+++ b/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp
@@ -228,10 +228,10 @@ struct BlockUniversalGemmAsBsCr
                          "The ADataType and BDataType as defined in "
                          "traits should be the same as correspoinding block window data type!");

-            load_int4_tile<ADataType, ATypeToUse, UnaryOpSize_, ALoadTranspose>(a_warp_tile_,
-                                                                                a_block_window);
-            load_int4_tile<BDataType, BTypeToUse, UnaryOpSize_, BLoadTranspose>(b_warp_tile_,
-                                                                                b_block_window);
+            load_and_convert_tile<ADataType, ATypeToUse, UnaryOpSize_, ALoadTranspose>(
+                a_warp_tile_, a_block_window);
+            load_and_convert_tile<BDataType, BTypeToUse, UnaryOpSize_, BLoadTranspose>(
+                b_warp_tile_, b_block_window);
            // hot loop:
            static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) {
                static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
@@ -294,10 +294,10 @@ struct BlockUniversalGemmAsBsCr
                                          bool_constant<ALoadTranspose> = {},
                                          bool_constant<BLoadTranspose> = {})
        {
-            load_int4_tile<ADataType, ATypeToUse, UnaryOpSize_, ALoadTranspose>(a_warp_tile_,
-                                                                                a_block_window);
-            load_int4_tile<BDataType, BTypeToUse, UnaryOpSize_, BLoadTranspose>(b_warp_tile_,
-                                                                                b_block_window);
+            load_and_convert_tile<ADataType, ATypeToUse, UnaryOpSize_, ALoadTranspose>(
+                a_warp_tile_, a_block_window);
+            load_and_convert_tile<BDataType, BTypeToUse, UnaryOpSize_, BLoadTranspose>(
+                b_warp_tile_, b_block_window);
        }

        // C += A * B
@@ -425,10 +425,10 @@ struct BlockUniversalGemmAsBsCr
            auto b_lds_gemm_window = make_tile_window(
                b_block_window.get_bottom_tensor_view(), b_lds_shape, b_offset, b_lds_load_distr);

-            load_int4_tile<ADataType, ATypeToUse, UnaryOpSize_, ALoadTranspose>(a_warp_tile_,
-                                                                                a_lds_gemm_window);
-            load_int4_tile<BDataType, BTypeToUse, UnaryOpSize_, BLoadTranspose>(b_warp_tile_,
-                                                                                b_lds_gemm_window);
+            load_and_convert_tile<ADataType, ATypeToUse, UnaryOpSize_, ALoadTranspose>(
+                a_warp_tile_, a_lds_gemm_window);
+            load_and_convert_tile<BDataType, BTypeToUse, UnaryOpSize_, BLoadTranspose>(
+                b_warp_tile_, b_lds_gemm_window);
        }

        // C += A * B
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp
@@ -74,7 +74,8 @@ struct GemmPipelineAgBgCrImplBase
                                       SrcTileWindow& dram_tile_window,
                                       const DramTileWindowStep& dram_tile_window_step) const
    {
-        load_int4_tile<SrcDataType, DstDataType, UnaryOpSize>(dst_block_tile, dram_tile_window);
+        load_and_convert_tile<SrcDataType, DstDataType, UnaryOpSize>(dst_block_tile,
+                                                                     dram_tile_window);
        move_tile_window(dram_tile_window, dram_tile_window_step);
    }

--- a/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_aquant_bs_bquant_cr.hpp
+++ b/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_aquant_bs_bquant_cr.hpp
@@ -261,10 +261,10 @@ struct ABQuantBlockUniversalGemmAsBsCr : public BlockGemmQuantBase
                                          bool_constant<ALoadTranspose> = {},
                                          bool_constant<BLoadTranspose> = {})
        {
-            load_int4_tile<ADataType, ComputeDataType, UnaryOpSize_, ALoadTranspose>(
+            load_and_convert_tile<ADataType, ComputeDataType, UnaryOpSize_, ALoadTranspose>(
                a_warp_tile_, a_block_window);
            // If B datatype were pkint4 it would be converted prior to storing in LDS
-            load_int4_tile<OverrideBDataType, ComputeDataType, UnaryOpSize_, BLoadTranspose>(
+            load_and_convert_tile<OverrideBDataType, ComputeDataType, UnaryOpSize_, BLoadTranspose>(
                b_warp_tile_, b_block_window);
        }

--- a/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_aquant_bs_cr.hpp
+++ b/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_aquant_bs_cr.hpp
@@ -248,9 +248,9 @@ struct AQuantBlockUniversalGemmAsBsCr
            // while ADatatype might not be the same as BDataType at the time of problem
            // initialization, we can safely use BDataType here because when A would be int4 we will
            // ensure A is converted to BDataType prior to loading
-            load_int4_tile<BDataType, ComputeDataType, UnaryOpSize_, ALoadTranspose>(
+            load_and_convert_tile<BDataType, ComputeDataType, UnaryOpSize_, ALoadTranspose>(
                a_warp_tile_, a_block_window);
-            load_int4_tile<BDataType, ComputeDataType, UnaryOpSize_, BLoadTranspose>(
+            load_and_convert_tile<BDataType, ComputeDataType, UnaryOpSize_, BLoadTranspose>(
                b_warp_tile_, b_block_window);
        }

--- a/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_bs_bquant_cr.hpp
+++ b/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_bs_bquant_cr.hpp
@@ -258,10 +258,10 @@ struct BQuantBlockUniversalGemmAsBsCr
                                          bool_constant<ALoadTranspose> = {},
                                          bool_constant<BLoadTranspose> = {})
        {
-            load_int4_tile<ADataType, ComputeDataType, UnaryOpSize_, ALoadTranspose>(
+            load_and_convert_tile<ADataType, ComputeDataType, UnaryOpSize_, ALoadTranspose>(
                a_warp_tile_, a_block_window);
            // If B datatype were pkint4 it would be converted prior to storing in LDS
-            load_int4_tile<OverrideBDataType, ComputeDataType, UnaryOpSize_, BLoadTranspose>(
+            load_and_convert_tile<OverrideBDataType, ComputeDataType, UnaryOpSize_, BLoadTranspose>(
                b_warp_tile_, b_block_window);
        }

--- a/include/ck_tile/ops/gemm_quant/pipeline/gemm_abquant_pipeline_ag_bg_cr_v3.hpp
+++ b/include/ck_tile/ops/gemm_quant/pipeline/gemm_abquant_pipeline_ag_bg_cr_v3.hpp
@@ -200,7 +200,8 @@ struct ABQuantGemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Pro
            using DestDataType            = typename ABlockTile_::DataType;
            using SrcDataType             = typename ADramWindow::Base::TileWindowBase::DataType;
            constexpr index_t UnaryOpSize = 8;
-            load_int4_tile<SrcDataType, DestDataType, UnaryOpSize>(a_block_tile, a_dram_window);
+            load_and_convert_tile<SrcDataType, DestDataType, UnaryOpSize>(a_block_tile,
+                                                                          a_dram_window);
        }

        template <typename BDramWindow, typename BBlockTile_>
@@ -210,7 +211,8 @@ struct ABQuantGemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Pro
            using DestDataType            = typename BBlockTile_::DataType;
            using SrcDataType             = typename BDramWindow::Base::TileWindowBase::DataType;
            constexpr index_t UnaryOpSize = 8;
-            load_int4_tile<SrcDataType, DestDataType, UnaryOpSize>(b_block_tile, b_dram_window);
+            load_and_convert_tile<SrcDataType, DestDataType, UnaryOpSize>(b_block_tile,
+                                                                          b_dram_window);
        }

        template <bool HasHotLoop,
--- a/include/ck_tile/ops/gemm_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_mem.hpp
+++ b/include/ck_tile/ops/gemm_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_mem.hpp
@@ -177,7 +177,8 @@ struct AQuantGemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
            using DestDataType            = typename ABlockTile_::DataType;
            using SrcDataType             = typename ADramWindow::Base::TileWindowBase::DataType;
            constexpr index_t UnaryOpSize = 8;
-            load_int4_tile<SrcDataType, DestDataType, UnaryOpSize>(a_block_tile, a_dram_window);
+            load_and_convert_tile<SrcDataType, DestDataType, UnaryOpSize>(a_block_tile,
+                                                                          a_dram_window);
            move_tile_window(a_dram_window, dram_tile_window_step);
        }

--- a/include/ck_tile/ops/gemm_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_v3.hpp
+++ b/include/ck_tile/ops/gemm_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_v3.hpp
@@ -171,7 +171,8 @@ struct AQuantGemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Prob
            using DestDataType            = typename ABlockTile_::DataType;
            using SrcDataType             = typename ADramWindow::Base::TileWindowBase::DataType;
            constexpr index_t UnaryOpSize = 8;
-            load_int4_tile<SrcDataType, DestDataType, UnaryOpSize>(a_block_tile, a_dram_window);
+            load_and_convert_tile<SrcDataType, DestDataType, UnaryOpSize>(a_block_tile,
+                                                                          a_dram_window);
        }

        template <bool HasHotLoop,
--- a/include/ck_tile/ops/gemm_quant/pipeline/gemm_bquant_pipeline_ag_bg_cr_v3.hpp
+++ b/include/ck_tile/ops/gemm_quant/pipeline/gemm_bquant_pipeline_ag_bg_cr_v3.hpp
@@ -186,7 +186,8 @@ struct BQuantGemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Prob
            using DestDataType            = typename BBlockTile_::DataType;
            using SrcDataType             = typename BDramWindow::Base::TileWindowBase::DataType;
            constexpr index_t UnaryOpSize = 8;
-            load_int4_tile<SrcDataType, DestDataType, UnaryOpSize>(b_block_tile, b_dram_window);
+            load_and_convert_tile<SrcDataType, DestDataType, UnaryOpSize>(b_block_tile,
+                                                                          b_dram_window);
        }

        template <typename BBlockTile_, typename BDramWindow, typename BDramTileWindowStep>
--- a/include/ck_tile/ops/gemm_quant/pipeline/gemm_wp_abquant_pipeline_ag_bg_cr_v2.hpp
+++ b/include/ck_tile/ops/gemm_quant/pipeline/gemm_wp_abquant_pipeline_ag_bg_cr_v2.hpp
@@ -349,7 +349,7 @@ struct WPABQuantBPipelineAgBgCrV2 : public WeightPreshufflePipelineAGmemBGmemCRe
                move_tile_window(b_flat_dram_windows(nIter)(kIter),
                                 {nIter * flatNPerWarp, kIter * flatKPerWarp});

-                load_int4_tile<BDataType, ADataType, UnaryOpSize_>(
+                load_and_convert_tile<BDataType, ADataType, UnaryOpSize_>(
                    b_warp_tensor_ping(nIter)(kIter), b_flat_dram_windows(nIter)(kIter));
            });
        });
@@ -430,7 +430,7 @@ struct WPABQuantBPipelineAgBgCrV2 : public WeightPreshufflePipelineAGmemBGmemCRe

                    move_tile_window(b_flat_dram_windows(nIter)(kIter),
                                     {nIter * flatNPerWarp, kIter * flatKPerWarp});
-                    load_int4_tile<BDataType, ADataType, UnaryOpSize_>(
+                    load_and_convert_tile<BDataType, ADataType, UnaryOpSize_>(
                        b_warp_tensor_pong(nIter)(kIter), b_flat_dram_windows(nIter)(kIter));
                });
            });
@@ -455,7 +455,7 @@ struct WPABQuantBPipelineAgBgCrV2 : public WeightPreshufflePipelineAGmemBGmemCRe

                    move_tile_window(b_flat_dram_windows(nIter)(kIter),
                                     {nIter * flatNPerWarp, kIter * flatKPerWarp});
-                    load_int4_tile<BDataType, ADataType, UnaryOpSize_>(
+                    load_and_convert_tile<BDataType, ADataType, UnaryOpSize_>(
                        b_warp_tensor_ping(nIter)(kIter), b_flat_dram_windows(nIter)(kIter));
                });
            });
@@ -503,7 +503,7 @@ struct WPABQuantBPipelineAgBgCrV2 : public WeightPreshufflePipelineAGmemBGmemCRe
                    move_tile_window(b_flat_dram_windows(nIter)(kIter),
                                     {nIter * flatNPerWarp, kIter * flatKPerWarp});

-                    load_int4_tile<BDataType, ADataType, UnaryOpSize_>(
+                    load_and_convert_tile<BDataType, ADataType, UnaryOpSize_>(
                        b_warp_tensor_pong(nIter)(kIter), b_flat_dram_windows(nIter)(kIter));
                });
            });
--- a/include/ck_tile/ops/gemm_quant/pipeline/gemm_wp_bquant_pipeline_ag_bg_cr_v2.hpp
+++ b/include/ck_tile/ops/gemm_quant/pipeline/gemm_wp_bquant_pipeline_ag_bg_cr_v2.hpp
@@ -335,7 +335,7 @@ struct WPQuantBPipelineAgBgCrV2 : public WeightPreshufflePipelineAGmemBGmemCRegV
                move_tile_window(b_flat_dram_windows(nIter)(kIter),
                                 {nIter * flatNPerWarp, kIter * flatKPerWarp});

-                load_int4_tile<BDataType, ADataType, UnaryOpSize_>(
+                load_and_convert_tile<BDataType, ADataType, UnaryOpSize_>(
                    b_warp_tensor_ping(nIter)(kIter), b_flat_dram_windows(nIter)(kIter));
            });
        });
@@ -421,7 +421,7 @@ struct WPQuantBPipelineAgBgCrV2 : public WeightPreshufflePipelineAGmemBGmemCRegV

                    move_tile_window(b_flat_dram_windows(nIter)(kIter),
                                     {nIter * flatNPerWarp, kIter * flatKPerWarp});
-                    load_int4_tile<BDataType, ADataType, UnaryOpSize_>(
+                    load_and_convert_tile<BDataType, ADataType, UnaryOpSize_>(
                        b_warp_tensor_pong(nIter)(kIter), b_flat_dram_windows(nIter)(kIter));
                });
            });
@@ -458,7 +458,7 @@ struct WPQuantBPipelineAgBgCrV2 : public WeightPreshufflePipelineAGmemBGmemCRegV

                    move_tile_window(b_flat_dram_windows(nIter)(kIter),
                                     {nIter * flatNPerWarp, kIter * flatKPerWarp});
-                    load_int4_tile<BDataType, ADataType, UnaryOpSize_>(
+                    load_and_convert_tile<BDataType, ADataType, UnaryOpSize_>(
                        b_warp_tensor_ping(nIter)(kIter), b_flat_dram_windows(nIter)(kIter));
                });
            });
@@ -516,7 +516,7 @@ struct WPQuantBPipelineAgBgCrV2 : public WeightPreshufflePipelineAGmemBGmemCRegV
                    move_tile_window(b_flat_dram_windows(nIter)(kIter),
                                     {nIter * flatNPerWarp, kIter * flatKPerWarp});

-                    load_int4_tile<BDataType, ADataType, UnaryOpSize_>(
+                    load_and_convert_tile<BDataType, ADataType, UnaryOpSize_>(
                        b_warp_tensor_pong(nIter)(kIter), b_flat_dram_windows(nIter)(kIter));
                });
            });