still debugging: suspecting that something is wrong with the cshuffle epilogue

2026-07-01 12:17:00 +00:00 · 2026-01-16 20:53:14 +00:00
parent cece6c0c2c
commit ee426bea45
5 changed files with 33 additions and 35 deletions
--- a/example/ck_tile/38_block_scale_gemm/CMakeLists.txt
+++ b/example/ck_tile/38_block_scale_gemm/CMakeLists.txt
@@ -18,9 +18,9 @@ if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx12")
        # gemm_bquant_quantgrouped_fp8i4.cpp
        # gemm_bquant_quantgrouped_bf16mxfp4.cpp
        # gemm_bquant_quantgrouped_bf8.cpp
-        gemm_bquant_quantgrouped_fp8.cpp
+        # gemm_bquant_quantgrouped_fp8.cpp
        gemm_bquant_quantgrouped_preshuffleb.cpp
-        gemm_bquant_quantgrouped_preshufflequant.cpp
+        # gemm_bquant_quantgrouped_preshufflequant.cpp
        # gemm_bquant_quantgrouped_preshuffleb_preshufflequant.cpp
        # gemm_quant_rowcol.cpp
        # gemm_quant_tensor.cpp
--- a/example/ck_tile/38_block_scale_gemm/gemm_quant.cpp
+++ b/example/ck_tile/38_block_scale_gemm/gemm_quant.cpp
@@ -89,8 +89,8 @@ auto gen_lut_key(const ck_tile::ArgParser& arg_parser)
 //     std::unordered_map<size_t, std::function<int(const ck_tile::ArgParser&)>>& lut);
 // void aquant_quantgrouped_preshufflequant_instance_factory(
 //     std::unordered_map<size_t, std::function<int(const ck_tile::ArgParser&)>>& lut);
-void bquant_quantgrouped_fp8_instance_factory(
-    std::unordered_map<size_t, std::function<int(const ck_tile::ArgParser&)>>& lut);
+// void bquant_quantgrouped_fp8_instance_factory(
+//     std::unordered_map<size_t, std::function<int(const ck_tile::ArgParser&)>>& lut);
 // void bquant_quantgrouped_bf8_instance_factory(
 //     std::unordered_map<size_t, std::function<int(const ck_tile::ArgParser&)>>& lut);
 // void bquant_quantgrouped_fp8i4_instance_factory(
@@ -101,8 +101,8 @@ void bquant_quantgrouped_fp8_instance_factory(
 //     std::unordered_map<size_t, std::function<int(const ck_tile::ArgParser&)>>& lut);
 void bquant_quantgrouped_preshuffleb_instance_factory(
    std::unordered_map<size_t, std::function<int(const ck_tile::ArgParser&)>>& lut);
-void bquant_quantgrouped_preshufflequant_instance_factory(
-    std::unordered_map<size_t, std::function<int(const ck_tile::ArgParser&)>>& lut);
+// void bquant_quantgrouped_preshufflequant_instance_factory(
+//     std::unordered_map<size_t, std::function<int(const ck_tile::ArgParser&)>>& lut);
 // void bquant_quantgrouped_preshuffleb_preshufflequant_instance_factory(
 //     std::unordered_map<size_t, std::function<int(const ck_tile::ArgParser&)>>& lut);
 // void quant_rowcol_instance_factory(
@@ -126,13 +126,13 @@ int main(int argc, char* argv[])
    std::unordered_map<size_t, std::function<int(const ck_tile::ArgParser&)>> lut;
    // aquant_quantgrouped_instance_factory(lut);
    // aquant_quantgrouped_preshufflequant_instance_factory(lut);
-    bquant_quantgrouped_fp8_instance_factory(lut);
+    // bquant_quantgrouped_fp8_instance_factory(lut);
    // bquant_quantgrouped_bf8_instance_factory(lut);
    // bquant_quantgrouped_fp8i4_instance_factory(lut);
    // bquant_quantgrouped_bf8i4_instance_factory(lut);
    // bquant_quantgrouped_bf16fp4_instance_factory(lut);
    bquant_quantgrouped_preshuffleb_instance_factory(lut);
-    bquant_quantgrouped_preshufflequant_instance_factory(lut);
+    // bquant_quantgrouped_preshufflequant_instance_factory(lut);
    // bquant_quantgrouped_preshuffleb_preshufflequant_instance_factory(lut);
    // quant_rowcol_instance_factory(lut);
    // quant_tensor_instance_factory(lut);
--- a/example/ck_tile/38_block_scale_gemm/run_gemm_quant_example.inc
+++ b/example/ck_tile/38_block_scale_gemm/run_gemm_quant_example.inc
@@ -521,12 +521,13 @@ int run_gemm_example_with_layouts(const ck_tile::ArgParser& arg_parser,
            }
            else
            {
-                ck_tile::FillUniformDistribution<BDataType>{-2.0f, 3.0f, fill_seed(gen)}(b_k_n);
-                ck_tile::FillUniformDistribution<BQDataType>{-2.0f, 2.0f, fill_seed(gen)}(
+                ck_tile::FillUniformDistribution<BDataType>{-2.0f,
+                                                            3.0f /*, fill_seed(gen)*/}(b_k_n);
+                ck_tile::FillUniformDistribution<BQDataType>{1.0f, 1.0f /*, fill_seed(gen)*/}(
                    *bq_tensor_ptr);
            }

-            ck_tile::FillUniformDistribution<ADataType>{-5.0f, 5.0f, fill_seed(gen)}(a_m_k);
+            ck_tile::FillUniformDistribution<ADataType>{-5.0f, 5.0f /*, fill_seed(gen)*/}(a_m_k);
        }
        else if constexpr(QuantMode == ck_tile::QuantType::AQuantGrouped)
        {
@@ -572,7 +573,7 @@ int run_gemm_example_with_layouts(const ck_tile::ArgParser& arg_parser,
                {
                    for(int j = 0; j < BQN; j++)
                    {
-                        (*bq_tensor_ptr)(i, j) = value;
+                        (*bq_tensor_ptr)(i, j) = 1.0;
                        value += static_cast<BQDataType>(1.0f);
                    }
                }
--- a/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_ar_flatbr_bquant_cr.hpp
+++ b/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_ar_flatbr_bquant_cr.hpp
@@ -240,18 +240,19 @@ struct BlockGemmWeightPreshuffleBQuantARegBRegCReg
                        auto& scale_reg   = bq_block_tensor.get_thread_buffer()[reg_offset];
                        float scale_reg_f = cvt_scale_to_fp32(scale_reg);

-                        printf("get_block_id(): %d, get_warp_id(): %d, get_thread_id(): %d, nIter: "
-                               "%d, KPerBlockBQ: %d, "
-                               "kQScale: %d, scale_reg: %f, "
-                               "scale_reg_f: %f\n",
-                               get_block_id(),
-                               get_warp_id(),
-                               get_thread_id(),
-                               static_cast<int>(nIter),
-                               KPerBlockBQ,
-                               static_cast<int>(kQScale),
-                               scale_reg,
-                               scale_reg_f);
+                        // printf("get_block_id(): %d, get_warp_id(): %d, get_thread_id(): %d,
+                        // nIter: "
+                        //        "%d, KPerBlockBQ: %d, "
+                        //        "kQScale: %d, scale_reg: %f, "
+                        //        "scale_reg_f: %f\n",
+                        //        get_block_id(),
+                        //        get_warp_id(),
+                        //        get_thread_id(),
+                        //        static_cast<int>(nIter),
+                        //        KPerBlockBQ,
+                        //        static_cast<int>(kQScale),
+                        //        scale_reg,
+                        //        scale_reg_f);

                        static_for<0, WG::kM * WG::kN / warp_size, 1>{}([&](auto c_row) {
                            auto& c_ref = c_block_tensor.get_thread_buffer()[tbuf_offset + c_row];
@@ -263,14 +264,14 @@ struct BlockGemmWeightPreshuffleBQuantARegBRegCReg
                });
            });
        });
-        // auto c_thread_buffer      = c_block_tensor.get_thread_buffer();
+        auto c_thread_buffer = c_block_tensor.get_thread_buffer();
        // printf("C Data:\n");
-        // for(index_t i = 0; i < c_thread_buffer.size(); ++i)
-        // {
-        //     auto value = c_thread_buffer.get(i);
-        //     auto float_value = type_convert<float>(value);
-        //     printf("  [%d] = %f\n", i, float_value);
-        // }
+        for(index_t i = 0; i < c_thread_buffer.size(); ++i)
+        {
+            auto value       = c_thread_buffer.get(i);
+            auto float_value = type_convert<float>(value);
+            printf("[%d] = %f\n", i, float_value);
+        }
    }
 };

--- a/include/ck_tile/ops/gemm_quant/pipeline/gemm_wp_bquant_pipeline_ag_bg_cr_v2.hpp
+++ b/include/ck_tile/ops/gemm_quant/pipeline/gemm_wp_bquant_pipeline_ag_bg_cr_v2.hpp
@@ -390,10 +390,6 @@ struct WPQuantBPipelineAgBgCrV2 : public WeightPreshufflePipelineAGmemBGmemCRegV

        // MAIN LOOP
        index_t iCounter = (num_loop - 1) / loop_count;
-        if(get_block_id() == 0 && get_thread_id() == 0)
-        {
-            printf("iCounter:%d \n\n ", iCounter);
-        }
        while(iCounter > 0)
        {
            __builtin_amdgcn_sched_barrier(0);