Feng Shijie
2025-07-28 08:24:51 +00:00
parent 5473f06461
commit 1b6d7cf407
4 changed files with 163 additions and 97 deletions

File 1 of 4:

@@ -271,10 +271,10 @@ float flatmm_calc(const ck_tile::ScaleFlatmmHostArgs<ScaleM, ScaleN>& args,
         }
         else
         {
-            ave_time =
-                ck_tile::launch_kernel(s,
-                                       ck_tile::make_kernel<blocks.x, FlatmmConfig::kBlockPerCu>(
-                                           Kernel{}, grids, blocks, 0, kargs));
+            // ave_time =
+            //     ck_tile::launch_kernel(s,
+            //                            ck_tile::make_kernel<blocks.x, FlatmmConfig::kBlockPerCu>(
+            //                                Kernel{}, grids, blocks, 0, kargs));
         }
         return ave_time;
     };
@@ -289,10 +289,10 @@ float flatmm_calc(const ck_tile::ScaleFlatmmHostArgs<ScaleM, ScaleN>& args,
         }
         else
         {
-            Run(has_hot_loop_,
-                tail_number_,
-                ck_tile::integral_constant<ck_tile::memory_operation_enum,
-                                           ck_tile::memory_operation_enum::atomic_add>{});
+            // Run(has_hot_loop_,
+            //     tail_number_,
+            //     ck_tile::integral_constant<ck_tile::memory_operation_enum,
+            //                                ck_tile::memory_operation_enum::atomic_add>{});
         }
     };
     BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num);
@@ -420,14 +420,14 @@ int run_flatmm_example(int argc, char* argv[])
 {
     if(data_type == "fp16")
     {
-        run_flatmm_example_with_layouts<ck_tile::half_t, FlatmmConfig<ck_tile::half_t>>(
-            argc, argv, Row{}, Col{}, Row{});
-    }
-    else if(data_type == "bf16")
-    {
-        run_flatmm_example_with_layouts<ck_tile::bf16_t, FlatmmConfig<ck_tile::bf16_t>>(
-            argc, argv, Row{}, Col{}, Row{});
+        // run_flatmm_example_with_layouts<ck_tile::half_t, FlatmmConfig<ck_tile::half_t>>(
+        //     argc, argv, Row{}, Col{}, Row{});
     }
+    // else if(data_type == "bf16")
+    // {
+    //     run_flatmm_example_with_layouts<ck_tile::bf16_t, FlatmmConfig<ck_tile::bf16_t>>(
+    //         argc, argv, Row{}, Col{}, Row{});
+    // }
     else if(data_type == "fp8")
     {
         if(scale_opt == 0)
@@ -441,19 +441,20 @@ int run_flatmm_example(int argc, char* argv[])
                 argc, argv, Row{}, Col{}, Row{});
         }
     }
-    else if(data_type == "bf8")
-    {
-        if(scale_opt == 0)
-        {
-            run_flatmm_example_with_layouts<ck_tile::bf8_t, FlatmmConfig<ck_tile::bf8_t>>(
-                argc, argv, Row{}, Col{}, Row{});
-        }
-        else
-        {
-            run_flatmm_example_with_layouts<ck_tile::bf8_t, FlatmmConfig<ck_tile::bf8_t>, 1, 1>(
-                argc, argv, Row{}, Col{}, Row{});
-        }
-    }
+    // else if(data_type == "bf8")
+    // {
+    //     if(scale_opt == 0)
+    //     {
+    //         run_flatmm_example_with_layouts<ck_tile::bf8_t, FlatmmConfig<ck_tile::bf8_t>>(
+    //             argc, argv, Row{}, Col{}, Row{});
+    //     }
+    //     else
+    //     {
+    //         run_flatmm_example_with_layouts<ck_tile::bf8_t, FlatmmConfig<ck_tile::bf8_t>, 1,
+    //         1>(
+    //             argc, argv, Row{}, Col{}, Row{});
+    //     }
+    // }
     else
     {
         throw std::runtime_error("Unsupported data_type!");
@@ -479,18 +480,18 @@ int main(int argc, char* argv[])
        {
            return !run_flatmm_example<FlatmmConfig16>(argc, argv);
        }
-       else if(warp_tile == 1)
-       {
-           return !run_flatmm_example<FlatmmConfig32>(argc, argv);
-       }
-       else if(warp_tile == 2)
-       {
-           return !run_flatmm_example<FlatmmConfig16_950>(argc, argv);
-       }
-       else
-       {
-           return !run_flatmm_example<FlatmmConfig32_950>(argc, argv);
-       }
+       // else if(warp_tile == 1)
+       // {
+       //     return !run_flatmm_example<FlatmmConfig32>(argc, argv);
+       // }
+       // else if(warp_tile == 2)
+       // {
+       //     return !run_flatmm_example<FlatmmConfig16_950>(argc, argv);
+       // }
+       // else
+       // {
+       //     return !run_flatmm_example<FlatmmConfig32_950>(argc, argv);
+       // }
    }
    catch(const std::runtime_error& e)
    {

File 2 of 4:

@@ -56,6 +56,7 @@ int run_flatmm_example_with_layouts(int argc,
     if(init_method == 0)
     {
         ck_tile::FillUniformDistribution<ADataType>{-.5f, .5f}(a_host);
+        memset(a_host.data(), 0, 4);
         ck_tile::FillUniformDistribution<BDataType>{-.5f, .5f}(b_origin_host);
         ck_tile::FillUniformDistribution<AccDataType>{-1.f, 1.f}(per_token_scale);
         ck_tile::FillUniformDistribution<AccDataType>{-1.f, 1.f}(per_channel_scale);
@@ -111,33 +112,6 @@ int run_flatmm_example_with_layouts(int argc,
     auto per_channel_scale_dev_ptr = ck_tile::FlatmmScalePointer<ScaleGranularityN>{
         static_cast<float*>(per_channel_scale_dev_buf.GetDeviceBuffer())};
-    invoke_flatmm<FlatmmConfig,
-                  ADataType,
-                  BDataType,
-                  ck_tile::tuple<>,
-                  AccDataType,
-                  CDataType,
-                  ALayout,
-                  BLayout,
-                  ck_tile::tuple<>,
-                  CLayout,
-                  decltype(per_token_scale_dev_ptr),
-                  decltype(per_channel_scale_dev_ptr)>(a_dev_buf,
-                                                       b_shuffle_dev_buf,
-                                                       c_dev_buf,
-                                                       M,
-                                                       N,
-                                                       K,
-                                                       stride_A,
-                                                       stride_B,
-                                                       stride_C,
-                                                       kbatch,
-                                                       per_token_scale_dev_ptr,
-                                                       per_channel_scale_dev_ptr,
-                                                       n_warmup,
-                                                       n_repeat);
-    c_dev_buf.FromDevice(c_rslt_host.data());
     bool pass = true;
     if(arg_parser.get_int("v") == 1)
@@ -236,6 +210,34 @@ int run_flatmm_example_with_layouts(int argc,
     ck_tile::hip_check_error(hipFree(d_B));
     ck_tile::hip_check_error(hipFree(d_C));
+    invoke_flatmm<FlatmmConfig,
+                  ADataType,
+                  BDataType,
+                  ck_tile::tuple<>,
+                  AccDataType,
+                  CDataType,
+                  ALayout,
+                  BLayout,
+                  ck_tile::tuple<>,
+                  CLayout,
+                  decltype(per_token_scale_dev_ptr),
+                  decltype(per_channel_scale_dev_ptr)>(a_dev_buf,
+                                                       b_shuffle_dev_buf,
+                                                       c_dev_buf,
+                                                       M,
+                                                       N,
+                                                       K,
+                                                       stride_A,
+                                                       stride_B,
+                                                       stride_C,
+                                                       kbatch,
+                                                       per_token_scale_dev_ptr,
+                                                       per_channel_scale_dev_ptr,
+                                                       n_warmup,
+                                                       n_repeat);
+    c_dev_buf.FromDevice(c_rslt_host.data());
     c_gpu_ref_dev_buf.FromDevice(c_gpu_ref_host.data());
     const float max_accumulated_value =
         *std::max_element(c_gpu_ref_host.mData.begin(), c_gpu_ref_host.mData.end());

File 3 of 4:

@@ -32,6 +32,11 @@ __launch_bounds__(MaxThreadPerBlock, MinBlockPerCu)
 #endif
 }
+template <int MaxThreadPerBlock, typename Kernel, typename... Args>
+__launch_bounds__(MaxThreadPerBlock) __global__ void kentry2(Args... args)
+{
+    Kernel{}(args...);
+}
 //
 // return a anonymous functor(lambda) to be called later
 // the KernelImpl should be a class without non-static data member, or let's say
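
Note: kentry2 mirrors the kentry template just above it, but applies only the MaxThreadPerBlock launch bound (no MinBlockPerCu) and default-constructs the kernel functor inside a plain __global__ function. Judging from the GridSize() change in file 4 of this commit, the point is to have a concrete kernel symbol whose address can be handed to the HIP occupancy API. A minimal sketch of that usage, with MyKernel and MyArgs as hypothetical placeholders:

    // Sketch: query how many blocks of a kentry2 instantiation fit per CU.
    // MyKernel and MyArgs are stand-in names, not part of this commit.
    int max_blocks_per_cu = 0;
    hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(
        &max_blocks_per_cu,
        reinterpret_cast<void*>(kentry2<256, MyKernel, MyArgs>), // address of the __global__ symbol
        256, // threads per block, matching MaxThreadPerBlock
        0);  // dynamic LDS bytes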

File 4 of 4:

@@ -255,6 +255,34 @@ struct FlatmmKernel
     CK_TILE_HOST static constexpr auto GridSize(index_t M, index_t N, index_t KBatch)
     {
+        hipDeviceProp_t prop;
+        int deviceId = 0; // default device
+        constexpr int block_size = FlatmmKernel::BlockSize().x;
+        int dync_smem_size       = 0;
+        int maxActiveBlocksPerCU = 0;
+        [[maybe_unused]] auto e = hipGetDeviceProperties(&prop, deviceId);
+        e = hipOccupancyMaxActiveBlocksPerMultiprocessor(
+            &maxActiveBlocksPerCU,
+            reinterpret_cast<void*>(
+                kentry2<block_size,
+                        FlatmmKernel,
+                        FlatmmKernelArgs<FlatmmScalePointer<-1>, FlatmmScalePointer<-1>, 0>>),
+            block_size,
+            dync_smem_size);
+        const int persistent_block_size = prop.multiProcessorCount * maxActiveBlocksPerCU;
+        const int total_work_tile_cnt   = TilePartitioner::GridSize(M, N);
+        std::cout << "maxActiveBlocksPerCU: " << maxActiveBlocksPerCU
+                  << ", persistent_block_size: " << persistent_block_size
+                  << ", total_work_tile_cnt: " << total_work_tile_cnt << std::endl;
+        assert(KBatch == 1);
+        return dim3(min(persistent_block_size, total_work_tile_cnt), 1, KBatch);
         return dim3(TilePartitioner::GridSize(M, N), 1, KBatch);
     }
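
Note: the grid is now sized for a persistent kernel: at most one full wave of resident blocks (multiProcessorCount x maxActiveBlocksPerCU), and never more blocks than output tiles. The unconditional early return leaves the original one-block-per-tile return unreachable, and assert(KBatch == 1) pins this experiment to a single K batch. Illustrative arithmetic (the device numbers and tile sizes below are assumptions, not from the commit):

    // Suppose multiProcessorCount = 304 and maxActiveBlocksPerCU = 2:
    //   persistent_block_size = 304 * 2 = 608
    // A 4096 x 4096 output with 256 x 256 tiles gives 16 * 16 = 256 work tiles:
    //   grid.x = min(608, 256) = 256   // one block per tile, the loop body runs once
    // A 16384 x 16384 output gives 64 * 64 = 4096 work tiles:
    //   grid.x = min(608, 4096) = 608  // 608 persistent blocks share 4096 tiles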
@@ -751,37 +779,67 @@ struct FlatmmKernel
     CK_TILE_DEVICE void operator()(FlatmmKernelArgs<ScaleM, ScaleN, DsDataType::size()> kargs,
                                    int partition_idx = blockIdx.x) const
     {
-        const auto [iM, iN] = TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(partition_idx);
-        const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);
-        const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock);
-        const SplitKBatchOffset splitk_batch_offset(kargs);
-        // options
-        const ADataType* a_ptr =
-            static_cast<const ADataType*>(kargs.a_ptr) + splitk_batch_offset.a_k_split_offset;
-        const BDataType* b_flat_ptr =
-            static_cast<const BDataType*>(kargs.b_ptr) + splitk_batch_offset.b_k_split_offset;
-        EDataType* e_ptr = static_cast<EDataType*>(kargs.e_ptr);
-        // allocate LDS
-        __shared__ char smem_ptr_ping[GetSmemPingSize()];
-        __shared__ char smem_ptr_pong[GetSmemPongSize()];
-        if constexpr(!(EpiloguePipeline::MemoryOperation == memory_operation_enum::atomic_add &&
-                       EpiloguePipeline::GetVectorSizeC() % 2 != 0 &&
-                       is_any_of<EDataType, fp16_t, bf16_t>::value))
+        int total_work_tile_cnt = TilePartitioner::GridSize(kargs.M, kargs.N);
+        // GWS
+        const int voffset = 0;
+        const int vdata   = 1;
+        __shared__ int shared_part[1];
+        if(threadIdx.x == 0)
         {
-            constexpr auto scheduler_type = (FlatmmPipeline::NumWaveGroups == 1);
-            RunFlatmm<ScaleM, ScaleN, scheduler_type>(a_ptr,
-                                                      b_flat_ptr,
-                                                      kargs.ds_ptr,
-                                                      e_ptr,
-                                                      smem_ptr_ping,
-                                                      smem_ptr_pong,
-                                                      kargs,
-                                                      splitk_batch_offset,
-                                                      i_m,
-                                                      i_n);
+            asm volatile("global_atomic_add %0, %1, %2, %3 sc0; \n\t"
+                         "s_waitcnt vmcnt(0); \n\t"
+                         : "=v"(partition_idx)
+                         : "v"(voffset), "v"(vdata), "s"(kargs.a_ptr));
+            shared_part[0] = partition_idx % (1024 + 80);
         }
+        block_sync_lds();
+        partition_idx = shared_part[0];
+        while(partition_idx < total_work_tile_cnt)
+        {
+            const auto [iM, iN] =
+                TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(partition_idx);
+            const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);
+            const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock);
+            const SplitKBatchOffset splitk_batch_offset(kargs);
+            // options
+            const ADataType* a_ptr =
+                static_cast<const ADataType*>(kargs.a_ptr) + splitk_batch_offset.a_k_split_offset;
+            const BDataType* b_flat_ptr =
+                static_cast<const BDataType*>(kargs.b_ptr) + splitk_batch_offset.b_k_split_offset;
+            EDataType* e_ptr = static_cast<EDataType*>(kargs.e_ptr);
+            // allocate LDS
+            __shared__ char smem_ptr_ping[GetSmemPingSize()];
+            __shared__ char smem_ptr_pong[GetSmemPongSize()];
+            if constexpr(!(EpiloguePipeline::MemoryOperation == memory_operation_enum::atomic_add &&
+                           EpiloguePipeline::GetVectorSizeC() % 2 != 0 &&
+                           is_any_of<EDataType, fp16_t, bf16_t>::value))
+            {
+                constexpr auto scheduler_type = (FlatmmPipeline::NumWaveGroups == 1);
+                RunFlatmm<ScaleM, ScaleN, scheduler_type>(a_ptr,
+                                                          b_flat_ptr,
+                                                          kargs.ds_ptr,
+                                                          e_ptr,
+                                                          smem_ptr_ping,
+                                                          smem_ptr_pong,
+                                                          kargs,
+                                                          splitk_batch_offset,
+                                                          i_m,
+                                                          i_n);
+            }
+            if(threadIdx.x == 0)
+            {
+                asm volatile("global_atomic_add %0, %1, %2, %3 sc0; \n\t"
+                             "s_waitcnt vmcnt(0); \n\t"
+                             : "=v"(partition_idx)
+                             : "v"(voffset), "v"(vdata), "s"(kargs.a_ptr));
+                shared_part[0] = partition_idx % (1024 + 80);
+            }
+            block_sync_lds();
+            partition_idx = shared_part[0];
+        }
     }
 };
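
Note: the kernel body becomes a persistent work-stealing loop. Thread 0 of each block fetch-adds a global counter (the global_atomic_add asm with sc0 returns the pre-add value on gfx94x), publishes the claimed tile index to the rest of the block through LDS, and the block keeps claiming tiles until the counter passes total_work_tile_cnt. Two details look like debug scaffolding: the counter is stored through kargs.a_ptr, i.e. in the A matrix itself (which lines up with the new host-side memset of a_host's first 4 bytes in file 2), and the claimed index is wrapped with % (1024 + 80). A portable sketch of the same scheme using a dedicated counter and plain atomicAdd, where persistent_loop and tile_counter are assumed names, not part of this commit:

    // Sketch only: same work distribution without the raw asm.
    // Assumes the host zeroed *tile_counter before launch.
    __device__ void persistent_loop(int* tile_counter, int total_tiles)
    {
        __shared__ int next_tile;
        while(true)
        {
            if(threadIdx.x == 0)
                next_tile = atomicAdd(tile_counter, 1); // claim one tile, get the old value
            __syncthreads();                            // broadcast the claim via LDS
            const int tile = next_tile;
            if(tile >= total_tiles)
                break;
            // ... map tile -> (iM, iN) and run the GEMM pipeline for that tile ...
            __syncthreads(); // all lanes must read next_tile before it is rewritten
        }
    }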