From fa8baf3ce670b777bd19db8f1056cd043dee1ef0 Mon Sep 17 00:00:00 2001
From: Ali Nouri <anouri@amd.com>
Date: Fri, 26 Sep 2025 22:35:44 +0000
Subject: [PATCH] No atomic passes!

---
 .../instances/fused_moegemm_api_internal.hpp     | 16 ++++++++++++----
 .../fused_moe/kernel/fused_moegemm_kernel.hpp    |  1 +
 2 files changed, 13 insertions(+), 4 deletions(-)
diff --git a/example/ck_tile/15_fused_moe/instances/fused_moegemm_api_internal.hpp b/example/ck_tile/15_fused_moe/instances/fused_moegemm_api_internal.hpp
index e504be82e1..43b5cd510f 100644
--- a/example/ck_tile/15_fused_moe/instances/fused_moegemm_api_internal.hpp
+++ b/example/ck_tile/15_fused_moe/instances/fused_moegemm_api_internal.hpp
@@ -5,6 +5,10 @@
 
 #include "fused_moegemm_api_traits.hpp"
 #include "ck_tile/ops/fused_moe.hpp"
+#include "ck_tile/ops/fused_moe/kernel/fused_moegemm_tile_partitioner_no_atomic.hpp"
+#include "ck_tile/ops/fused_moe/kernel/fused_moegemm_tile_partitioner_debug.hpp"
+#include "ck_tile/ops/fused_moe/kernel/fused_moegemm_tile_partitioner_simple_no_atomic.hpp"
+#include "ck_tile/ops/fused_moe/pipeline/fused_moegemm_pipeline_flatmm_policy_no_atomic.hpp"
 #include <iostream>
 
 template <ck_tile::index_t... Is>
@@ -14,7 +18,8 @@ using S = ck_tile::sequence<Is...>;
 template <typename Ts_>
 float fused_moegemm_(const ck_tile::stream_config& s, fused_moegemm_args a)
 {
-    using f_traits = ck_tile::FusedMoeGemmTraits<Ts_::GateOnly, Ts_::FusedQuant == 1, 1 /*atomic*/>;
+    // STEP 1: Test no-atomic with simple partitioner
+    using f_traits = ck_tile::FusedMoeGemmTraits<Ts_::GateOnly, Ts_::FusedQuant == 1, 0 /*NO atomic*/>;
     using f_shape  = ck_tile::FusedMoeGemmShape<typename Ts_::BlockTile_0,
                                                 typename Ts_::WarpPerBlock_0,
                                                 typename Ts_::WarpTile_0,
@@ -48,11 +53,14 @@ float fused_moegemm_(const ck_tile::stream_config& s, fused_moegemm_args a)
                                                            f_traits>;
 
     // using f_pipeline    = ck_tile::FusedMoeGemmPipeline_FlatmmEx<f_problem>;
-    using f_pipeline    = ck_tile::FusedMoeGemmPipeline_FlatmmUk<f_problem>;
-    using f_partitioner = ck_tile::FusedMoeGemmTilePartitioner_Linear<f_shape>;
+    // FINAL: Complete no-atomic implementation
+    using f_pipeline    = ck_tile::FusedMoeGemmPipeline_FlatmmUk<f_problem, ck_tile::FusedMoeGemmPipelineFlatmmPolicy_NoAtomic>;
+    // FINAL: Production-ready no-atomic partitioner with proper validation
+    using f_partitioner = ck_tile::FusedMoeGemmTilePartitioner_NoAtomic<f_shape>;
     using f_kernel      = ck_tile::FusedMoeGemmKernel<f_partitioner, f_pipeline, void>;
 
-    const dim3 grids                       = dim3(1, 1, 1); //f_kernel::GridSize(a);
+    // CHANGE 3: Use proper grid calculation (no longer hardcoded)
+    const dim3 grids                       = f_kernel::GridSize(a);
     const dim3 blocks                      = f_kernel::BlockSize();
     constexpr ck_tile::index_t kBlockPerCu = 1;
 
diff --git a/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_kernel.hpp b/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_kernel.hpp
index 6d95decaee..ba886b7392 100644
--- a/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_kernel.hpp
+++ b/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_kernel.hpp
@@ -382,6 +382,7 @@ struct FusedMoeGemmKernel
                 return d_window_;
             }();
 
+            static_assert(false && "This code path should be hit");
             auto o_window = [&]() {
                 ODataType* o_ptr = reinterpret_cast<ODataType*>(kargs.o_ptr);
                 auto o_view_     = make_naive_tensor_view<address_space_enum::global,