diff --git a/include/ck_tile/core/tensor/tile_scatter_gather.hpp b/include/ck_tile/core/tensor/tile_scatter_gather.hpp
index 4b04fd513d..e6adc7d40b 100644
--- a/include/ck_tile/core/tensor/tile_scatter_gather.hpp
+++ b/include/ck_tile/core/tensor/tile_scatter_gather.hpp
@@ -125,7 +125,7 @@ struct tile_scatter_gather
 
         static constexpr auto get_space_filling_curve()
         {
-            constexpr auto tile_dstr = TileDstr{};
+            [[maybe_unused]] constexpr auto tile_dstr = TileDstr{};
 
             constexpr auto thread_tensor_lengths_ys =
                 to_sequence(tile_dstr.get_ys_to_d_descriptor().get_lengths());
@@ -309,7 +309,7 @@ struct tile_scatter_gather
     CK_TILE_DEVICE auto load(number<i_access_unsupport_>          = {},
                              bool_constant<oob_conditional_check> = {}) const
     {
-        constexpr auto tile_dstr = TileDstr{};
+        [[maybe_unused]] constexpr auto tile_dstr = TileDstr{};
         auto dst_tensor          = make_static_distributed_tensor<DataType>(tile_dstr);
         load(dst_tensor, number<i_access_unsupport_>{}, bool_constant<oob_conditional_check>{});
         return dst_tensor;
@@ -326,7 +326,7 @@ struct tile_scatter_gather
         using vector_t = typename Traits::vector_t;
         using SFC_Ys   = typename Traits::SFC_Ys;
 
-        constexpr auto tile_dstr = TileDstr{};
+        [[maybe_unused]] constexpr auto tile_dstr = TileDstr{};
 
         // loop over thread tensor space [y0, y1, ...]
         static_for<0, NumCoord, 1>{}([&](auto iCoord) {
@@ -418,7 +418,7 @@ struct tile_scatter_gather
         using vector_t      = typename Traits::vector_t;
         using SFC_Ys        = typename Traits::SFC_Ys;
 
-        constexpr auto tile_dstr = TileDstr{};
+        [[maybe_unused]] constexpr auto tile_dstr = TileDstr{};
 
         // Precompute invariant values outside loops
         const auto window_origin       = lds_tile.get_window_origin();
@@ -614,7 +614,7 @@ struct tile_scatter_gather
         using vector_t = typename Traits::vector_t;
         using SFC_Ys   = typename Traits::SFC_Ys;
 
-        constexpr auto tile_dstr = TileDstr{};
+        [[maybe_unused]] constexpr auto tile_dstr = TileDstr{};
 
         static_for<0, NumCoord, 1>{}([&](auto iCoord) {
             auto window_adaptor_thread_coord = pre_computed_coords_[iCoord][I0];
@@ -696,7 +696,7 @@ struct tile_scatter_gather
         using vector_t = typename Traits::vector_t;
         using SFC_Ys   = typename Traits::SFC_Ys;
 
-        constexpr auto tile_dstr = TileDstr{};
+        [[maybe_unused]] constexpr auto tile_dstr = TileDstr{};
         // printf("off %d\n", page_idx_[I0]);
         // loop over thread tensor space [y0, y1, ...]
         static_for<0, NumCoord, 1>{}([&](auto iCoord) {
diff --git a/include/ck_tile/ops/flatmm/pipeline/mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/flatmm/pipeline/mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp
index 17c88e4f08..159ed4d4c7 100644
--- a/include/ck_tile/ops/flatmm/pipeline/mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp
+++ b/include/ck_tile/ops/flatmm/pipeline/mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp
@@ -444,7 +444,7 @@ struct F16xMXF4FlatmmPipelineAGmemBGmemCRegV1
               typename BFlatBlockWindowTmp,
               typename DequantBFlatWindow>
     CK_TILE_HOST_DEVICE auto operator()(ADramBlockWindowTmp a_copy_dram_window_,
-                                        const AElementFunction& a_element_func,
+                                        [[maybe_unused]] const AElementFunction& a_element_func,
                                         const BFlatBlockWindowTmp& b_flat_dram_block_window_tmp,
                                         const DequantBFlatWindow& scale_b_flat_window,
                                         const index_t num_loop,
@@ -606,7 +606,7 @@ struct F16xMXF4FlatmmPipelineAGmemBGmemCRegV1
             scale_b_warp_tensor_pong;
 
         using ABlockTile = decltype(load_tile(a_copy_dram_window));
-        ABlockTile a_block_tile;
+        [[maybe_unused]] ABlockTile a_block_tile;
 
         enum
         {
@@ -621,7 +621,7 @@ struct F16xMXF4FlatmmPipelineAGmemBGmemCRegV1
                 if constexpr(prefill_location & PrefillAfterGemm)
                     async_load_tile(lds_tile_a, dram_tile_a);
             };
-        auto prefill_lds_a_stage2 = [&](auto lds_tile_a) {
+        auto prefill_lds_a_stage2 = [&]([[maybe_unused]] auto lds_tile_a) {
             // async_load_fence();
             // __builtin_amdgcn_s_waitcnt(0x03fc);
             // data has been stored in lds, no need more operation.
diff --git a/include/ck_tile/ops/flatmm/pipeline/mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp b/include/ck_tile/ops/flatmm/pipeline/mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp
index f34c682b0f..f5954c29ab 100644
--- a/include/ck_tile/ops/flatmm/pipeline/mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp
+++ b/include/ck_tile/ops/flatmm/pipeline/mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp
@@ -7,7 +7,7 @@
 
 namespace ck_tile {
 
-#define CKTILE_FLATMM_USE_BUFFER_LOAD_LDS_AS_POSSIBLE 0
+#define CKTILE_FLATMM_USE_BUFFER_LOAD_LDS_AS_POSSIBLE 1
 
 #if defined(__gfx950__)
 #define CKTILE_FLATMM_ARCH_SUPPORT_BUFFER_LOAD_LDS_DWORDx4 1