diff --git a/include/ck_tile/core/tensor/tile_scatter_gather.hpp b/include/ck_tile/core/tensor/tile_scatter_gather.hpp index 4b04fd513d..e6adc7d40b 100644 --- a/include/ck_tile/core/tensor/tile_scatter_gather.hpp +++ b/include/ck_tile/core/tensor/tile_scatter_gather.hpp @@ -125,7 +125,7 @@ struct tile_scatter_gather static constexpr auto get_space_filling_curve() { - constexpr auto tile_dstr = TileDstr{}; + [[maybe_unused]] constexpr auto tile_dstr = TileDstr{}; constexpr auto thread_tensor_lengths_ys = to_sequence(tile_dstr.get_ys_to_d_descriptor().get_lengths()); @@ -309,7 +309,7 @@ struct tile_scatter_gather CK_TILE_DEVICE auto load(number = {}, bool_constant = {}) const { - constexpr auto tile_dstr = TileDstr{}; + [[maybe_unused]] constexpr auto tile_dstr = TileDstr{}; auto dst_tensor = make_static_distributed_tensor(tile_dstr); load(dst_tensor, number{}, bool_constant{}); return dst_tensor; @@ -326,7 +326,7 @@ struct tile_scatter_gather using vector_t = typename Traits::vector_t; using SFC_Ys = typename Traits::SFC_Ys; - constexpr auto tile_dstr = TileDstr{}; + [[maybe_unused]] constexpr auto tile_dstr = TileDstr{}; // loop over thread tensor space [y0, y1, ...] static_for<0, NumCoord, 1>{}([&](auto iCoord) { @@ -418,7 +418,7 @@ struct tile_scatter_gather using vector_t = typename Traits::vector_t; using SFC_Ys = typename Traits::SFC_Ys; - constexpr auto tile_dstr = TileDstr{}; + [[maybe_unused]] constexpr auto tile_dstr = TileDstr{}; // Precompute invariant values outside loops const auto window_origin = lds_tile.get_window_origin(); @@ -614,7 +614,7 @@ struct tile_scatter_gather using vector_t = typename Traits::vector_t; using SFC_Ys = typename Traits::SFC_Ys; - constexpr auto tile_dstr = TileDstr{}; + [[maybe_unused]] constexpr auto tile_dstr = TileDstr{}; static_for<0, NumCoord, 1>{}([&](auto iCoord) { auto window_adaptor_thread_coord = pre_computed_coords_[iCoord][I0]; @@ -696,7 +696,7 @@ struct tile_scatter_gather using vector_t = typename Traits::vector_t; using SFC_Ys = typename Traits::SFC_Ys; - constexpr auto tile_dstr = TileDstr{}; + [[maybe_unused]] constexpr auto tile_dstr = TileDstr{}; // printf("off %d\n", page_idx_[I0]); // loop over thread tensor space [y0, y1, ...] static_for<0, NumCoord, 1>{}([&](auto iCoord) { diff --git a/include/ck_tile/ops/flatmm/pipeline/mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/flatmm/pipeline/mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp index 17c88e4f08..159ed4d4c7 100644 --- a/include/ck_tile/ops/flatmm/pipeline/mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp +++ b/include/ck_tile/ops/flatmm/pipeline/mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp @@ -444,7 +444,7 @@ struct F16xMXF4FlatmmPipelineAGmemBGmemCRegV1 typename BFlatBlockWindowTmp, typename DequantBFlatWindow> CK_TILE_HOST_DEVICE auto operator()(ADramBlockWindowTmp a_copy_dram_window_, - const AElementFunction& a_element_func, + [[maybe_unused]] const AElementFunction& a_element_func, const BFlatBlockWindowTmp& b_flat_dram_block_window_tmp, const DequantBFlatWindow& scale_b_flat_window, const index_t num_loop, @@ -606,7 +606,7 @@ struct F16xMXF4FlatmmPipelineAGmemBGmemCRegV1 scale_b_warp_tensor_pong; using ABlockTile = decltype(load_tile(a_copy_dram_window)); - ABlockTile a_block_tile; + [[maybe_unused]] ABlockTile a_block_tile; enum { @@ -621,7 +621,7 @@ struct F16xMXF4FlatmmPipelineAGmemBGmemCRegV1 if constexpr(prefill_location & PrefillAfterGemm) async_load_tile(lds_tile_a, dram_tile_a); }; - auto prefill_lds_a_stage2 = [&](auto lds_tile_a) { + auto prefill_lds_a_stage2 = [&]([[maybe_unused]] auto lds_tile_a) { // async_load_fence(); // __builtin_amdgcn_s_waitcnt(0x03fc); // data has been stored in lds, no need more operation. diff --git a/include/ck_tile/ops/flatmm/pipeline/mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp b/include/ck_tile/ops/flatmm/pipeline/mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp index f34c682b0f..f5954c29ab 100644 --- a/include/ck_tile/ops/flatmm/pipeline/mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp +++ b/include/ck_tile/ops/flatmm/pipeline/mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp @@ -7,7 +7,7 @@ namespace ck_tile { -#define CKTILE_FLATMM_USE_BUFFER_LOAD_LDS_AS_POSSIBLE 0 +#define CKTILE_FLATMM_USE_BUFFER_LOAD_LDS_AS_POSSIBLE 1 #if defined(__gfx950__) #define CKTILE_FLATMM_ARCH_SUPPORT_BUFFER_LOAD_LDS_DWORDx4 1