From f14ee90152a64850128a34144643a660239533f7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?=
Date: Tue, 31 Mar 2026 10:02:24 +0200
Subject: [PATCH] [CK][CK Tile] Force padding for atomic_add bf16 C tensor
 (#5842)

## Motivation

Force padding for the atomic_add bf16 C tensor to avoid memory faults (memfaults).

## Technical Details

- Add a global atomic add for bf16 and enable it on the supported targets.
- Add padding for atomic_add bf16, since the global atomic path lacks an out-of-bounds (OOB) check; a sketch of the mechanism follows this list.
- Remove padding for the non-contiguous dims in conv for the other cases.
- Minor bwd data conv fixes.
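For reviewers unfamiliar with the two atomic paths, here is a minimal sketch of the mechanism this PR relies on. It is not code from this patch: the alias `bf16x2_raw` and the function `global_atomic_add_bf16x2` are hypothetical names for illustration; only the clang builtin and the gfx942/gfx950 gating are taken from the diff.

```cpp
#include <hip/hip_runtime.h>

// Two bf16 values carried as a v2i16 vector, the operand shape the builtin expects.
using bf16x2_raw = short __attribute__((ext_vector_type(2)));

__device__ void global_atomic_add_bf16x2(bf16x2_raw* dst, bf16x2_raw val)
{
#if defined(__gfx942__) || defined(__gfx950__)
    // Emits global_atomic_pk_add_bf16: a plain VMEM atomic on a raw pointer.
    // Unlike buffer_atomic_add, which goes through a 128-bit wave buffer
    // resource whose hardware range check turns out-of-range lanes into
    // no-ops, this instruction has no OOB protection: an out-of-range dst
    // memfaults. Padding the C tensor keeps every computed address valid.
    __builtin_amdgcn_global_atomic_fadd_v2bf16(dst, val);
#else
    // Hypothetical fallback: other targets keep the buffer-resource path.
    (void)dst;
    (void)val;
#endif
}
```

This is why the kernels below compute a `pad_not_contiguous_dim` flag: with buffer atomics the hardware OOB check made padding of the non-contiguous dim unnecessary, while the bf16 global-atomic path must instead guarantee in-range addresses via padding.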
not implemented"); if constexpr(std::is_same::value) @@ -2931,16 +2966,27 @@ CK_TILE_DEVICE void amd_buffer_atomic_add(const thread_buffer& src_thread_ const bool dst_thread_element_valid, const index_t dst_element_space_size) { - const int32x4_t dst_wave_buffer_resource = - make_wave_buffer_resource(p_dst_wave, dst_element_space_size * sizeof(T)); - - index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T); +#if defined(__gfx942__) + if constexpr(std::is_same::value) + { + if(dst_thread_element_valid) + { + amd_global_atomic_add_impl(src_thread_data, + p_dst_wave + dst_thread_element_offset); + } + } + else + { +#endif + const int32x4_t dst_wave_buffer_resource = + make_wave_buffer_resource(p_dst_wave, dst_element_space_size * sizeof(T)); + index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T); #if CK_TILE_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK - uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000; + uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000; - amd_buffer_atomic_add_impl( - src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0); + amd_buffer_atomic_add_impl( + src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0); #else if(dst_thread_element_valid) { @@ -2948,6 +2994,9 @@ CK_TILE_DEVICE void amd_buffer_atomic_add(const thread_buffer& src_thread_ src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0); } #endif +#if defined(__gfx942__) + } +#endif } template & dst_thr } } +template +CK_TILE_DEVICE void +amd_global_atomic_add_impl([[maybe_unused]] const thread_buffer& src_thread_data, + [[maybe_unused]] T* addr) +{ + static_assert((std::is_same::value && (N == 2 || N == 4 || N == 8)) || + (std::is_same::value && (N == 2 || N == 4 || N == 8)), + "wrong! not implemented"); + +#if HAS_GLOBAL_ATOMIC_PK_ADD_BUILTIN + if constexpr(std::is_same::value) + { + static_for<0, N / 2, 1>{}([&](auto i) { + __builtin_amdgcn_global_atomic_fadd_v2bf16( + bit_cast(addr) + i, + src_thread_data.template get_as()[i]); + }); + } + else + { + static_assert(false, "Not supported!"); + } +#else + static_assert(false, "Not supported!"); +#endif +} + template CK_TILE_DEVICE void amd_buffer_atomic_add_impl(const thread_buffer& src_thread_data, int32x4_t dst_wave_buffer_resource, @@ -2151,8 +2182,11 @@ CK_TILE_DEVICE void amd_buffer_atomic_add_impl(const thread_buffer& src_th { static_assert((std::is_same::value && (N == 1 || N == 2 || N == 4)) || (std::is_same::value && (N == 2 || N == 4 || N == 8)) || - (std::is_same::value && (N == 2 || N == 4 || N == 8)) || - (std::is_same::value && (N == 1 || N == 2 || N == 4)), + (std::is_same::value && (N == 1 || N == 2 || N == 4)) +#if defined(__gfx950__) + || (std::is_same::value && (N == 2 || N == 4 || N == 8)) +#endif + , "wrong! 
not implemented"); if constexpr(std::is_same::value) @@ -2759,16 +2793,28 @@ CK_TILE_DEVICE void amd_buffer_atomic_add(const thread_buffer& src_thread_ const bool dst_thread_element_valid, const index_t dst_element_space_size) { - const int32x4_t dst_wave_buffer_resource = - make_wave_buffer_resource(p_dst_wave, dst_element_space_size * sizeof(T)); +#if defined(__gfx942__) + if constexpr(std::is_same::value) + { + if(dst_thread_element_valid) + { + amd_global_atomic_add_impl(src_thread_data, + p_dst_wave + dst_thread_element_offset); + } + } + else + { +#endif + const int32x4_t dst_wave_buffer_resource = + make_wave_buffer_resource(p_dst_wave, dst_element_space_size * sizeof(T)); - index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T); + index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T); #if CK_TILE_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK - uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000; + uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000; - amd_buffer_atomic_add_impl( - src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0); + amd_buffer_atomic_add_impl( + src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0); #else if(dst_thread_element_valid) { @@ -2776,6 +2822,9 @@ CK_TILE_DEVICE void amd_buffer_atomic_add(const thread_buffer& src_thread_ src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0); } #endif +#if defined(__gfx942__) + } +#endif } template , int32_t> || std::is_same_v, float> || (std::is_same_v, half_t> && scalar_per_x_vector % 2 == 0) -#if defined(__gfx950__) // only gfx950 support atomic_pk_add_bf16 +#if defined(__gfx942__) || defined(__gfx950__) // only gfx942 and gfx950 support atomic_pk_add_bf16 || (std::is_same_v, bfloat16_t> && scalar_per_x_vector % 2 == 0) #endif @@ -642,7 +642,7 @@ struct buffer_view, float> || (std::is_same_v, half_t> && scalar_per_x_vector % 2 == 0) -#if defined(__gfx950__) // only gfx950 support atomic_pk_add_bf16 +#if defined(__gfx942__) || defined(__gfx950__) // only gfx942 and gfx950 support atomic_pk_add_bf16 || (std::is_same_v, bfloat16_t> && scalar_per_x_vector % 2 == 0) #endif diff --git a/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp index 37ed8ce49a..f5166cfdcb 100644 --- a/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp +++ b/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp @@ -1021,6 +1021,11 @@ struct UniversalGemmKernel const auto& e_tensor_view = make_tensor_view(e_ptr, e_desc); + // For bf16_t and atomic_add global_atomic_add is used instead of buffer_atomic_add + // Add padding for not contiguous dim due to the lack of OOB check + constexpr bool pad_not_contiguous_dim = + std::is_same_v && DstInMemOp == memory_operation_enum::atomic_add; + // Step 2: Create padded view const auto& e_pad_view = [&]() { if constexpr(std::is_same_v) @@ -1028,14 +1033,14 @@ struct UniversalGemmKernel return pad_tensor_view(e_tensor_view, make_tuple(number{}, number{}), - sequence{}); + sequence{}); } else { return pad_tensor_view(e_tensor_view, make_tuple(number{}, number{}), - sequence{}); + sequence{}); } }(); diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp index fb82d77fe6..801207106b 100644 --- 
--- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp
+++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp
@@ -531,11 +531,11 @@ struct GroupedConvolutionBackwardDataKernel
 
     static constexpr index_t kBlockSize = GemmPipeline::BlockSize;
 
-    using InDataType  = remove_cvref_t;
+    using OutDataType = remove_cvref_t;
     using WeiDataType = remove_cvref_t;
     using DsDataType  = remove_cvref_t;
-    using OutDataType = remove_cvref_t;
+    using InDataType  = remove_cvref_t;
 
     using GroupedConvBwdDataKernelArgsSpecialized = GroupedConvBwdDataKernelArgs;
@@ -561,7 +561,7 @@ struct GroupedConvolutionBackwardDataKernel
         constexpr auto NumGroupsToMerge = GroupedConvTraitsType_::NumGroupsToMerge;
         // clang-format off
         return concat('_', "grouped_convolution_backward_data",
-                      gemm_prec_str(),
+                      gemm_prec_str(),
                       InLayout::name, WeiLayout::name, OutLayout::name,
@@ -632,7 +632,7 @@ struct GroupedConvolutionBackwardDataKernel
         const auto& a_pad_view = pad_tensor_view(
             a_tensor_view,
             make_tuple(number{}, number{}),
-            sequence{});
+            sequence{});
 
         // Step 3: Create tile window
         auto a_block_window = make_tile_window(
@@ -644,7 +644,7 @@ struct GroupedConvolutionBackwardDataKernel
     }
 
     CK_TILE_DEVICE static auto
-    MakeBBlockWindow(const InDataType* b_ptr,
+    MakeBBlockWindow(const WeiDataType* b_ptr,
                      const GroupedConvBwdDataKernelArgsSpecialized& kargs,
                      const index_t group_id,
                      const index_t i_n,
@@ -658,7 +658,7 @@ struct GroupedConvolutionBackwardDataKernel
         const auto& b_pad_view = pad_tensor_view(
             b_tensor_view,
             make_tuple(number{}, number{}),
-            sequence{});
+            sequence{});
 
         // Step 3: Create tile window
         auto b_block_window = make_tile_window(
@@ -681,14 +681,14 @@ struct GroupedConvolutionBackwardDataKernel
             [&](auto i) {
                 // Step 1: Create tensor view for D
                 const auto& d_tensor_view = make_tensor_view(
-                    static_cast(ds_ptr[i]), kargs.c_grid_descs_m_n[group_id]);
+                    static_cast(ds_ptr[i]), kargs.c_grid_descs_m_n[group_id]);
 
                 // Step 2: Create padded view
                 const auto& d_pad_view =
                     pad_tensor_view(d_tensor_view,
                                     make_tuple(number{}, number{}),
-                                    sequence{});
+                                    sequence{});
 
                 // Step 3: Create tile window
                 return make_tile_window(d_pad_view,
@@ -703,7 +703,7 @@ struct GroupedConvolutionBackwardDataKernel
 
     template
     CK_TILE_DEVICE static auto
-    MakeCBlockWindow(WeiDataType* c_ptr,
+    MakeCBlockWindow(InDataType* c_ptr,
                      const GroupedConvBwdDataKernelArgsSpecialized& kargs,
                      const index_t group_id,
                      const index_t i_m,
@@ -713,11 +713,20 @@ struct GroupedConvolutionBackwardDataKernel
         const auto& c_tensor_view = make_tensor_view(
             c_ptr, kargs.c_grid_descs_m_n[group_id]);
 
+        // For bf16_t with atomic_add, global_atomic_add is used instead of buffer_atomic_add.
+        // Add padding for the non-contiguous dim due to the lack of an OOB check.
+        // Not needed on gfx950, where bf16 can use buffer_atomic_add.
+#if defined(__gfx950__)
+        constexpr bool pad_not_contiguous_dim = false;
+#else
+        constexpr bool pad_not_contiguous_dim =
+            std::is_same_v && DstInMemOp == memory_operation_enum::atomic_add;
+#endif
         // Step 2: Create padded view
         const auto& c_pad_view = pad_tensor_view(
             c_tensor_view,
             make_tuple(number{}, number{}),
-            sequence{});
+            sequence{});
 
         // Step 3: Create tile window
         auto c_block_window = make_tile_window(
@@ -739,7 +748,7 @@ struct GroupedConvolutionBackwardDataKernel
             }
         }
         if constexpr(GroupedConvTraitsType_::VectorSizeC % 2 != 0 &&
-                     is_any_of::value)
+                     is_any_of::value)
        {
             if(kargs.k_batch != 1)
             {
@@ -862,133 +871,6 @@ struct GroupedConvolutionBackwardDataKernel
         return true;
     }
 
-    template
-    CK_TILE_DEVICE static auto
-    MakeGemmTensorViews(const OutDataType* a_ptr,
-                        const InDataType* b_ptr,
-                        const std::array& ds_ptr,
-                        WeiDataType* c_ptr,
-                        const GroupedConvBwdDataKernelArgsSpecialized& kargs,
-                        const index_t group_id)
-    {
-        static_assert(!GemmPipeline::BlockGemmShape::PermuteA, "Not implemented!");
-        static_assert(!GemmPipeline::BlockGemmShape::PermuteB, "Not implemented!");
-        const auto& a_tensor_view = [&]() {
-            return make_tensor_view(
-                a_ptr,
-                kargs.a_grid_descs_m_k[group_id]); // A: out
-        }();
-
-        const auto& b_tensor_view = [&]() {
-            return make_tensor_view(
-                b_ptr,
-                kargs.b_grid_descs_n_k[group_id]); // B: weight
-        }();
-
-        const auto& c_tensor_view = [&]() {
-            return make_tensor_view(
-                c_ptr, kargs.c_grid_descs_m_n[group_id]);
-        }();
-
-        const auto& ds_tensor_view = generate_tuple(
-            [&](auto i) {
-                static_assert(std::is_same_v, OutLayout>,
-                              "Not supported!");
-                static_assert(std::is_same_v,
-                              "Not supported!");
-                static_assert(std::is_same_v, OutDataType>,
-                              "Not supported!");
-
-                return make_tensor_view(
-                    static_cast(ds_ptr[i]), kargs.c_grid_descs_m_n[group_id]);
-            },
-            number{});
-
-        return make_tuple(a_tensor_view, b_tensor_view, ds_tensor_view, c_tensor_view);
-    }
-
-    template
-    CK_TILE_DEVICE static auto MakeGemmPadViews(const TensorView& views)
-    {
-        const auto& a_pad_view = [&]() {
-            const auto& a_tensor_view = views.at(I0);
-            return pad_tensor_view(a_tensor_view,
-                                   make_tuple(number{},
-                                              number{}),
-                                   sequence{});
-        }();
-
-        const auto& b_pad_view = [&]() {
-            const auto& b_tensor_view = views.at(I1);
-            return pad_tensor_view(b_tensor_view,
-                                   make_tuple(number{},
-                                              number{}),
-                                   sequence{});
-        }();
-
-        const auto& ds_tensor_view = views.at(I2);
-        const auto& ds_pad_view = generate_tuple(
-            [&](auto i) {
-                return pad_tensor_view(ds_tensor_view[i],
-                                       make_tuple(number{},
-                                                  number{}),
-                                       sequence{});
-            },
-            number{});
-
-        const auto& c_pad_view = [&]() {
-            const auto& c_tensor_view = views.at(I3);
-            return pad_tensor_view(c_tensor_view,
-                                   make_tuple(number{},
-                                              number{}),
-                                   sequence{});
-        }();
-
-        return make_tuple(a_pad_view, b_pad_view, ds_pad_view, c_pad_view);
-    }
-
-    template
-    CK_TILE_DEVICE static auto MakeGemmTileWindows(const PadView& views,
-                                                   const index_t i_m,
-                                                   const index_t i_n,
-                                                   const index_t i_k)
-    {
-        const auto& a_pad_view  = views.at(I0);
-        const auto& b_pad_view  = views.at(I1);
-        const auto& ds_pad_view = views.at(I2);
-        const auto& c_pad_view  = views.at(I3);
-
-        const auto& a_block_window = [&]() {
-            return make_tile_window(a_pad_view,
-                                    make_tuple(number{},
-                                               number{}),
-                                    {i_m, i_k});
-        }();
-
-        const auto& b_block_window = [&]() {
-            return make_tile_window(b_pad_view,
-                                    make_tuple(number{},
-                                               number{}),
-                                    {i_k, i_n});
-        }();
-
-        const auto ds_block_window = generate_tuple(
-            [&](auto i) {
-                return make_tile_window(ds_pad_view[i],
-                                        make_tuple(number{},
-                                                   number{}),
-                                        {i_m, i_n});
-            },
-            number{});
-
-        auto c_block_window = make_tile_window(
-            c_pad_view,
-            make_tuple(number{}, number{}),
-            {i_m, i_n});
-
-        return make_tuple(a_block_window, b_block_window, ds_block_window, c_block_window);
-    }
-
     /**
      * @brief Runs single GEMM problem cooperatively by whole workgroup.
      *
     */
     CK_TILE_DEVICE static void RunGemm(const OutDataType* a_ptr,
-                                       const InDataType* b_ptr,
+                                       const WeiDataType* b_ptr,
                                        const std::array& ds_ptr,
-                                       WeiDataType* c_ptr,
+                                       InDataType* c_ptr,
                                        void* smem_ptr_0,
                                        const GroupedConvBwdDataKernelArgsSpecialized& kargs,
                                        const index_t splitted_k,
@@ -1044,7 +926,7 @@ struct GroupedConvolutionBackwardDataKernel
         else
         {
             if constexpr(!(GroupedConvTraitsType_::VectorSizeC % 2 != 0 &&
-                           is_any_of::value))
+                           is_any_of::value))
             {
                 auto c_block_window = MakeCBlockWindow(
                     c_ptr, kargs, group_id, block_idx_m, block_idx_n);
diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp
index 40ace7cbbe..0c21881b1f 100644
--- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp
+++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp
@@ -869,10 +869,19 @@ struct GroupedConvolutionBackwardWeightKernel
         const auto& c_tensor_view = make_tensor_view(c_ptr, kargs.c_grid_desc_m_n);
 
+        // For bf16_t with atomic_add, global_atomic_add is used instead of buffer_atomic_add.
+        // Add padding for the non-contiguous dim due to the lack of an OOB check.
+        // Not needed on gfx950, where bf16 can use buffer_atomic_add.
+#if defined(__gfx950__)
+        constexpr bool pad_not_contiguous_dim = false;
+#else
+        constexpr bool pad_not_contiguous_dim =
+            std::is_same_v && DstInMemOp == memory_operation_enum::atomic_add;
+#endif
         const auto& c_pad_view = pad_tensor_view(
             c_tensor_view,
             make_tuple(number{}, number{}),
-            sequence{});
+            sequence{});
 
         return make_tile_window(
             c_pad_view,
@@ -905,7 +914,7 @@ struct GroupedConvolutionBackwardWeightKernel
                 return pad_tensor_view(ds_tensor_view[i],
                                        make_tuple(number{}, number{}),
-                                       sequence{});
+                                       sequence{});
             },
             number{});
 
@@ -933,7 +942,7 @@ struct GroupedConvolutionBackwardWeightKernel
             pad_tensor_view(b_tensor_view,
                             make_tuple(number{} * kargs.k_batch,
                                        number{}),
-                            sequence{});
+                            sequence{});
 
         return make_tile_window(
             b_pad_view,
@@ -955,7 +964,7 @@ struct GroupedConvolutionBackwardWeightKernel
             pad_tensor_view(a_tensor_view,
                             make_tuple(number{} * kargs.k_batch,
                                        number{}),
-                            sequence{});
+                            sequence{});
 
         return make_tile_window(
             a_pad_view,
diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp
index 1eb0ee2022..be8fe12f1b 100644
--- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp
+++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp
@@ -898,7 +898,7 @@ struct GroupedConvolutionForwardKernel
             pad_tensor_view(a_tensor_view,
                             make_tuple(number{}, number{}),
-                            sequence{});
+                            sequence{});
 
         // Step 3: Create tile window
         return make_tile_window(a_pad_view,
@@ -924,7 +924,7 @@ struct GroupedConvolutionForwardKernel
             pad_tensor_view(a_tensor_view,
                             make_tuple(number{}, number{}),
-                            sequence{});
+                            sequence{});
 
         // Step 3: Create tile window
         return make_tile_window(a_pad_view,
@@ -945,7 +945,7 @@ struct GroupedConvolutionForwardKernel
         const auto& b_pad_view = pad_tensor_view(
             b_tensor_view,
             make_tuple(number{}, number{}),
-            sequence{});
+            sequence{});
 
         // Step 3: Create tile window
         return make_tile_window(
@@ -981,7 +981,7 @@ struct GroupedConvolutionForwardKernel
                 return pad_tensor_view(ds_tensor_view[i],
                                        make_tuple(number{}, number{}),
-                                       sequence{});
+                                       sequence{});
             },
             number{});
 
@@ -1006,11 +1006,20 @@ struct GroupedConvolutionForwardKernel
         const auto& c_tensor_view = make_tensor_view(c_ptr, c_desc);
 
+        // For bf16_t with atomic_add, global_atomic_add is used instead of buffer_atomic_add.
+        // Add padding for the non-contiguous dim due to the lack of an OOB check.
+        // Not needed on gfx950, where bf16 can use buffer_atomic_add.
+#if defined(__gfx950__)
+        constexpr bool pad_not_contiguous_dim = false;
+#else
+        constexpr bool pad_not_contiguous_dim =
+            std::is_same_v && DstInMemOp == memory_operation_enum::atomic_add;
+#endif
         // Step 2: Create padded view
         const auto& c_pad_view = pad_tensor_view(
             c_tensor_view,
             make_tuple(number{}, number{}),
-            sequence{});
+            sequence{});
 
         // Step 3: Create tile window
         return make_tile_window(