diff --git a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp index 906f3f1933..42886b8ced 100644 --- a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp +++ b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp @@ -1414,7 +1414,7 @@ CK_TILE_DEVICE thread_buffer amd_buffer_load_impl(int32x4_t src_wave_buffe (std::is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32) || (std::is_same::value && - (N == 1 || N == 2 || N == 4 || N == 8 || N == 16))), + (N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32))), "wrong! not implemented"); using rtn_type = thread_buffer; @@ -1713,9 +1713,8 @@ CK_TILE_DEVICE void amd_async_buffer_load(CK_TILE_LDS_ADDR T* smem, ignore = src_immediate_addr_offset; #if defined(__gfx950__) - static_assert(bytes == 16, "wrong! not implemented vector size"); - // static_assert(bytes == 4 || bytes == 12 || bytes == 16, - // "wrong! only support in dword, dwordx3, dwordx4"); + static_assert(bytes == 4 || bytes == 12 || bytes == 16, + "wrong! only support in dword, dwordx3, dwordx4"); src_wave_addr_offset = 0; #else static_assert(bytes == 4, "wrong! not implemented vector size");