diff --git a/include/ck_tile/core/arch/amd_buffer_addressing.hpp b/include/ck_tile/core/arch/amd_buffer_addressing.hpp index 5c54f6cda2..c37af77ad4 100644 --- a/include/ck_tile/core/arch/amd_buffer_addressing.hpp +++ b/include/ck_tile/core/arch/amd_buffer_addressing.hpp @@ -413,7 +413,8 @@ struct buffer_store_if<8> { static_assert(sizeof(T) == 8); auto save_exec = __builtin_amdgcn_read_exec(); - using mbuf_t = fp32x2_t; + // TODO: ugly. rocm-6.0/6.1 seems to need bit_cast to same base type to avoid scratch + using mbuf_t = ext_vector_t; asm volatile("v_cmpx_le_u32 exec, 1, %5\n" "buffer_store_dwordx2 %0, %1, %2, %3 offen offset:%4\n" "s_mov_b64 exec %6" diff --git a/include/ck_tile/core/numeric/half.hpp b/include/ck_tile/core/numeric/half.hpp index 4a01a5a985..60ef6c978e 100644 --- a/include/ck_tile/core/numeric/half.hpp +++ b/include/ck_tile/core/numeric/half.hpp @@ -108,8 +108,8 @@ double fp16_to_double_hip(const fp16_hip_t& x) { return static_cast(fp16 CK_TILE_HOST_DEVICE fp16_hip_t float_to_fp16_hip(const float& x) { - // return __float2half(x); - return static_cast(x); + return __float2half(x); + // return static_cast(x); } CK_TILE_HOST_DEVICE