Merge commit '9ed9539ddfcdd8de4180fb992b718b57e1cadfae' into develop

This commit is contained in:
assistant-librarian[bot]
2025-12-01 07:15:08 +00:00
parent 0dff04aa27
commit 572df7d4d1

View File

@@ -360,10 +360,12 @@ CK_TILE_DEVICE auto cast_tile(const SrcTensor& src_tensor)
(SrcTensor::get_thread_buffer_size() % 2 == 0))
return impl::cast_tile_pkrtz_fp16_fp32<DstType, SrcTensor>(src_tensor);
#endif
#if 0 // currently it causes extra spills in qr_async_vr pipeline of fmha_fwd
else if constexpr((std::is_same_v<DstType, fp16_t> || std::is_same_v<DstType, bf16_t>) &&
std::is_same_v<typename SrcTensor::DataType, float> &&
(SrcTensor::get_thread_buffer_size() % 2 == 0))
return impl::cast_tile_pk_fp16bf16_fp32<DstType, SrcTensor>(src_tensor);
#endif
#if CK_TILE_USE_SUBDWORD_TILE_CAST
else if constexpr(sizeof(DstType) < 4 || sizeof(typename SrcTensor::DataType) < 4)
return impl::cast_tile_opt_subdword<DstType, SrcTensor>(src_tensor);