From e971702a8c9eb920197a56dcfeda5c6cf58502bc Mon Sep 17 00:00:00 2001 From: Rostyslav Geyyer Date: Wed, 19 Feb 2025 20:54:42 +0000 Subject: [PATCH] Pack conversion in a loop --- include/ck/utility/scaled_type_convert.hpp | 70 +++------------------- 1 file changed, 7 insertions(+), 63 deletions(-) diff --git a/include/ck/utility/scaled_type_convert.hpp b/include/ck/utility/scaled_type_convert.hpp index b810c881b2..51a08e4019 100644 --- a/include/ck/utility/scaled_type_convert.hpp +++ b/include/ck/utility/scaled_type_convert.hpp @@ -405,70 +405,14 @@ inline __host__ __device__ float32_t scaled_type_convert(e8m value.f4x32_array = x; float2_t op; float32_t ret; - // TODO: pack in a loop - op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(value.fp4x2[0], type_convert(scale), 0); - ret[0] = op[1]; - ret[1] = op[0]; + float f_scale = type_convert(scale); - op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(value.fp4x2[1], type_convert(scale), 0); - ret[2] = op[1]; - ret[3] = op[0]; - - op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(value.fp4x2[2], type_convert(scale), 0); - ret[4] = op[1]; - ret[5] = op[0]; - - op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(value.fp4x2[3], type_convert(scale), 0); - ret[6] = op[1]; - ret[7] = op[0]; - - op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(value.fp4x2[4], type_convert(scale), 0); - ret[8] = op[1]; - ret[9] = op[0]; - - op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(value.fp4x2[5], type_convert(scale), 0); - ret[10] = op[1]; - ret[11] = op[0]; - - op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(value.fp4x2[6], type_convert(scale), 0); - ret[12] = op[1]; - ret[13] = op[0]; - - op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(value.fp4x2[7], type_convert(scale), 0); - ret[14] = op[1]; - ret[15] = op[0]; - - op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(value.fp4x2[8], type_convert(scale), 0); - ret[16] = op[1]; - ret[17] = op[0]; - - op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(value.fp4x2[9], type_convert(scale), 0); - ret[18] = op[1]; - ret[19] = op[0]; - - op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(value.fp4x2[10], type_convert(scale), 0); - ret[20] = op[1]; - ret[21] = op[0]; - - op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(value.fp4x2[11], type_convert(scale), 0); - ret[22] = op[1]; - ret[23] = op[0]; - - op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(value.fp4x2[12], type_convert(scale), 0); - ret[24] = op[1]; - ret[25] = op[0]; - - op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(value.fp4x2[13], type_convert(scale), 0); - ret[26] = op[1]; - ret[27] = op[0]; - - op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(value.fp4x2[14], type_convert(scale), 0); - ret[28] = op[1]; - ret[29] = op[0]; - - op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(value.fp4x2[15], type_convert(scale), 0); - ret[30] = op[1]; - ret[31] = op[0]; + ck::static_for<0, 32 / 2, 1>{}([&](auto idx) { + op = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(value.fp4x2[idx], f_scale, 0); + // permute high bits and low bits to match the order of the original vector + ret[2 * idx] = op[1]; + ret[2 * idx + 1] = op[0]; + }); return ret; #else