From dbdb10573ef4bbb22bc7e7485dc841e91ff34eec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Wed, 15 Jan 2025 16:41:21 +0100 Subject: [PATCH] Add rounding for float to bf16 conversion as default (#1812) * Add rounding for float to bf16 conversion * Add bhalf test * Add inf test bhalf * Refactor * update cmake * Fixes [ROCm/composable_kernel commit: 7790e8c3f781ec86385c39b9346bdf6fef0a56d3] --- example/01_gemm/CMakeLists.txt | 3 - example/01_gemm/gemm_xdl_bf16_rtn.cpp | 53 --------- include/ck/ck.hpp | 5 +- include/ck/utility/type_convert.hpp | 103 +++++++----------- .../cpu/reference_fpAintB_gemm.hpp | 38 +------ .../cpu/reference_gemm.hpp | 19 +--- .../cpu/reference_gemm_multiple_d.hpp | 24 +--- test/data_type/CMakeLists.txt | 1 + test/data_type/test_bhalf.cpp | 48 ++++++++ 9 files changed, 104 insertions(+), 190 deletions(-) delete mode 100644 example/01_gemm/gemm_xdl_bf16_rtn.cpp create mode 100644 test/data_type/test_bhalf.cpp diff --git a/example/01_gemm/CMakeLists.txt b/example/01_gemm/CMakeLists.txt index d6df1514b8..77f15a213c 100755 --- a/example/01_gemm/CMakeLists.txt +++ b/example/01_gemm/CMakeLists.txt @@ -48,9 +48,6 @@ add_example_dependencies(example_gemm_xdl example_gemm_xdl_skip_b_lds_fp16) add_example_executable(example_gemm_xdl_bf16 gemm_xdl_bf16.cpp) add_example_dependencies(example_gemm_xdl example_gemm_xdl_bf16) -add_example_executable(example_gemm_xdl_bf16_rtn gemm_xdl_bf16_rtn.cpp) -add_example_dependencies(example_gemm_xdl example_gemm_xdl_bf16_rtn) - add_example_executable(example_gemm_xdl_int8 gemm_xdl_int8.cpp) add_example_dependencies(example_gemm_xdl example_gemm_xdl_int8) diff --git a/example/01_gemm/gemm_xdl_bf16_rtn.cpp b/example/01_gemm/gemm_xdl_bf16_rtn.cpp deleted file mode 100644 index 108c100cbd..0000000000 --- a/example/01_gemm/gemm_xdl_bf16_rtn.cpp +++ /dev/null @@ -1,53 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "common.hpp" - -#include "ck/utility/type_convert.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp" - -using ADataType = ck::bhalf_t; -using BDataType = ck::bhalf_t; -using CDataType = ck::bhalf_t; -using AccDataType = float; -using CShuffleDataType = float; - -using ALayout = Row; -using BLayout = Col; -using CLayout = Row; - -using AElementOp = PassThrough; -using BElementOp = PassThrough; -using CElementOp = ck::tensor_operation::element_wise::ConvertBF16RTN; - -static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; - -// clang-format off -using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle -// ######| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| -// ######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| -// ######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| -// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>; -// clang-format on - -using ReferenceGemmInstance = ck::tensor_operation::host:: - ReferenceGemm; - -using ReferenceComputeType = float; -using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm; - -#include "run_gemm_example.inc" - -int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); } diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp index 999eb0229c..db2cd1beaf 100644 --- a/include/ck/ck.hpp +++ b/include/ck/ck.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once @@ -155,6 +155,9 @@ CK_DECLARE_ENV_VAR_BOOL(CK_LOGGING) // LDS direct loads using inline assembly #define CK_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM 0 +// set rounding to nearest even as default for bf16 conversions +#define CK_USE_RNE_BF16_CONVERSION 1 + // set rounding to nearest even as default for f8 conversions #define CK_USE_SR_F8_CONVERSION 0 diff --git a/include/ck/utility/type_convert.hpp b/include/ck/utility/type_convert.hpp index 9120ce62ca..d4d3f83812 100644 --- a/include/ck/utility/type_convert.hpp +++ b/include/ck/utility/type_convert.hpp @@ -14,6 +14,41 @@ namespace ck { #define __gfx94__ #endif +// Declare a template function for bf16 conversion using RTN +template +__host__ __device__ constexpr Y bf16_convert_rtn(X x); + +// Convert fp32 to bf16 with RTN if higher precision is needed +template <> +inline __host__ __device__ constexpr bhalf_t bf16_convert_rtn(float x) +{ + // Nan check + if(x != x) + { + return uint16_t(0x7FC0); + } + + union + { + float fp32; + uint32_t int32; + } u = {x}; + + const uint32_t first_bf16_mantisa_bit = ((u.int32 >> 16) & 1); + constexpr uint32_t rounding_bias = uint32_t((1 << 15) - 1); + + return uint16_t((u.int32 + first_bf16_mantisa_bit + rounding_bias) >> 16); +} + +// convert fp16 to bfp16 via fp32 with RTN if higher precision is needed +template <> +inline __host__ __device__ constexpr bhalf_t bf16_convert_rtn(half_t x) +{ + float x_fp32 = static_cast(x); + + return bf16_convert_rtn(x_fp32); +} + // Convert X to Y, both X and Y are non-const data types. template (bhalf_t return u.fp32; } -// convert fp32 to bfp16 +// convert fp32 to bfp16, round to nearest even template <> inline __host__ __device__ constexpr bhalf_t type_convert(float x) { - union - { - float fp32; - uint32_t int32; - } u = {x}; - +#if CK_USE_RNE_BF16_CONVERSION + return bf16_convert_rtn(x); +#else return uint16_t(u.int32 >> 16); +#endif } // convert bfp16 to fp16 via fp32 @@ -615,60 +648,4 @@ inline __host__ __device__ void array_convert(Array& y, const Array } } -// Declare a template function for bf16 conversion using RTN -template -__host__ __device__ constexpr Y bf16_convert_rtn(X x); - -// Convert fp32 to bf16 with RTN if higher precision is needed -template <> -inline __host__ __device__ constexpr bhalf_t bf16_convert_rtn(float x) -{ - union - { - float fp32; - uint32_t int32; - } u = {x}; - - // When the exponent bits are not all 1s, then the value is zero, normal, - // or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus - // 1 if the least significant bit of the bfloat16 mantissa is 1 (odd). - // This causes the bfloat16's mantissa to be incremented by 1 if the 16 - // least significant bits of the float mantissa are greater than 0x8000, - // or if they are equal to 0x8000 and the least significant bit of the - // bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when - // the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already - // has the value 0x7f, then incrementing it causes it to become 0x00 and - // the exponent is incremented by one, which is the next higher FP value - // to the unrounded bfloat16 value. When the bfloat16 value is subnormal - // with an exponent of 0x00 and a mantissa of 0x7f, it may be rounded up - // to a normal value with an exponent of 0x01 and a mantissa of 0x00. 
- // When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F, - // incrementing it causes it to become an exponent of 0xFF and a mantissa - // of 0x00, which is Inf, the next higher value to the unrounded value. - bool flag0 = ~u.int32 & 0x7f800000; - - // When all of the exponent bits are 1, the value is Inf or NaN. - // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero - // mantissa bit. Quiet NaN is indicated by the most significant mantissa - // bit being 1. Signaling NaN is indicated by the most significant - // mantissa bit being 0 but some other bit(s) being 1. If any of the - // lower 16 bits of the mantissa are 1, we set the least significant bit - // of the bfloat16 mantissa, in order to preserve signaling NaN in case - // the bfloat16's mantissa bits are all 0. - bool flag1 = !flag0 && (u.int32 & 0xffff); - - u.int32 += flag0 ? 0x7fff + ((u.int32 >> 16) & 1) : 0; // Round to nearest, round to even - u.int32 |= flag1 ? 0x10000 : 0x0; // Preserve signaling NaN - - return uint16_t(u.int32 >> 16); -} - -// convert fp16 to bfp16 via fp32 with RTN if higher precision is needed -template <> -inline __host__ __device__ constexpr bhalf_t bf16_convert_rtn(half_t x) -{ - float x_fp32 = static_cast(x); - - return bf16_convert_rtn(x_fp32); -} } // namespace ck diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_fpAintB_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_fpAintB_gemm.hpp index ac392f0906..0417cfe2a2 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_fpAintB_gemm.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_fpAintB_gemm.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -73,39 +73,9 @@ struct ReferencefpAintBGemm : public device::BaseOperator ScaleDataType v_scale; ADataType v_converted_b; - // use PassThrough instead of ConvertBF16RTN for reference calculation - if constexpr(is_same_v) - { - ck::tensor_operation::element_wise::PassThrough{}(v_a, arg.a_m_k_(m, k)); - } - else - { - arg.a_element_op_(v_a, arg.a_m_k_(m, k)); - } - - // same for B matrix - if constexpr(is_same_v) - { - ck::tensor_operation::element_wise::PassThrough{}(v_b, arg.b_k_n_(k, n)); - } - else - { - arg.b_element_op_(v_b, arg.b_k_n_(k, n)); - } - - // same for scale matrix - if constexpr(is_same_v) - { - ck::tensor_operation::element_wise::PassThrough{}(v_scale, - arg.scale_k_n_(k, n)); - } - else - { - arg.b_element_op_(v_scale, arg.scale_k_n_(k, n)); - } + arg.a_element_op_(v_a, arg.a_m_k_(m, k)); + arg.b_element_op_(v_b, arg.b_k_n_(k, n)); + arg.b_element_op_(v_scale, arg.scale_k_n_(k, n)); v_converted_b = type_convert(v_b) * v_scale; v_acc += ck::type_convert(v_a) * diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp index 8dd5d086b3..7e2482807d 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once @@ -68,13 +68,7 @@ struct ReferenceGemm : public device::BaseOperator for(int k = 0; k < K; ++k) { - // use PassThrough instead of ConvertBF16RTN for reference calculation - if constexpr(is_same_v) - { - ck::tensor_operation::element_wise::PassThrough{}(v_a, arg.a_m_k_(m, k)); - } - else if constexpr(is_same_v) + if constexpr(is_same_v) { uint8_t i4x2 = arg.a_m_k_(m, k).data; int8_t i4 = 0; @@ -89,13 +83,8 @@ struct ReferenceGemm : public device::BaseOperator { arg.a_element_op_(v_a, arg.a_m_k_(m, k)); } - // same for B matrix - if constexpr(is_same_v) - { - ck::tensor_operation::element_wise::PassThrough{}(v_b, arg.b_k_n_(k, n)); - } - else if constexpr(is_same_v) + + if constexpr(is_same_v) { uint8_t i4x2 = arg.b_k_n_(k, n).data; int8_t i4 = 0; diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_multiple_d.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_multiple_d.hpp index 06b2285229..7f263db707 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_multiple_d.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_multiple_d.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -74,26 +74,8 @@ struct ReferenceGemmMultipleD : public device::BaseOperator for(int k = 0; k < K; ++k) { - // use PassThrough instead of ConvertBF16RTN for reference calculation - if constexpr(is_same_v) - { - ck::tensor_operation::element_wise::PassThrough{}(v_a, arg.a_m_k_(m, k)); - } - else - { - arg.a_element_op_(v_a, arg.a_m_k_(m, k)); - } - // same for B matrix - if constexpr(is_same_v) - { - ck::tensor_operation::element_wise::PassThrough{}(v_b, arg.b_k_n_(k, n)); - } - else - { - arg.b_element_op_(v_b, arg.b_k_n_(k, n)); - } + arg.a_element_op_(v_a, arg.a_m_k_(m, k)); + arg.b_element_op_(v_b, arg.b_k_n_(k, n)); v_acc += ck::type_convert(v_a) * ck::type_convert(v_b); diff --git a/test/data_type/CMakeLists.txt b/test/data_type/CMakeLists.txt index a9d3dad7f3..a0ba3ed974 100644 --- a/test/data_type/CMakeLists.txt +++ b/test/data_type/CMakeLists.txt @@ -49,3 +49,4 @@ if(result EQUAL 0) endif() add_gtest_executable(test_type_convert_const type_convert_const.cpp) +add_gtest_executable(test_bhalf test_bhalf.cpp) diff --git a/test/data_type/test_bhalf.cpp b/test/data_type/test_bhalf.cpp new file mode 100644 index 0000000000..cadd8c70cf --- /dev/null +++ b/test/data_type/test_bhalf.cpp @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "gtest/gtest.h" +#include "ck/utility/data_type.hpp" +#include "ck/utility/type_convert.hpp" + +using ck::bhalf_t; +using ck::type_convert; + +TEST(BHALF_T, Nan) +{ + const uint16_t binary_bhalf_nan = 0x7FC0; + const bhalf_t bhalf_nan = ck::bit_cast(binary_bhalf_nan); + EXPECT_EQ(bhalf_nan, type_convert(ck::NumericLimits::QuietNaN())); +} + +TEST(BHALF_T, Inf) +{ + const uint16_t binary_bhalf_inf = 0x7F80; + const bhalf_t bhalf_inf = ck::bit_cast(binary_bhalf_inf); + EXPECT_EQ(bhalf_inf, type_convert(ck::NumericLimits::Infinity())); +} + +TEST(BHALF_T, MantisaOverflow) +{ + const float abs_tol = std::pow(2, -7); + const uint32_t val = 0x81FFFFFF; + const float float_val = ck::bit_cast(val); + + ASSERT_NEAR(float_val, type_convert(type_convert(float_val)), abs_tol); +} + +TEST(BHALF_T, ExpOverflow) +{ + const uint32_t val = 0xFF800000; + const float float_val = ck::bit_cast(val); + ASSERT_EQ(type_convert(type_convert(float_val)), float_val); +} + +TEST(BHALF_T, MantisaExpOverflow) +{ + const uint32_t val = 0xFFFFFFFF; + const float float_val = ck::bit_cast(val); + + ASSERT_TRUE(std::isnan(float_val)); + ASSERT_TRUE(std::isnan(type_convert(type_convert(float_val)))); +}
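
The patch above replaces plain truncation with round-to-nearest-even (RNE) as the default float -> bf16 conversion, guarded by the new CK_USE_RNE_BF16_CONVERSION macro and exercised by test_bhalf.cpp. As a rough illustration of the bias-and-shift scheme that bf16_convert_rtn uses, here is a minimal standalone host-side sketch; the helper names float_to_bf16_rne and bf16_to_float are invented for this example and are not part of the CK headers, and it uses std::memcpy instead of CK's union-based bit view.

#include <cstdint>
#include <cstdio>
#include <cstring>

// Round-to-nearest-even fp32 -> bf16: add the kept mantissa's least significant
// bit plus 0x7FFF to the fp32 encoding, then drop the low 16 bits.
static uint16_t float_to_bf16_rne(float x)
{
    // NaN maps to a canonical quiet-NaN bf16 pattern, as in the patch.
    if(x != x)
    {
        return uint16_t(0x7FC0);
    }

    uint32_t bits;
    std::memcpy(&bits, &x, sizeof(bits)); // bit-exact view of the fp32 encoding

    // Bit 16 is the lowest fp32 mantissa bit that survives in bf16; including it
    // in the bias makes exact halfway values round toward the even neighbour.
    const uint32_t keep_lsb      = (bits >> 16) & 1u;
    const uint32_t rounding_bias = 0x7FFFu;

    return uint16_t((bits + keep_lsb + rounding_bias) >> 16);
}

static float bf16_to_float(uint16_t h)
{
    // bf16 reuses the fp32 exponent, so widening is just a 16-bit left shift.
    uint32_t bits = uint32_t(h) << 16;
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

int main()
{
    // 1.00390625f lies exactly halfway between the bf16 values 1.0 and 1.0078125;
    // a value just above that midpoint must round up to 1.0078125.
    const float halfway = 1.00390625f;
    const float above   = 1.005f;

    uint32_t above_bits;
    std::memcpy(&above_bits, &above, sizeof(above_bits));

    std::printf("rne(%.8f)   = %.8f\n", halfway, bf16_to_float(float_to_bf16_rne(halfway)));
    std::printf("rne(%.8f)   = %.8f\n", above, bf16_to_float(float_to_bf16_rne(above)));
    std::printf("trunc(%.8f) = %.8f\n", above, bf16_to_float(uint16_t(above_bits >> 16)));
    return 0;
}

Built with any C++11 host compiler, the sketch prints 1.0 for the exact midpoint (the tie stays on the even neighbour) and 1.0078125 for the value just above it, while truncating the same input, the pre-patch default, stays at 1.0; removing that systematic downward bias is what making RNE the default buys.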