diff --git a/include/ck/utility/scaled_type_convert.hpp b/include/ck/utility/scaled_type_convert.hpp index 099debc3d4..9a9c53caec 100644 --- a/include/ck/utility/scaled_type_convert.hpp +++ b/include/ck/utility/scaled_type_convert.hpp @@ -379,8 +379,7 @@ inline __host__ __device__ float2_t scaled_type_convert(e8m0_b value.f4x2_array[0] = x; float2_t tmp = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(value.bitwise, type_convert(scale), 0); - // intrinsic packs vector as {element1, element0}, so we should repack it as {element0, - // element1} + // permute high bits and low bits to match the order of the original vector return float2_t{tmp[1], tmp[0]}; #else float2_t ret{utils::to_float( diff --git a/include/ck/utility/type_convert.hpp b/include/ck/utility/type_convert.hpp index 5f94a56636..cda607bb30 100644 --- a/include/ck/utility/type_convert.hpp +++ b/include/ck/utility/type_convert.hpp @@ -734,6 +734,7 @@ inline __host__ __device__ f4x2_t f4_convert_rne(float2_t x, float scale = 1.0f) uint32_t bitwise; f4x2_t f4x2_array[4]; } value{0}; + // permute high bits and low bits to match the order of the original vector value.bitwise = __builtin_amdgcn_cvt_scalef32_pk_fp4_f32(value.bitwise, x[1], x[0], scale, 0); return value.f4x2_array[0]; #else @@ -824,34 +825,7 @@ inline __host__ __device__ f4x2_t f4_convert_sr(float2_t x, float scale = 1.0f) uint32_t bitwise; f4x2_t f4x2_array[4]; } value{0}; - printf("%f, %f\n", x[0], x[1]); - value.bitwise = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f32( - value.bitwise, float2_t{x[1], x[0]}, rng, scale, 0); - return value.f4x2_array[0]; -#else - union - { - uint32_t bitwise; - f4x2_t f4x2_array[4]; - } value{0}; - uint8_t l = utils::sat_convert_to_type_sr(x[1] / scale, rng); - uint8_t h = utils::sat_convert_to_type_sr(x[0] / scale, rng); - value.bitwise = (h << 4) | l; - return value.f4x2_array[0]; -#endif -} - -// convert vector of 2 fp32 to vector of 2 fp4 with sr -inline __host__ __device__ f4x2_t f4_convert_sr_repro(float2_t x, float scale = 1.0f) -{ - constexpr int seed = 1254739; - uint32_t rng = prand_generator(reinterpret_cast(&x), x[0]); -#if defined(__gfx950__) - union - { - uint32_t bitwise; - f4x2_t f4x2_array[4]; - } value{0}; + // permute high bits and low bits to match the order of the original vector value.bitwise = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f32( value.bitwise, float2_t{x[1], x[0]}, rng, scale, 0); return value.f4x2_array[0]; @@ -981,13 +955,15 @@ inline __host__ __device__ float2_t type_convert(f4x2_t x) } value{}; value.f4x2_array[0] = x; float scale = 1.0f; - return __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(value.bitwise, scale, 0); + float2_t tmp = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(value.bitwise, scale, 0); + // permute high bits and low bits to match the order of the original vector + return float2_t{tmp[1], tmp[0]}; #else float2_t ret{ utils::to_float(NumericLimits::Binary_1(), - x.template AsType()[Number<0>{}].unpack<>(Number<1>{})), + x.template AsType()[Number<0>{}].unpack<>(Number<0>{})), utils::to_float(NumericLimits::Binary_1(), - x.template AsType()[Number<0>{}].unpack<>(Number<0>{}))}; + x.template AsType()[Number<0>{}].unpack<>(Number<1>{}))}; return ret; #endif } diff --git a/test/data_type/CMakeLists.txt b/test/data_type/CMakeLists.txt index 1fe005e2f5..8a0f631b39 100644 --- a/test/data_type/CMakeLists.txt +++ b/test/data_type/CMakeLists.txt @@ -81,12 +81,6 @@ if(GPU_TARGETS MATCHES "gfx950") endif() add_dependencies(test_mx_data_types test_mx_fp4) - add_gtest_executable(test_mx_fp4_repro test_mx_fp4_repro.cpp) - if(result EQUAL 0) - target_link_libraries(test_mx_fp4_repro PRIVATE utility) - endif() - add_dependencies(test_mx_data_types test_mx_fp4_repro) - add_gtest_executable(test_e8m0 test_e8m0.cpp) if(result EQUAL 0) target_link_libraries(test_e8m0 PRIVATE utility) diff --git a/test/data_type/test_mx_fp4_repro.cpp b/test/data_type/test_mx_fp4_repro.cpp deleted file mode 100644 index f705b774c4..0000000000 --- a/test/data_type/test_mx_fp4_repro.cpp +++ /dev/null @@ -1,142 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. - -#include "gtest/gtest.h" -#include "ck/library/utility/device_memory.hpp" -#include "ck/utility/scaled_type_convert.hpp" - -using ck::e8m0_bexp_t; -using ck::float16_t; -using ck::float2_t; -using ck::float32_t; -using ck::scaled_type_convert; -using ck::type_convert; - -using ck::f4_convert_rne; -using ck::f4_convert_sr; -using ck::f4_t; -using ck::f4x16_t; -using ck::f4x2_pk_t; -using ck::f4x2_t; -using ck::f4x32_t; - -__host__ __device__ void test_mx_fp4_to_fp32(float* p_test) -{ - /// Test vector conversions - // f4x2 -> f32x2 - f4x2_t f4x2{f4x2_t::data_v{0b00011100}}; // 0b0001(=0.5) and 0b1100(=-2.0) - auto scale2 = e8m0_bexp_t(2.0f); - - float2_t f32x2 = scaled_type_convert(scale2, f4x2); - p_test[0] = f32x2[0]; - p_test[1] = f32x2[1]; -} - -__global__ void run_test_mx_fp4_to_fp32(float* p_test) { test_mx_fp4_to_fp32(p_test); } - -__host__ __device__ void test_mx_fp32_to_fp4_rne(float* p_test) -{ - // f32x2 -> f4x2 - float2_t f32x2 = {1.0f, -4.0f}; - auto scale2 = e8m0_bexp_t(2.0f); - f4x2_t f4x2 = f4_convert_rne(f32x2, type_convert(scale2)); // expect {0.5, -2} - - p_test[0] = type_convert( - f4_t(f4x2.AsType()(ck::Number<0>{}).unpack<>(ck::Number<0>{}))); // 0.5f - p_test[1] = type_convert( - f4_t(f4x2.AsType()(ck::Number<0>{}).unpack<>(ck::Number<1>{}))); // -2.0f -} - -__global__ void run_test_mx_fp32_to_fp4_rne(float* p_test) { test_mx_fp32_to_fp4_rne(p_test); } - -__host__ __device__ void test_mx_fp32_to_fp4_sr(float* p_test) -{ - float2_t f32x2 = {1.0f, -4.0f}; - auto scale2 = e8m0_bexp_t(2.0f); - f4x2_t f4x2 = f4_convert_sr(f32x2, type_convert(scale2)); // expect {0.5, -2} - - p_test[0] = type_convert( - f4_t(f4x2.AsType()(ck::Number<0>{}).unpack<>(ck::Number<0>{}))); // 0.5f - p_test[1] = type_convert( - f4_t(f4x2.AsType()(ck::Number<0>{}).unpack<>(ck::Number<1>{}))); // -2.0f -} - -__global__ void run_test_mx_fp32_to_fp4_sr(float* p_test) { test_mx_fp32_to_fp4_sr(p_test); } - -__host__ __device__ void test_mx_fp32_to_fp4_sr_failing(float* p_test) -{ - float2_t f32x2 = {1.0f, -4.0f}; - auto scale2 = e8m0_bexp_t(2.0f); - f4x2_t f4x2 = ck::f4_convert_sr_repro(f32x2, type_convert(scale2)); // expect {0.5, -2} - - p_test[0] = type_convert( - f4_t(f4x2.AsType()(ck::Number<0>{}).unpack<>(ck::Number<0>{}))); // 0.5f - p_test[1] = type_convert( - f4_t(f4x2.AsType()(ck::Number<0>{}).unpack<>(ck::Number<1>{}))); // -2.0f -} - -__global__ void run_test_mx_fp32_to_fp4_sr_failing(float* p_test) -{ - test_mx_fp32_to_fp4_sr_failing(p_test); -} - -TEST(MXFP4, FP4ToFP32) -{ - std::vector out(2, -1.0f); - - DeviceMem device_out(2 * sizeof(float)); - - run_test_mx_fp4_to_fp32<<<1, 1>>>(static_cast(device_out.GetDeviceBuffer())); - - device_out.FromDevice(out.data()); - - // f4x2 -> f32x2 - EXPECT_EQ(out[0], 1.0f); - EXPECT_EQ(out[1], -4.0f); -} - -TEST(MXFP4, FP32ToFP4RNE) -{ - std::vector out(2, -1.0f); - - DeviceMem device_out(2 * sizeof(float)); - - run_test_mx_fp32_to_fp4_rne<<<1, 1>>>(static_cast(device_out.GetDeviceBuffer())); - - device_out.FromDevice(out.data()); - - // f32x2 -> f4x2 - // RNE - EXPECT_EQ(out[0], 0.5f); - EXPECT_EQ(out[1], -2.0f); -} - -TEST(MXFP4, FP32ToFP4SR) -{ - std::vector out(2, -1.0f); - - DeviceMem device_out(2 * sizeof(float)); - - run_test_mx_fp32_to_fp4_sr<<<1, 1>>>(static_cast(device_out.GetDeviceBuffer())); - - device_out.FromDevice(out.data()); - - // SR - EXPECT_EQ(out[0], 0.5f); - EXPECT_EQ(out[1], -2.0f); -} - -TEST(MXFP4, FP32ToFP4SRFailing) -{ - std::vector out(2, -1.0f); - - DeviceMem device_out(2 * sizeof(float)); - - run_test_mx_fp32_to_fp4_sr_failing<<<1, 1>>>(static_cast(device_out.GetDeviceBuffer())); - - device_out.FromDevice(out.data()); - - // SR - EXPECT_EQ(out[0], 0.5f); - EXPECT_EQ(out[1], -2.0f); -}