From 15624fd6ea7cd7165865a8eef17269ac930bb410 Mon Sep 17 00:00:00 2001 From: ZheWang Date: Wed, 17 Dec 2025 07:01:42 +0000 Subject: [PATCH] add hip test --- .../ck_tile/18_flatmm/mxgemm/mx_flatmm.cpp | 12 +- hip_test.cpp | 219 ++++++++++++++++++ include/ck_tile/core/numeric/pk_fp6.hpp | 2 +- 3 files changed, 226 insertions(+), 7 deletions(-) create mode 100644 hip_test.cpp diff --git a/example/ck_tile/18_flatmm/mxgemm/mx_flatmm.cpp b/example/ck_tile/18_flatmm/mxgemm/mx_flatmm.cpp index 2c12119651..4ba11e0ed2 100644 --- a/example/ck_tile/18_flatmm/mxgemm/mx_flatmm.cpp +++ b/example/ck_tile/18_flatmm/mxgemm/mx_flatmm.cpp @@ -205,7 +205,6 @@ auto preShuffleWeight(ck_tile::HostTensor& src) int outputIndex = n0 * KPack * NLane * KLane * K0 + k0 * KPack * NLane * KLane + k1 * KPack * NLane + n1 * KPack + k2; - std::cout << k << " " << n << " " << outputIndex << std::endl; shuffled(outputIndex) = src(k, n); } } @@ -350,9 +349,9 @@ int main(int argc, char* argv[]) // using BDataType = ck_tile::pk_fp4_t; using BDataType = ck_tile::f6x16_pk_t; - ck_tile::index_t stride_B = 0; ck_tile::index_t N = 32; - ck_tile::index_t K = 256; + ck_tile::index_t K = 128; + ck_tile::index_t stride_B = 0; stride_B = ck_tile::get_default_stride(K, N, stride_B, ck_tile::bool_constant{}); // is_row_major ck_tile::HostTensor b_origin_host( @@ -366,8 +365,9 @@ int main(int argc, char* argv[]) { for(int k_ = 0; k_ < pack_k; k_++) { - int value = n * K + k + k_; - b_origin_host(n, k).pack(value, k_); + int value = rand() & 0x3f; + std::cout << value << std::endl; + b_origin_host(k, n).pack(value, k_); } } } @@ -377,7 +377,7 @@ int main(int argc, char* argv[]) { for(int k_ = 0; k_ < pack_k; k_++) { - std::cout << b_origin_host(n, k).unpack(k_) << std::endl; + std::cout << b_origin_host(k, n).unpack(k_) << std::endl; } } } diff --git a/hip_test.cpp b/hip_test.cpp new file mode 100644 index 0000000000..f97c1663b4 --- /dev/null +++ b/hip_test.cpp @@ -0,0 +1,219 @@ +#include + +#include +#include + +// clang-format off +// /opt/rocm/llvm/bin/clang++ -O3 -x hip --save-temps --offload-arch=gfx950 -o test-f8f4 test-f8f4.cpp && ./test-f8f4 +// clang-format on + +#define HIP_CHECK(call) \ + do \ + { \ + hipError_t err = call; \ + if(err != hipSuccess) \ + { \ + printf("HIP error %s:%d: '%s'\n", __FILE__, __LINE__, hipGetErrorString(err)); \ + exit(1); \ + } \ + } while(0) + +using fp16_t = _Float16; + +template +struct pk_f6_t +{ + static constexpr int num_bits_elem = 6; + using element_type = uint32_t; // element storage fundamental type + static constexpr int packed_size = pk_size; + static constexpr int num_bits_vec_elem = sizeof(element_type) * 8; // 32-bit uint for storage + static_assert((packed_size * num_bits_elem) % num_bits_vec_elem == 0, + "Packed elements must fit exactly into the element storage."); + static constexpr int vector_size = (packed_size * num_bits_elem) / num_bits_vec_elem; + // using storage_type = element_type __attribute__((ext_vector_type(vector_size))); + // storage_type data_{storage_type(0)}; // packed data + element_type data_[vector_size]; // packed data + using type = pk_f6_t; + void pack(const uint32_t x, const int i) + { + uint32_t bits = static_cast(x) & 0x3F; + const int bit_pos = i * num_bits_elem; + const int arr_index = bit_pos / num_bits_vec_elem; + const int bit_offset = bit_pos % num_bits_vec_elem; + const int overhang = bit_offset + num_bits_elem - num_bits_vec_elem; + uint32_t old_value = data_[arr_index]; + + // insert bits into the current 32-bit block + old_value |= (bits << bit_offset); + data_[arr_index] = old_value; + + // if it crosses into the next block, shift the remainder + if(overhang > 0 && (arr_index + 1) < vector_size) + { + uint32_t next_value = data_[arr_index + 1]; + next_value |= (bits >> (num_bits_elem - overhang)); + data_[arr_index + 1] = next_value; + } + } + + template + static inline uint32_t unpack(const type& pk, const int i) + { + const int bit_pos = i * num_bits_elem; + const int arr_idx = bit_pos / num_bits_vec_elem; + const int bit_offset = bit_pos % num_bits_vec_elem; + const int overhang = bit_offset + num_bits_elem - num_bits_vec_elem; + + uint32_t bits = pk.data_[arr_idx] >> bit_offset; + if(overhang > 0 && (arr_idx + 1) < vector_size) + { + bits |= (pk.data_[arr_idx + 1] & ((1u << overhang) - 1)) << (num_bits_elem - overhang); + } + + return bits & 0x3F; + } + + inline uint32_t unpack(const int i) const { return unpack(*this, i); } + + static float fp6_e2m3_to_float(uint32_t fp6_bits) + { + fp6_bits = fp6_bits & 0x3F; + + uint32_t sign = (fp6_bits >> 5) & 0x1; // bit 5 + uint32_t exponent = (fp6_bits >> 3) & 0x3; // bits 4-3 + uint32_t mantissa = fp6_bits & 0x7; // bits 2-0 + + float result; + if(exponent == 0 && mantissa == 0) + { + result = 0.f; + } + else if(exponent != 0) + { + result = std::pow(2, exponent - 1); + float mantissa_value = 1.0f + mantissa / 8.0f; + result *= mantissa_value; + } + else + { + result = mantissa / 8.0f; + } + return sign == 1 ? -1 * result : result; + } +}; + +using f6x16_pk_t = pk_f6_t<16>; + +__global__ void kernel1(const int32_t* a, const int32_t* b, float* c) +{ + const int l = threadIdx.x; + using i32x8_t = int32_t __attribute__((ext_vector_type(8))); + int k_dim_offset = l / 16 * 6; + int mn_dim_offset = l % 16; + int total_k_dim_dw_size = 128 * 6 / 8 / 4; + int thr_base_offset = mn_dim_offset * total_k_dim_dw_size + k_dim_offset; + // clang-format off + i32x8_t a_vec{a[thr_base_offset],a[thr_base_offset+1],a[thr_base_offset+2],a[thr_base_offset+3],a[thr_base_offset+4],a[thr_base_offset+5],0,0}; + i32x8_t b_vec{b[thr_base_offset],b[thr_base_offset+1],b[thr_base_offset+2],b[thr_base_offset+3],b[thr_base_offset+4],b[thr_base_offset+5],0,0}; + // clang-format on + + // printf("thread_idx: %d, base_offset: %d, value: %d %d %d %d %d %d\n", + // l, + // thr_base_offset, + // a[thr_base_offset], + // a[thr_base_offset + 1], + // a[thr_base_offset + 2], + // a[thr_base_offset + 3], + // a[thr_base_offset + 4], + // a[thr_base_offset + 5]); + + using fp32x4_t = float __attribute__((ext_vector_type(4))); + fp32x4_t c_vec{0}; + c_vec = + __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a_vec, b_vec, c_vec, 2, 2, 0, 127, 0, 127); + + // printf("thread_idx: %d, base_offset: %d, float value: %f %f %f %f\n", + // l, + // thr_base_offset, + // c_vec[0], + // c_vec[1], + // c_vec[2], + // c_vec[3]); + int c_m = l % 16; + int c_n = l / 16 * 4; + c[c_m * 16 + c_n + 0] = c_vec[0], c[c_m * 16 + c_n + 1] = c_vec[1]; + c[c_m * 16 + c_n + 2] = c_vec[2], c[c_m * 16 + c_n + 3] = c_vec[3]; +} + +int main(int argc, char const* argv[]) +{ + + f6x16_pk_t h_a[16 * (128 / 16)]; + f6x16_pk_t h_b[16 * (128 / 16)]; + + float ref_a[16 * 128]; + float ref_b[16 * 128]; + std::vector h_c(16 * 16); + std::vector h_cc(16 * 16); + + for(int i = 0; i < 16; i++) + { + for(int j = 0; j < 128; j += 16) + { + for(int k = 0; k < 16; k++) + { + uint32_t value = rand() & 0x3f; + h_a[i * (128 / 16) + j / 16].pack(value, k); + h_b[i * (128 / 16) + j / 16].pack(value, k); + ref_a[i * 128 + j + k] = f6x16_pk_t::fp6_e2m3_to_float(value); + ref_b[i * 128 + j + k] = f6x16_pk_t::fp6_e2m3_to_float(value); + // std::cout << ref_a[i * 128 + j + k] << "vs" + // << f6x16_pk_t::fp6_e2m3_to_float(h_a[i * (128 / 16) + j / + // 16].unpack(k)) + // << std::endl; + } + } + } + + for(int m = 0; m < 16; m++) + { + for(int n = 0; n < 16; n++) + { + h_c[m * 16 + n] = 0; + for(int k = 0; k < 128; k++) + { + h_c[m * 16 + n] += ref_a[m * 128 + k] * ref_b[n * 128 + k]; + } + // std::cout << h_c[m * 16 + n] << " "; + } + // std::cout << std::endl; + } + + int32_t* d_a; + int32_t* d_b; + float* d_c; + + HIP_CHECK(hipMalloc(&d_a, 16 * 128 / 16 * sizeof(f6x16_pk_t))); + HIP_CHECK(hipMalloc(&d_b, 16 * 128 / 16 * sizeof(f6x16_pk_t))); + HIP_CHECK(hipMalloc(&d_c, 16 * 16 * sizeof(float))); + + HIP_CHECK(hipMemcpy(d_a, h_a, 16 * 128 / 16 * sizeof(f6x16_pk_t), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_b, h_b, 16 * 128 / 16 * sizeof(f6x16_pk_t), hipMemcpyHostToDevice)); + HIP_CHECK(hipMemset(d_c, 0, 16 * 16 * sizeof(float))); + + kernel1<<<1, 64>>>(d_a, d_b, d_c); + HIP_CHECK(hipGetLastError()); + + HIP_CHECK(hipMemcpy(h_cc.data(), d_c, 16 * 16 * sizeof(float), hipMemcpyDeviceToHost)); + + HIP_CHECK(hipFree(d_a)); + HIP_CHECK(hipFree(d_b)); + HIP_CHECK(hipFree(d_c)); + + for(int i = 0; i < 16 * 16; i++) + { + std::cout << h_c[i] << "vs" << static_cast(h_cc[i]) << std::endl; + // printf("%d: %f\n", i, static_cast(h_c[i])); + } + return 0; +} \ No newline at end of file diff --git a/include/ck_tile/core/numeric/pk_fp6.hpp b/include/ck_tile/core/numeric/pk_fp6.hpp index a38f286729..51ad6798f2 100644 --- a/include/ck_tile/core/numeric/pk_fp6.hpp +++ b/include/ck_tile/core/numeric/pk_fp6.hpp @@ -22,7 +22,7 @@ struct pk_f6_t static constexpr index_t vector_size = (packed_size * num_bits_elem) / num_bits_vec_elem; // using storage_type = element_type __attribute__((ext_vector_type(vector_size))); // storage_type data_{storage_type(0)}; // packed data - element_type data_[3]; // packed data + element_type data_[vector_size]; // packed data using type = pk_f6_t; void pack(const uint32_t x, const index_t i) {