// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT /** * \brief Test that verifies availability of >256 VGPRs by running a kernel that uses large number * of VGPRs (~800) to perform a simple matrix multiplication operation. * * The test runs the kernel with different matrix sizes (8x8 and 16x16 per thread). The kernel * stores the input matrices in VGPRs, performs matrix multiplication, and writes the result back to * global memory. The host code verifies the correctness of the result by comparing it with a * reference implementation. * * \note This example must be compiled with the following flag to see the resource allocations: * "-Rpass-analysis=kernel-resource-usage" * * On gfx1200 with ROCm 6.4.1, the kernel will show register spilling due to limited VGPRs: * \verbatim * 8x8 matrix per thread: * SGPRs: 8 * VGPRs: 105 * ScratchSize [bytes/lane]: 0 * Dynamic Stack: False * Occupancy [waves/SIMD]: 12 * SGPRs Spill: 0 * VGPRs Spill: 0 * LDS Size [bytes/block]: 0 * * 16x16 matrix per thread: * SGPRs: 36 * VGPRs: 256 * ScratchSize [bytes/lane]: 54144 * Dynamic Stack: False * Occupancy [waves/SIMD]: 5 * SGPRs Spill: 0 * VGPRs Spill: 3771 * LDS Size [bytes/block]: 0 * \endverbatim * * On gfx1250, the test will not show register spilling due to increased VGPRs: * \verbatim * 8x8 matrix per thread: * TotalSGPRs: 8 * VGPRs: 135 * ScratchSize [bytes/lane]: 0 * Dynamic Stack: False * Occupancy [waves/SIMD]: 7 * SGPRs Spill: 0 * VGPRs Spill: 0 * LDS Size [bytes/block]: 0 * * 16x16 matrix per thread: * TotalSGPRs: 44 * VGPRs: 787 * ScratchSize [bytes/lane]: 50304 * Dynamic Stack: False * Occupancy [waves/SIMD]: 1 * SGPRs Spill: 0 * VGPRs Spill: 0 * LDS Size [bytes/block]: 0 * \endverbatim * * \note The register allocations above can be influenced by compiler version and code * changes/optimizations. */ #include "gtest/gtest.h" #include "ck/host_utility/hip_check_error.hpp" #include "ck/utility/common_header.hpp" using namespace std; template __global__ void __launch_bounds__(64, 1) test_largevgpr(const float* a, const float* b, float* c) { // store data in VGPRs typedef float mat_t __attribute__((ext_vector_type(MatSize * MatSize))); int num = hipThreadIdx_x; mat_t mata; mat_t matb; mat_t matc(0.0f); const float* p_a = a + num * MatSize * MatSize; const float* p_b = b + num * MatSize * MatSize; float* p_c = c + num * MatSize * MatSize; for(uint32_t i = 0; i < MatSize; i++) { for(uint32_t j = 0; j < MatSize; j++) { mata[i * MatSize + j] = *(p_a + i * MatSize + j); matb[i * MatSize + j] = *(p_b + i * MatSize + j); } } for(uint32_t i = 0; i < MatSize; i++) { for(uint32_t j = 0; j < MatSize; j++) { for(uint32_t k = 0; k < MatSize; k++) { matc[i * MatSize + j] += mata[i * MatSize + k] * matb[k * MatSize + j]; } } } for(uint32_t i = 0; i < MatSize; i++) { for(uint32_t j = 0; j < MatSize; j++) { *(p_c + i * MatSize + j) = matc[i * MatSize + j]; } } } template void verify_largevgpr() { std::vector a; std::vector b; std::vector c; std::vector ref; a.resize(MatSize * MatSize * 32); b.resize(MatSize * MatSize * 32); c.resize(MatSize * MatSize * 32); ref.resize(MatSize * MatSize * 32); constexpr int max_value = 7; constexpr int min_value = -7; for(size_t i = 0; i < MatSize * MatSize * 32; i++) { a[i] = static_cast((std::rand() % (max_value - min_value)) + min_value); b[i] = static_cast((std::rand() % (max_value - min_value)) + min_value); ref[i] = 0; } for(uint32_t t = 0; t < 32; t++) { const float* p_a = a.data() + t * MatSize * MatSize; const float* p_b = b.data() + t * MatSize * MatSize; float* p_ref = ref.data() + t * MatSize * MatSize; for(uint32_t i = 0; i < MatSize; i++) { for(uint32_t j = 0; j < MatSize; j++) { for(uint32_t k = 0; k < MatSize; k++) { *(p_ref + i * MatSize + j) += *(p_a + i * MatSize + k) * *(p_b + k * MatSize + j); } } } } float* device_a; float* device_b; float* device_c; HIP_CHECK_ERROR(hipMalloc(reinterpret_cast(&device_a), a.size() * sizeof(float))); HIP_CHECK_ERROR(hipMalloc(reinterpret_cast(&device_b), b.size() * sizeof(float))); HIP_CHECK_ERROR(hipMalloc(reinterpret_cast(&device_c), c.size() * sizeof(float))); HIP_CHECK_ERROR(hipMemcpy(device_a, a.data(), a.size() * sizeof(float), hipMemcpyHostToDevice)); HIP_CHECK_ERROR(hipMemcpy(device_b, b.data(), b.size() * sizeof(float), hipMemcpyHostToDevice)); hipLaunchKernelGGL( test_largevgpr, dim3(1), dim3(32), 0, nullptr, device_a, device_b, device_c); HIP_CHECK_ERROR(hipMemcpy(c.data(), device_c, c.size() * sizeof(float), hipMemcpyDeviceToHost)); bool pass = true; for(size_t i = 0; i < MatSize * MatSize * 32; i++) { if(fabs(c[i] - ref[i]) > 0.0001) { pass = false; std::cout << "mismatch on index " << i << ": " << c[i] << " != " << ref[i] << std::endl; break; } } HIP_CHECK_ERROR(hipFree(device_a)); HIP_CHECK_ERROR(hipFree(device_b)); HIP_CHECK_ERROR(hipFree(device_c)); EXPECT_TRUE(pass); } TEST(GEMMVGPR, M8x8) { verify_largevgpr<8>(); } TEST(GEMMVGPR, M16x16) { verify_largevgpr<16>(); }