// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT #pragma once #include #include #include #include "ck_tile/host/hip_check_error.hpp" #include "ck_tile/host/host_tensor.hpp" namespace ck_tile { template __global__ void set_buffer_value(T* p, T x, uint64_t buffer_element_size) { for(uint64_t i = threadIdx.x; i < buffer_element_size; i += blockDim.x) { p[i] = x; } } /** * @brief Manages device memory allocation and host-device data transfers * * DeviceMem encapsulates GPU memory management operations using HIP runtime API. * It provides functionality for allocating device memory, transferring data between * host and device, and performing basic memory operations. * * Key features: * - Automatic memory allocation and deallocation * - Host-to-device and device-to-host data transfers * - Memory initialization operations * - Integration with HostTensor for simplified data handling * * Usage example: * ``` * // Allocate device memory * BHostTensor AHostData({256}); * DeviceMem d_mem(BHostData.get_element_space_size_in_bytes()); * * // Transfer data to device * HostTensor AHostTensor({256}); * d_mem.ToDevice(AHostData.data()); * * // Retrieve data from device * HostTensor ResultHostTensor({256}); * d_mem.FromDevice(ResultHostTensor.data()); * ``` */ struct DeviceMem { DeviceMem() : mpDeviceBuf(nullptr), mMemSize(0) {} DeviceMem(std::size_t mem_size) : mMemSize(mem_size) { if(mMemSize != 0) { HIP_CHECK_ERROR(hipMalloc(static_cast(&mpDeviceBuf), mMemSize)); } else { mpDeviceBuf = nullptr; } } template DeviceMem(const HostTensor& t) : mMemSize(t.get_element_space_size_in_bytes()) { if(mMemSize != 0) { HIP_CHECK_ERROR(hipMalloc(static_cast(&mpDeviceBuf), mMemSize)); } else { mpDeviceBuf = nullptr; } ToDevice(t.data()); } void Realloc(std::size_t mem_size) { if(mpDeviceBuf) { HIP_CHECK_ERROR(hipFree(mpDeviceBuf)); } mMemSize = mem_size; if(mMemSize != 0) { HIP_CHECK_ERROR(hipMalloc(static_cast(&mpDeviceBuf), mMemSize)); } else { mpDeviceBuf = nullptr; } } void* GetDeviceBuffer() const { return mpDeviceBuf; } std::size_t GetBufferSize() const { return mMemSize; } void ToDevice(const void* p) const { if(mpDeviceBuf) { HIP_CHECK_ERROR( hipMemcpy(mpDeviceBuf, const_cast(p), mMemSize, hipMemcpyHostToDevice)); } // else // { // throw std::runtime_error("ToDevice with an empty pointer"); // } } void ToDevice(const void* p, const std::size_t cpySize) const { if(mpDeviceBuf) { HIP_CHECK_ERROR( hipMemcpy(mpDeviceBuf, const_cast(p), cpySize, hipMemcpyHostToDevice)); } } void FromDevice(void* p) const { if(mpDeviceBuf) { HIP_CHECK_ERROR(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost)); } // else // { // throw std::runtime_error("FromDevice with an empty pointer"); // } } void FromDevice(void* p, const std::size_t cpySize) const { if(mpDeviceBuf) { HIP_CHECK_ERROR(hipMemcpy(p, mpDeviceBuf, cpySize, hipMemcpyDeviceToHost)); } } // construct a host tensor with type T template HostTensor ToHost(std::size_t cpySize) { // TODO: host tensor could be slightly larger than the device tensor // we just copy all data from GPU buffer std::size_t host_elements = (cpySize + sizeof(T) - 1) / sizeof(T); HostTensor h_({host_elements}); if(mpDeviceBuf) { HIP_CHECK_ERROR(hipMemcpy(h_.data(), mpDeviceBuf, cpySize, hipMemcpyDeviceToHost)); } return h_; } template HostTensor ToHost() { return ToHost(mMemSize); } void SetZero() const { if(mpDeviceBuf) { HIP_CHECK_ERROR(hipMemset(mpDeviceBuf, 0, mMemSize)); } } template void SetValue(T x) const { if(mpDeviceBuf) { if(mMemSize % sizeof(T) != 0) { throw std::runtime_error("wrong! not entire DeviceMem will be set"); } // TODO: call a gpu kernel to set the value (?) set_buffer_value<<<1, 1024>>>(static_cast(mpDeviceBuf), x, mMemSize / sizeof(T)); } } ~DeviceMem() { if(mpDeviceBuf) { try { HIP_CHECK_ERROR(hipFree(mpDeviceBuf)); } catch(std::runtime_error& re) { std::cerr << re.what() << std::endl; } } } void* mpDeviceBuf; ///< pointer to device buffer std::size_t mMemSize; ///< size of device buffer in bytes }; } // namespace ck_tile