Add Doxygen Documentation for HostTensor, HostTensorDescriptor, DeviceMem, FillUniformDistribution (#2160)

* added documentation for HostTensorDescriptor

* added documentation for DeviceMem and FillUniformDistribution

* fixed merging error

* fixed host_tensor_descriptor error

* clang format
This commit is contained in:
Aviral Goel
2025-05-21 12:34:30 -05:00
committed by GitHub
parent 990d645578
commit fa39c4e798
4 changed files with 139 additions and 7 deletions

View File

@@ -20,10 +20,35 @@ __global__ void set_buffer_value(T* p, T x, uint64_t buffer_element_size)
}
/**
* @brief Container for storing data in GPU device memory
* @brief Manages device memory allocation and host-device data transfers
*
* DeviceMem encapsulates GPU memory management operations using HIP runtime API.
* It provides functionality for allocating device memory, transferring data between
* host and device, and performing basic memory operations.
*
* Key features:
* - Automatic memory allocation and deallocation
* - Host-to-device and device-to-host data transfers
* - Memory initialization operations
* - Integration with HostTensor for simplified data handling
*
* Usage example:
* ```
* // Allocate device memory
* HostTensor<float> AHostTensor({256});
* DeviceMem d_mem(AHostTensor.get_element_space_size_in_bytes());
*
* // Transfer data to device
* d_mem.ToDevice(AHostTensor.data());
*
* // Retrieve data from device
* HostTensor<float> ResultHostTensor({256});
* d_mem.FromDevice(ResultHostTensor.data());
* ```
*/
struct DeviceMem
{
DeviceMem() : mpDeviceBuf(nullptr), mMemSize(0) {}
DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
@@ -163,8 +188,8 @@ struct DeviceMem
}
}
void* mpDeviceBuf;
std::size_t mMemSize;
void* mpDeviceBuf; ///< pointer to device buffer
std::size_t mMemSize; ///< size of device buffer in bytes
};
} // namespace ck_tile

View File

@@ -17,13 +17,31 @@
namespace ck_tile {
/**
* @brief Functor for filling a range with randomly generated values from a uniform distribution.
*
* This struct provides functionality to fill iterators or ranges with random values
* generated from a uniform distribution. It supports both single-threaded and
* multi-threaded operation.
*
* @tparam T The target type for the generated values.
*
* @note The multi-threaded implementation is not guaranteed to provide perfectly
* distributed values across threads.
*
* Example usage:
* @code
* // Direct usage without creating a separate variable:
* ck_tile::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_host_tensor);
* @endcode
*/
template <typename T>
struct FillUniformDistribution
{
float a_{-5.f};
float b_{5.f};
std::optional<uint32_t> seed_{11939};
// ATTENTION: threaded does not guarantee the distribution between thread
// ATTENTION: Whether to use multi-threading (note: not guaranteed to be perfectly distributed
// across threads).
bool threaded = false;
template <typename ForwardIter>

View File

@@ -85,6 +85,19 @@ CK_TILE_HOST auto construct_f_unpack_args(F, T args)
return construct_f_unpack_args_impl<F>(args, std::make_index_sequence<N>{});
}
/**
* @brief Descriptor for tensors in host memory.
*
* HostTensorDescriptor manages the shape (dimensions) and memory layout (strides)
* of a tensor in host memory. It provides functionality to:
* - Store tensor dimensions and strides
* - Calculate default strides for contiguous memory layout
* - Convert multi-dimensional indices to linear memory offsets
* - Query tensor metadata (dimensions, element counts, etc.)
*
* The class supports both automatic stride calculation for contiguous memory layout
* and custom strides for more complex memory patterns.
*/
struct HostTensorDescriptor
{
HostTensorDescriptor() = default;
@@ -138,12 +151,35 @@ struct HostTensorDescriptor
}
/// @brief Returns the number of dimensions, i.e. the number of entries stored in mLens.
std::size_t get_num_of_dimension() const { return mLens.size(); }
/**
 * @brief Returns how many elements the tensor holds.
 *
 * The result is the product of every entry of mLens; an empty mLens yields 1
 * (the multiplicative identity the fold starts from).
 *
 * @pre mLens and mStrides have the same number of entries (checked with assert).
 *
 * @return Product of all dimension lengths.
 */
std::size_t get_element_size() const
{
    assert(mLens.size() == mStrides.size());

    // Fold the lengths left-to-right, starting from 1.
    std::size_t element_count{1};
    for(auto it = mLens.begin(); it != mLens.end(); ++it)
    {
        element_count = element_count * (*it);
    }
    return element_count;
}
/**
* @brief Calculates the total element space required for the tensor in memory.
*
* This method computes the minimum size of contiguous memory needed to store
* all elements of the tensor, taking into account the tensor's dimensions and
* strides. The calculation is based on the formula: 1 + sum((length_i - 1) * stride_i)
* over all dimensions (e.g. lengths {2, 3} with strides {3, 1} give 1 + 3 + 2 = 6).
*
* Dimensions with length 0 are skipped in this calculation.
*
* @return The size of the tensor's element space (number of elements).
*/
std::size_t get_element_space_size() const
{
std::size_t space = 1;
@@ -165,6 +201,18 @@ struct HostTensorDescriptor
/// @brief Returns a read-only reference to the stored per-dimension strides (mStrides).
const std::vector<std::size_t>& get_strides() const { return mStrides; }
/**
* @brief Calculates the linear offset from multi-dimensional indices.
*
* Converts a set of N-dimensional indices into a single linear offset by computing
* the inner product of the indices with the tensor's strides.
*
* @tparam Is Parameter pack of index types (should be convertible to std::size_t)
* @param is Variable number of indices, one for each dimension of the tensor
* @return std::size_t Linear offset corresponding to the given multi-dimensional indices
*
* @pre The number of indices must match the number of dimensions in the tensor
*/
template <typename... Is>
std::size_t GetOffsetFromMultiIndex(Is... is) const
{
@@ -173,6 +221,15 @@ struct HostTensorDescriptor
return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
}
/**
* @brief Calculates the linear memory offset from a multi-dimensional index
*
* Computes the linear offset by performing an inner product between the provided
* multi-dimensional indices and the tensor's strides.
*
* @param iss Vector containing the multi-dimensional indices
* @return The calculated linear offset as a size_t
*/
std::size_t GetOffsetFromMultiIndex(std::vector<std::size_t> iss) const
{
return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
@@ -194,8 +251,8 @@ struct HostTensorDescriptor
}
private:
std::vector<std::size_t> mLens;
std::vector<std::size_t> mStrides;
std::vector<std::size_t> mLens; ///< Lengths of each dimension
std::vector<std::size_t> mStrides; ///< Strides for each dimension
};
template <typename New2Old>
@@ -681,6 +738,24 @@ struct HostTensor
Data mData;
};
/**
* @brief Creates a host tensor descriptor with specified dimensions and layout
*
* Constructs a HostTensorDescriptor with appropriate strides based on whether the tensor
* layout is row-major or column-major. This is determined via the compile-time template
* parameter `is_row_major`.
*
* @tparam is_row_major Compile-time flag indicating if the layout is row-major (true) or
* column-major (false)
*
* @param row Number of rows in the tensor
* @param col Number of columns in the tensor
* @param stride Stride between adjacent rows (for row-major) or columns (for column-major)
*
* @return HostTensorDescriptor with shape {row, col} and strides:
* - For row-major: {stride, 1}
* - For column-major: {1, stride}
*/
template <bool is_row_major>
auto host_tensor_descriptor(std::size_t row,
std::size_t col,
@@ -698,6 +773,7 @@ auto host_tensor_descriptor(std::size_t row,
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
}
template <bool is_row_major>
auto get_default_stride(std::size_t row,
std::size_t col,
@@ -718,5 +794,4 @@ auto get_default_stride(std::size_t row,
else
return stride;
}
} // namespace ck_tile