diff --git a/include/ck/library/utility/fill.hpp b/include/ck/library/utility/fill.hpp
index 35625d142e..4f421b4282 100644
--- a/include/ck/library/utility/fill.hpp
+++ b/include/ck/library/utility/fill.hpp
@@ -85,6 +85,20 @@ struct FillUniformDistributionIntegerValue
     }
 };
 
+/**
+ * @brief A functor for filling a container with a monotonically increasing or decreasing sequence.
+ *
+ * FillMonotonicSeq generates a sequence of values starting from an initial value
+ * and incrementing by a fixed step for each subsequent element.
+ *
+ * @tparam T The numeric type of the sequence elements.
+ *
+ * Example usage:
+ * ```
+ * std::vector<int> v(5);
+ * FillMonotonicSeq<int>{10, 2}(v); // Fills v with {10, 12, 14, 16, 18}
+ * ```
+ */
 template <typename T>
 struct FillMonotonicSeq
 {
diff --git a/include/ck_tile/host/device_memory.hpp b/include/ck_tile/host/device_memory.hpp
index 13684c0e24..587f38987e 100644
--- a/include/ck_tile/host/device_memory.hpp
+++ b/include/ck_tile/host/device_memory.hpp
@@ -20,10 +20,35 @@ __global__ void set_buffer_value(T* p, T x, uint64_t buffer_element_size)
 }
 
 /**
- * @brief Container for storing data in GPU device memory
+ * @brief Manages device memory allocation and host-device data transfers
  *
+ * DeviceMem encapsulates GPU memory management operations using the HIP runtime API.
+ * It provides functionality for allocating device memory, transferring data between
+ * host and device, and performing basic memory operations.
+ *
+ * Key features:
+ * - Automatic memory allocation and deallocation
+ * - Host-to-device and device-to-host data transfers
+ * - Memory initialization operations
+ * - Integration with HostTensor for simplified data handling
+ *
+ * Usage example:
+ * ```
+ * // Create a host tensor and allocate device memory
+ * // of the same size in bytes
+ * HostTensor<float> host_data({256});
+ * DeviceMem d_mem(host_data.get_element_space_size_in_bytes());
+ *
+ * // Transfer data from host to device
+ * d_mem.ToDevice(host_data.data());
+ *
+ * // Retrieve data from device into another host tensor
+ * HostTensor<float> result({256});
+ * d_mem.FromDevice(result.data());
+ * ```
 */
 struct DeviceMem
+
 {
     DeviceMem() : mpDeviceBuf(nullptr), mMemSize(0) {}
     DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
@@ -163,8 +188,8 @@ struct DeviceMem
         }
     }
 
-    void* mpDeviceBuf;
-    std::size_t mMemSize;
+    void* mpDeviceBuf;    ///< pointer to device buffer
+    std::size_t mMemSize; ///< size of device buffer in bytes
 };
 
 } // namespace ck_tile
diff --git a/include/ck_tile/host/fill.hpp b/include/ck_tile/host/fill.hpp
index 3f64eb28cd..4a359e031f 100644
--- a/include/ck_tile/host/fill.hpp
+++ b/include/ck_tile/host/fill.hpp
@@ -17,13 +17,31 @@
 
 namespace ck_tile {
 
+/**
+ * @brief Functor for filling a range with randomly generated values from a uniform distribution.
+ *
+ * This struct provides functionality to fill iterators or ranges with random values
+ * generated from a uniform distribution. It supports both single-threaded and
+ * multi-threaded operation.
+ *
+ * @tparam T The target type for the generated values.
+ *
+ * @note The multi-threaded implementation is not guaranteed to provide perfectly
+ * distributed values across threads.
+ *
+ * Example (direct usage without creating a separate variable):
+ * @code
+ * ck_tile::FillUniformDistribution<float>{-1.f, 1.f}(a_host_tensor);
+ * @endcode
+ */
 template <typename T>
 struct FillUniformDistribution
 {
     float a_{-5.f};
     float b_{5.f};
     std::optional<uint32_t> seed_{11939};
-    // ATTENTION: threaded does not guarantee the distribution between thread
+    // ATTENTION: Whether to use multi-threading (note: not guaranteed to be perfectly distributed
+    // across threads).
     bool threaded = false;
 
     template <typename ForwardIter>
diff --git a/include/ck_tile/host/host_tensor.hpp b/include/ck_tile/host/host_tensor.hpp
index a43877c6da..deaa158d50 100644
--- a/include/ck_tile/host/host_tensor.hpp
+++ b/include/ck_tile/host/host_tensor.hpp
@@ -85,6 +85,19 @@ CK_TILE_HOST auto construct_f_unpack_args(F, T args)
     return construct_f_unpack_args_impl(args, std::make_index_sequence<std::tuple_size_v<T>>{});
 }
 
+/**
+ * @brief Descriptor for tensors in host memory.
+ *
+ * HostTensorDescriptor manages the shape (dimensions) and memory layout (strides)
+ * of a tensor in host memory. It provides functionality to:
+ * - Store tensor dimensions and strides
+ * - Calculate default strides for contiguous memory layout
+ * - Convert multi-dimensional indices to linear memory offsets
+ * - Query tensor metadata (dimensions, element counts, etc.)
+ *
+ * The class supports both automatic stride calculation for contiguous memory layout
+ * and custom strides for more complex memory patterns.
+ */
 struct HostTensorDescriptor
 {
     HostTensorDescriptor() = default;
@@ -138,12 +151,35 @@ struct HostTensorDescriptor
     }
 
     std::size_t get_num_of_dimension() const { return mLens.size(); }
+    /**
+     * @brief Calculates the total number of elements in the tensor.
+     *
+     * Computes the product of all dimension lengths to determine the
+     * total element count in the tensor.
+     *
+     * @pre The lengths array (mLens) and strides array (mStrides) must have
+     * the same size.
+     *
+     * @return The total number of elements in the tensor.
+     */
     std::size_t get_element_size() const
     {
         assert(mLens.size() == mStrides.size());
         return std::accumulate(
             mLens.begin(), mLens.end(), std::size_t{1}, std::multiplies<std::size_t>());
     }
+    /**
+     * @brief Calculates the total element space required for the tensor in memory.
+     *
+     * This method computes the minimum size of contiguous memory needed to store
+     * all elements of the tensor, taking into account the tensor's dimensions and
+     * strides. The calculation is based on the formula: 1 + sum((length_i - 1) * stride_i)
+     * taken across all dimensions.
+     *
+     * Dimensions with length 0 are skipped in this calculation.
+     *
+     * @return The size of the tensor's element space (number of elements).
+     */
     std::size_t get_element_space_size() const
     {
         std::size_t space = 1;
@@ -165,6 +201,18 @@ struct HostTensorDescriptor
 
     const std::vector<std::size_t>& get_strides() const { return mStrides; }
 
+    /**
+     * @brief Calculates the linear offset from multi-dimensional indices.
+     *
+     * Converts a set of N-dimensional indices into a single linear offset by computing
+     * the inner product of the indices with the tensor's strides.
+     *
+     * @tparam Is Parameter pack of index types (should be convertible to std::size_t)
+     * @param is Variable number of indices, one for each dimension of the tensor
+     * @return std::size_t Linear offset corresponding to the given multi-dimensional indices
+     *
+     * @pre The number of indices must match the number of dimensions in the tensor
+     */
     template <typename... Is>
     std::size_t GetOffsetFromMultiIndex(Is... is) const
     {
@@ -173,6 +221,15 @@ struct HostTensorDescriptor
         return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
     }
 
+    /**
+     * @brief Calculates the linear memory offset from a multi-dimensional index
+     *
+     * Computes the linear offset by performing an inner product between the provided
+     * multi-dimensional indices and the tensor's strides.
+     *
+     * @param iss Vector containing the multi-dimensional indices
+     * @return The calculated linear offset as a size_t
+     */
     std::size_t GetOffsetFromMultiIndex(std::vector<std::size_t> iss) const
     {
         return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
     }
@@ -194,8 +251,8 @@ struct HostTensorDescriptor
     }
 
 private:
-    std::vector<std::size_t> mLens;
-    std::vector<std::size_t> mStrides;
+    std::vector<std::size_t> mLens;    ///< Lengths of each dimension
+    std::vector<std::size_t> mStrides; ///< Strides for each dimension
 };
 
 template
@@ -681,6 +738,24 @@ struct HostTensor
     Data mData;
 };
 
+/**
+ * @brief Creates a host tensor descriptor with specified dimensions and layout
+ *
+ * Constructs a HostTensorDescriptor with appropriate strides based on whether the tensor
+ * layout is row-major or column-major. This is determined via the compile-time template
+ * parameter `is_row_major`.
+ *
+ * @tparam is_row_major Compile-time flag indicating if the layout is row-major (true) or
+ * column-major (false)
+ *
+ * @param row Number of rows in the tensor
+ * @param col Number of columns in the tensor
+ * @param stride Stride between adjacent rows (for row-major) or columns (for column-major)
+ *
+ * @return HostTensorDescriptor with shape {row, col} and strides:
+ *         - For row-major: {stride, 1}
+ *         - For column-major: {1, stride}
+ */
 template <bool is_row_major>
 auto host_tensor_descriptor(std::size_t row,
                             std::size_t col,
@@ -698,6 +773,7 @@ auto host_tensor_descriptor(std::size_t row,
         return HostTensorDescriptor({row, col}, {1_uz, stride});
     }
 }
+
 template
 auto get_default_stride(std::size_t row,
                         std::size_t col,
@@ -718,5 +794,4 @@ auto get_default_stride(std::size_t row,
     else
         return stride;
 }
-
 } // namespace ck_tile