mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-02 20:51:23 +00:00
Add Doxygen Documentation for HostTesnor, HostTensorDescriptor, DeviceMem, FillUniformDistribution (#2160)
* added documentation for HostTensorDescriptor * added documentation for DeviceMem and FillUniformDistribution * fixed merging error * fixed host_tensor_descriptor error * clang format
This commit is contained in:
@@ -20,10 +20,35 @@ __global__ void set_buffer_value(T* p, T x, uint64_t buffer_element_size)
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Container for storing data in GPU device memory
|
||||
* @brief Manages device memory allocation and host-device data transfers
|
||||
*
|
||||
* DeviceMem encapsulates GPU memory management operations using HIP runtime API.
|
||||
* It provides functionality for allocating device memory, transferring data between
|
||||
* host and device, and performing basic memory operations.
|
||||
*
|
||||
* Key features:
|
||||
* - Automatic memory allocation and deallocation
|
||||
* - Host-to-device and device-to-host data transfers
|
||||
* - Memory initialization operations
|
||||
* - Integration with HostTensor for simplified data handling
|
||||
*
|
||||
* Usage example:
|
||||
* ```
|
||||
* // Allocate device memory
|
||||
* BHostTensor<float> AHostData({256});
|
||||
* DeviceMem d_mem(BHostData.get_element_space_size_in_bytes());
|
||||
*
|
||||
* // Transfer data to device
|
||||
* HostTensor<float> AHostTensor({256});
|
||||
* d_mem.ToDevice(AHostData.data());
|
||||
*
|
||||
* // Retrieve data from device
|
||||
* HostTensor<float> ResultHostTensor({256});
|
||||
* d_mem.FromDevice(ResultHostTensor.data());
|
||||
* ```
|
||||
*/
|
||||
struct DeviceMem
|
||||
|
||||
{
|
||||
DeviceMem() : mpDeviceBuf(nullptr), mMemSize(0) {}
|
||||
DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
|
||||
@@ -163,8 +188,8 @@ struct DeviceMem
|
||||
}
|
||||
}
|
||||
|
||||
void* mpDeviceBuf;
|
||||
std::size_t mMemSize;
|
||||
void* mpDeviceBuf; ///< pointer to device buffer
|
||||
std::size_t mMemSize; ///< size of device buffer in bytes
|
||||
};
|
||||
|
||||
} // namespace ck_tile
|
||||
|
||||
@@ -17,13 +17,31 @@
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
/**
|
||||
* @brief Functor for filling a range with randomly generated values from a uniform distribution.
|
||||
*
|
||||
* This struct provides functionality to fill iterators or ranges with random values
|
||||
* generated from a uniform distribution. It supports both single-threaded and
|
||||
* multi-threaded operation.
|
||||
*
|
||||
* @tparam T The target type for the generated values.
|
||||
*
|
||||
* @note The multi-threaded implementation is not guaranteed to provide perfectly
|
||||
* distributed values across threads.
|
||||
*
|
||||
* @example
|
||||
*
|
||||
* // Direct usage without creating a separate variable:
|
||||
* ck_tile::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_host_tensor);
|
||||
*/
|
||||
template <typename T>
|
||||
struct FillUniformDistribution
|
||||
{
|
||||
float a_{-5.f};
|
||||
float b_{5.f};
|
||||
std::optional<uint32_t> seed_{11939};
|
||||
// ATTENTION: threaded does not guarantee the distribution between thread
|
||||
// ATTENTION: Whether to use multi-threading (note: not guaranteed to be perfectly distributed
|
||||
// across threads).
|
||||
bool threaded = false;
|
||||
|
||||
template <typename ForwardIter>
|
||||
|
||||
@@ -85,6 +85,19 @@ CK_TILE_HOST auto construct_f_unpack_args(F, T args)
|
||||
return construct_f_unpack_args_impl<F>(args, std::make_index_sequence<N>{});
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Descriptor for tensors in host memory.
|
||||
*
|
||||
* HostTensorDescriptor manages the shape (dimensions) and memory layout (strides)
|
||||
* of a tensor in host memory. It provides functionality to:
|
||||
* - Store tensor dimensions and strides
|
||||
* - Calculate default strides for contiguous memory layout
|
||||
* - Convert multi-dimensional indices to linear memory offsets
|
||||
* - Query tensor metadata (dimensions, element counts, etc.)
|
||||
*
|
||||
* The class supports both automatic stride calculation for contiguous memory layout
|
||||
* and custom strides for more complex memory patterns.
|
||||
*/
|
||||
struct HostTensorDescriptor
|
||||
{
|
||||
HostTensorDescriptor() = default;
|
||||
@@ -138,12 +151,35 @@ struct HostTensorDescriptor
|
||||
}
|
||||
|
||||
std::size_t get_num_of_dimension() const { return mLens.size(); }
|
||||
/**
|
||||
* @brief Calculates the total number of elements in the tensor.
|
||||
*
|
||||
* Computes the product of all dimension lengths to determine the
|
||||
* total element count in the tensor.
|
||||
*
|
||||
* @pre The lengths array (mLens) and strides array (mStrides) must have
|
||||
* the same size.
|
||||
*
|
||||
* @return The total number of elements in the tensor.
|
||||
*/
|
||||
std::size_t get_element_size() const
|
||||
{
|
||||
assert(mLens.size() == mStrides.size());
|
||||
return std::accumulate(
|
||||
mLens.begin(), mLens.end(), std::size_t{1}, std::multiplies<std::size_t>());
|
||||
}
|
||||
/**
|
||||
* @brief Calculates the total element space required for the tensor in memory.
|
||||
*
|
||||
* This method computes the minimum size of contiguous memory needed to store
|
||||
* all elements of the tensor, taking into account the tensor's dimensions and
|
||||
* strides. The calculation is based on the formula: 1 + max((length_i - 1) * stride_i)
|
||||
* across all dimensions.
|
||||
*
|
||||
* Dimensions with length 0 are skipped in this calculation.
|
||||
*
|
||||
* @return The size of the tensor's element space (number of elements).
|
||||
*/
|
||||
std::size_t get_element_space_size() const
|
||||
{
|
||||
std::size_t space = 1;
|
||||
@@ -165,6 +201,18 @@ struct HostTensorDescriptor
|
||||
|
||||
const std::vector<std::size_t>& get_strides() const { return mStrides; }
|
||||
|
||||
/**
|
||||
* @brief Calculates the linear offset from multi-dimensional indices.
|
||||
*
|
||||
* Converts a set of N-dimensional indices into a single linear offset by computing
|
||||
* the inner product of the indices with the tensor's strides.
|
||||
*
|
||||
* @tparam Is Parameter pack of index types (should be convertible to std::size_t)
|
||||
* @param is Variable number of indices, one for each dimension of the tensor
|
||||
* @return std::size_t Linear offset corresponding to the given multi-dimensional indices
|
||||
*
|
||||
* @pre The number of indices must match the number of dimensions in the tensor
|
||||
*/
|
||||
template <typename... Is>
|
||||
std::size_t GetOffsetFromMultiIndex(Is... is) const
|
||||
{
|
||||
@@ -173,6 +221,15 @@ struct HostTensorDescriptor
|
||||
return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Calculates the linear memory offset from a multi-dimensional index
|
||||
*
|
||||
* Computes the linear offset by performing an inner product between the provided
|
||||
* multi-dimensional indices and the tensor's strides.
|
||||
*
|
||||
* @param iss Vector containing the multi-dimensional indices
|
||||
* @return The calculated linear offset as a size_t
|
||||
*/
|
||||
std::size_t GetOffsetFromMultiIndex(std::vector<std::size_t> iss) const
|
||||
{
|
||||
return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
|
||||
@@ -194,8 +251,8 @@ struct HostTensorDescriptor
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector<std::size_t> mLens;
|
||||
std::vector<std::size_t> mStrides;
|
||||
std::vector<std::size_t> mLens; ///< Lengths of each dimension
|
||||
std::vector<std::size_t> mStrides; ///< Strides for each dimension
|
||||
};
|
||||
|
||||
template <typename New2Old>
|
||||
@@ -681,6 +738,24 @@ struct HostTensor
|
||||
Data mData;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Creates a host tensor descriptor with specified dimensions and layout
|
||||
*
|
||||
* Constructs a HostTensorDescriptor with appropriate strides based on whether the tensor
|
||||
* layout is row-major or column-major. This is determined via the compile-time template
|
||||
* parameter `is_row_major`.
|
||||
*
|
||||
* @tparam is_row_major Compile-time flag indicating if the layout is row-major (true) or
|
||||
* column-major (false)
|
||||
*
|
||||
* @param row Number of rows in the tensor
|
||||
* @param col Number of columns in the tensor
|
||||
* @param stride Stride between adjacent rows (for row-major) or columns (for column-major)
|
||||
*
|
||||
* @return HostTensorDescriptor with shape {row, col} and strides:
|
||||
* - For row-major: {stride, 1}
|
||||
* - For column-major: {1, stride}
|
||||
*/
|
||||
template <bool is_row_major>
|
||||
auto host_tensor_descriptor(std::size_t row,
|
||||
std::size_t col,
|
||||
@@ -698,6 +773,7 @@ auto host_tensor_descriptor(std::size_t row,
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride});
|
||||
}
|
||||
}
|
||||
|
||||
template <bool is_row_major>
|
||||
auto get_default_stride(std::size_t row,
|
||||
std::size_t col,
|
||||
@@ -718,5 +794,4 @@ auto get_default_stride(std::size_t row,
|
||||
else
|
||||
return stride;
|
||||
}
|
||||
|
||||
} // namespace ck_tile
|
||||
|
||||
Reference in New Issue
Block a user