Merge commit '054f85ab7c0fa07a90968e834899ec415af8b713' into develop

This commit is contained in:
assistant-librarian[bot]
2025-07-07 17:07:08 +00:00
parent 7a78fb644d
commit f8ee69963d
18 changed files with 578 additions and 95 deletions

View File

@@ -54,6 +54,8 @@ namespace device {
*
* Conditions for achieving computational load balancing on different hardware platforms can vary.
*
* \tparam KPerBlock The number of elements in the K dimension that each block processes (multiply by packed_size_v to get the actual KPerBlock).
*
* Serialized version of the algorithm:
* \code
* // E = A * B + C
@@ -117,7 +119,7 @@ template <typename ALayout,
index_t BlockSize, // Thread block size
index_t MPerBlock,
index_t NPerBlock,
index_t KPerBlock,
index_t KPerBlock, // multiply with packed_size_v to get the actual KPerBlock
index_t AK1,
index_t BK1,
index_t MPerXDL,