Improve fmha_bwd tests performance (#2376)

* Avoid passing indices (std::vector) by value to host tensor's operator()

Each access requires 2 allocations and copies of the vector.

* Remove 1 unneeded vector copy from the slowest part of fmha_bwd's verification

* Compute ds_hp_host_ref in parallel

This sequential ForEach is the slowest part of validation and it benefits
from parallel computation.

* Do not use ForEach for simple copy and conversion of large tensors

These tensors all have the same shape {nhead, real_seqlen_q, real_seqlen_k} and
can be copied/converted without complex computations of linear indices.
This commit is contained in:
Anton Gorenko
2025-06-24 20:45:24 +06:00
committed by GitHub
parent 87fdb368a7
commit 77123600ee
3 changed files with 29 additions and 33 deletions

View File

@@ -230,7 +230,7 @@ struct HostTensorDescriptor
* @param iss Vector containing the multi-dimensional indices
* @return The calculated linear offset as a size_t
*/
std::size_t GetOffsetFromMultiIndex(std::vector<std::size_t> iss) const
std::size_t GetOffsetFromMultiIndex(const std::vector<std::size_t>& iss) const
{
return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
}
@@ -540,9 +540,12 @@ struct HostTensor
return mData[GetOffsetFromMultiIndex(is...)];
}
T& operator()(std::vector<std::size_t> idx) { return mData[GetOffsetFromMultiIndex(idx)]; }
T& operator()(const std::vector<std::size_t>& idx)
{
return mData[GetOffsetFromMultiIndex(idx)];
}
const T& operator()(std::vector<std::size_t> idx) const
const T& operator()(const std::vector<std::size_t>& idx) const
{
return mData[GetOffsetFromMultiIndex(idx)];
}