mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-19 22:39:03 +00:00
Improve fmha_bwd tests performance (#2376)
* Avoid passing indices (std::vector) by value to host tensor's operator()
Each access requires 2 allocations and copies of the vector.
* Remove 1 unneeded vector copy from the slowest part of fmha_bwd's verification
* Compute ds_hp_host_ref in parallel
This sequntial ForEach is the slowest part of validation and it benefits
from parallel computation.
* Do not use ForEach for simple copy and conversion of large tensors
These tensors all have the same shape {nhead, real_seqlen_q, real_seqlen_k} and
can be copied/converted without complex computations of linear indices.
This commit is contained in:
@@ -167,7 +167,7 @@ struct HostTensorDescriptor
|
||||
return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
|
||||
}
|
||||
|
||||
std::size_t GetOffsetFromMultiIndex(std::vector<std::size_t> iss) const
|
||||
std::size_t GetOffsetFromMultiIndex(const std::vector<std::size_t>& iss) const
|
||||
{
|
||||
return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
|
||||
}
|
||||
@@ -600,12 +600,12 @@ struct Tensor
|
||||
ck::packed_size_v<ck::remove_cvref_t<T>>];
|
||||
}
|
||||
|
||||
T& operator()(std::vector<std::size_t> idx)
|
||||
T& operator()(const std::vector<std::size_t>& idx)
|
||||
{
|
||||
return mData[mDesc.GetOffsetFromMultiIndex(idx) / ck::packed_size_v<ck::remove_cvref_t<T>>];
|
||||
}
|
||||
|
||||
const T& operator()(std::vector<std::size_t> idx) const
|
||||
const T& operator()(const std::vector<std::size_t>& idx) const
|
||||
{
|
||||
return mData[mDesc.GetOffsetFromMultiIndex(idx) / ck::packed_size_v<ck::remove_cvref_t<T>>];
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user