Improve fmha_bwd tests performance (#2376)

* Avoid passing indices (std::vector) by value to host tensor's operator(). Each access requires 2 allocations and copies of the vector. * Remove 1 unneeded vector copy from the slowest part of fmha_bwd's verification. * Compute ds_hp_host_ref in parallel. This sequential ForEach is the slowest part of validation and it benefits from parallel computation. * Do not use ForEach for simple copy and conversion of large tensors. These tensors all have the same shape {nhead, real_seqlen_q, real_seqlen_k} and can be copied/converted without complex computations of linear indices.
2026-04-19 22:39:03 +00:00 · 2025-06-24 20:45:24 +06:00
parent 87fdb368a7
commit 77123600ee
3 changed files with 29 additions and 33 deletions
--- a/include/ck/library/utility/host_tensor.hpp
+++ b/include/ck/library/utility/host_tensor.hpp
@@ -167,7 +167,7 @@ struct HostTensorDescriptor
        return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
    }

-    std::size_t GetOffsetFromMultiIndex(std::vector<std::size_t> iss) const
+    std::size_t GetOffsetFromMultiIndex(const std::vector<std::size_t>& iss) const
    {
        return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
    }
@@ -600,12 +600,12 @@ struct Tensor
                     ck::packed_size_v<ck::remove_cvref_t<T>>];
    }

-    T& operator()(std::vector<std::size_t> idx)
+    T& operator()(const std::vector<std::size_t>& idx)
    {
        return mData[mDesc.GetOffsetFromMultiIndex(idx) / ck::packed_size_v<ck::remove_cvref_t<T>>];
    }

-    const T& operator()(std::vector<std::size_t> idx) const
+    const T& operator()(const std::vector<std::size_t>& idx) const
    {
        return mData[mDesc.GetOffsetFromMultiIndex(idx) / ck::packed_size_v<ck::remove_cvref_t<T>>];
    }