Use single-threaded tensor generator (#161)

[ROCm/composable_kernel commit: f015c77687]
2026-05-20 21:09:08 +00:00 · 2022-03-31 11:28:30 +08:00
parent 297ef9795d
commit 8bb6c6e120
20 changed files with 26 additions and 22 deletions
--- a/library/include/ck/library/host_tensor/host_reduction.hpp
+++ b/library/include/ck/library/host_tensor/host_reduction.hpp
@@ -277,7 +277,7 @@ struct ReductionHost
                out_indices[dst_offset] = accuIndex;
            };

-            std::size_t num_thread = std::thread::hardware_concurrency();
+            std::size_t num_thread = 1;
            std::size_t work_per_thread =
                (invariant_dim_indexes.size() + num_thread - 1) / num_thread;

@@ -374,7 +374,7 @@ struct ReductionHost
                out_data[dst_offset] = type_convert<OutDataType>(accuVal);
            };

-            std::size_t num_thread = std::thread::hardware_concurrency();
+            std::size_t num_thread = 1;
            std::size_t work_per_thread =
                (invariant_dim_indexes.size() + num_thread - 1) / num_thread;

--- a/library/include/ck/library/host_tensor/host_tensor.hpp
+++ b/library/include/ck/library/host_tensor/host_tensor.hpp
@@ -163,7 +163,7 @@ struct ParallelTensorFunctor
        return indices;
    }

-    void operator()(std::size_t num_thread = std::thread::hardware_concurrency()) const
+    void operator()(std::size_t num_thread = 1) const
    {
        std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread;

@@ -213,7 +213,7 @@ struct Tensor
    Tensor(const HostTensorDescriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpace()) {}

    template <typename G>
-    void GenerateTensorValue(G g, std::size_t num_thread = std::thread::hardware_concurrency())
+    void GenerateTensorValue(G g, std::size_t num_thread = 1)
    {
        switch(mDesc.GetNumOfDimension())
        {