Use single-threaded tensor generator (#161)

[ROCm/composable_kernel commit: f015c77687]
2026-05-14 02:02:46 +00:00 · 2022-03-31 11:28:30 +08:00
parent 297ef9795d
commit 8bb6c6e120
20 changed files with 26 additions and 22 deletions
--- a/example/12_reduce/reduce_blockwise.cpp
+++ b/example/12_reduce/reduce_blockwise.cpp
@@ -261,7 +261,7 @@ int main(int argc, char* argv[])
    float alpha = args.scales[0];
    float beta  = args.scales[1];

-    std::size_t num_thread = std::thread::hardware_concurrency();
+    std::size_t num_thread = 1;

    if(args.do_verification)
    {
--- a/library/include/ck/library/host_tensor/host_reduction.hpp
+++ b/library/include/ck/library/host_tensor/host_reduction.hpp
@@ -277,7 +277,7 @@ struct ReductionHost
                out_indices[dst_offset] = accuIndex;
            };

-            std::size_t num_thread = std::thread::hardware_concurrency();
+            std::size_t num_thread = 1;
            std::size_t work_per_thread =
                (invariant_dim_indexes.size() + num_thread - 1) / num_thread;

@@ -374,7 +374,7 @@ struct ReductionHost
                out_data[dst_offset] = type_convert<OutDataType>(accuVal);
            };

-            std::size_t num_thread = std::thread::hardware_concurrency();
+            std::size_t num_thread = 1;
            std::size_t work_per_thread =
                (invariant_dim_indexes.size() + num_thread - 1) / num_thread;

--- a/library/include/ck/library/host_tensor/host_tensor.hpp
+++ b/library/include/ck/library/host_tensor/host_tensor.hpp
@@ -163,7 +163,7 @@ struct ParallelTensorFunctor
        return indices;
    }

-    void operator()(std::size_t num_thread = std::thread::hardware_concurrency()) const
+    void operator()(std::size_t num_thread = 1) const
    {
        std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread;

@@ -213,7 +213,7 @@ struct Tensor
    Tensor(const HostTensorDescriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpace()) {}

    template <typename G>
-    void GenerateTensorValue(G g, std::size_t num_thread = std::thread::hardware_concurrency())
+    void GenerateTensorValue(G g, std::size_t num_thread = 1)
    {
        switch(mDesc.GetNumOfDimension())
        {
--- a/library/src/obselete_driver_offline/conv_add_fwd_driver_offline_nchwc.cpp
+++ b/library/src/obselete_driver_offline/conv_add_fwd_driver_offline_nchwc.cpp
@@ -302,7 +302,7 @@ int main(int argc, char* argv[])
    print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
    print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));

-    std::size_t num_thread = std::thread::hardware_concurrency();
+    std::size_t num_thread = 1;

    switch(init_method)
    {
--- a/library/src/obselete_driver_offline/conv_bwd_driver_offline.cpp
+++ b/library/src/obselete_driver_offline/conv_bwd_driver_offline.cpp
@@ -317,7 +317,7 @@ int main(int argc, char* argv[])
    print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
    print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));

-    std::size_t num_thread = std::thread::hardware_concurrency();
+    std::size_t num_thread = 1;

    switch(init_method)
    {
--- a/library/src/obselete_driver_offline/conv_fwd_driver_offline.cpp
+++ b/library/src/obselete_driver_offline/conv_fwd_driver_offline.cpp
@@ -319,7 +319,7 @@ int main(int argc, char* argv[])
    print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
    print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));

-    std::size_t num_thread = std::thread::hardware_concurrency();
+    std::size_t num_thread = 1;

    switch(init_method)
    {
--- a/library/src/obselete_driver_offline/conv_fwd_driver_offline_nchwc.cpp
+++ b/library/src/obselete_driver_offline/conv_fwd_driver_offline_nchwc.cpp
@@ -282,7 +282,7 @@ int main(int argc, char* argv[])
    print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
    print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));

-    std::size_t num_thread = std::thread::hardware_concurrency();
+    std::size_t num_thread = 1;

    switch(init_method)
    {
--- a/library/src/obselete_driver_offline/conv_maxpool_fwd_driver_offline_nchwc.cpp
+++ b/library/src/obselete_driver_offline/conv_maxpool_fwd_driver_offline_nchwc.cpp
@@ -300,7 +300,7 @@ int main(int argc, char* argv[])
    print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
    print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));

-    std::size_t num_thread = std::thread::hardware_concurrency();
+    std::size_t num_thread = 1;

    switch(init_method)
    {
--- a/library/src/obselete_driver_offline/conv_wrw_driver_offline.cpp
+++ b/library/src/obselete_driver_offline/conv_wrw_driver_offline.cpp
@@ -289,7 +289,7 @@ int main(int argc, char* argv[])
    print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
    print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));

-    std::size_t num_thread = std::thread::hardware_concurrency();
+    std::size_t num_thread = 1;

    switch(init_method)
    {
--- a/library/src/obselete_driver_offline/gemm_driver_offline.cpp
+++ b/library/src/obselete_driver_offline/gemm_driver_offline.cpp
@@ -313,7 +313,7 @@ int main(int argc, char* argv[])
    ostream_HostTensorDescriptor(b.mDesc, std::cout << "b: ");
    ostream_HostTensorDescriptor(c_host.mDesc, std::cout << "c: ");

-    std::size_t num_thread = std::thread::hardware_concurrency();
+    std::size_t num_thread = 1;

    switch(init_method)
    {
--- a/profiler/include/profile_batched_gemm_impl.hpp
+++ b/profiler/include/profile_batched_gemm_impl.hpp
@@ -103,7 +103,7 @@ bool profile_batched_gemm_impl(int do_verification,
    std::cout << "b_g_k_n: " << b_g_k_n.mDesc << std::endl;
    std::cout << "c_g_m_n: " << c_g_m_n_host_result.mDesc << std::endl;

-    std::size_t num_thread = std::thread::hardware_concurrency();
+    std::size_t num_thread = 1;
    switch(init_method)
    {
    case 0: break;
--- a/profiler/include/profile_gemm_bias_2d_impl.hpp
+++ b/profiler/include/profile_gemm_bias_2d_impl.hpp
@@ -98,7 +98,7 @@ void profile_gemm_bias_2d_impl(int do_verification,
    std::cout << "c0_m_n: " << c0_m_n.mDesc << std::endl;
    std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;

-    std::size_t num_thread = std::thread::hardware_concurrency();
+    std::size_t num_thread = 1;
    switch(init_method)
    {
    case 0: break;
--- a/profiler/include/profile_gemm_bias_relu_impl.hpp
+++ b/profiler/include/profile_gemm_bias_relu_impl.hpp
@@ -83,7 +83,7 @@ void profile_gemm_bias_relu_impl(int do_verification,
    std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
    std::cout << "c0_n: " << c0_n.mDesc << std::endl;

-    std::size_t num_thread = std::thread::hardware_concurrency();
+    std::size_t num_thread = 1;
    switch(init_method)
    {
    case 0: break;
--- a/profiler/include/profile_gemm_impl.hpp
+++ b/profiler/include/profile_gemm_impl.hpp
@@ -120,7 +120,7 @@ void profile_gemm_impl(int do_verification,
    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
    std::cout << "c_m_n: " << c_m_n_device_result.mDesc << std::endl;

-    std::size_t num_thread = std::thread::hardware_concurrency();
+    std::size_t num_thread = 1;
    switch(init_method)
    {
    case 0: break;
@@ -408,6 +408,10 @@ void profile_gemm_impl(int do_verification,

        if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
        {
+            // re-init C to zero before profiling next kernel
+            c_m_n_device_result.GenerateTensorValue(GeneratorTensor_0<CDataType>{}, num_thread);
+            c_device_buf.ToDevice(c_m_n_device_result.mData.data());
+
            std::string gemm_name = gemm_ptr->GetTypeString();

            float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
--- a/profiler/include/profile_gemm_reduce_impl.hpp
+++ b/profiler/include/profile_gemm_reduce_impl.hpp
@@ -98,7 +98,7 @@ bool profile_gemm_reduce_impl(int do_verification,
    std::cout << "d0_m: " << d0_m_host_result.mDesc << std::endl;
    std::cout << "d1_m: " << d1_m_host_result.mDesc << std::endl;

-    std::size_t num_thread = std::thread::hardware_concurrency();
+    std::size_t num_thread = 1;
    switch(init_method)
    {
    case 0: break;
--- a/profiler/include/profile_grouped_gemm_impl.hpp
+++ b/profiler/include/profile_grouped_gemm_impl.hpp
@@ -95,7 +95,7 @@ void profile_grouped_gemm_impl(int do_verification,
                  << "]:" << b_k_n[i].mDesc << ", c_m_n_device_results[" << i
                  << "]:" << c_m_n_device_results[i].mDesc << std::endl;

-        std::size_t num_thread = std::thread::hardware_concurrency();
+        std::size_t num_thread = 1;
        switch(init_method)
        {
        case 0: break;
--- a/profiler/include/profile_reduce_impl.hpp
+++ b/profiler/include/profile_reduce_impl.hpp
@@ -242,7 +242,7 @@ void profile_reduce_impl_impl(bool do_verification,
        size_t invariant_total_length = out.mDesc.GetElementSize();
        size_t reduce_total_length    = in.mDesc.GetElementSize() / invariant_total_length;

-        std::size_t num_thread = std::thread::hardware_concurrency();
+        std::size_t num_thread = 1;

        if(do_verification)
        {
--- a/test/gemm_split_k/gemm_split_k.cpp
+++ b/test/gemm_split_k/gemm_split_k.cpp
@@ -120,7 +120,7 @@ int test_gemm(const gemmArgs& args)
        f_host_tensor_descriptor(args.M, args.N, args.StrideC, c_row_major));

    // init data
-    std::size_t num_thread = std::thread::hardware_concurrency();
+    std::size_t num_thread = 1;
    a_m_k.GenerateTensorValue(GeneratorTensor_2<float>{-5, 5}, num_thread);
    b_k_n.GenerateTensorValue(GeneratorTensor_2<float>{-5, 5}, num_thread);
    // set zero to c_device_buf
--- a/test/reduce/reduce_no_index.cpp
+++ b/test/reduce/reduce_no_index.cpp
@@ -101,7 +101,7 @@ bool test_reduce_no_index_impl(int init_method,
    size_t invariant_total_length = out.mDesc.GetElementSize();
    size_t reduce_total_length    = in.mDesc.GetElementSize() / invariant_total_length;

-    std::size_t num_thread = std::thread::hardware_concurrency();
+    std::size_t num_thread = 1;

    switch(init_method)
    {
--- a/test/reduce/reduce_with_index.cpp
+++ b/test/reduce/reduce_with_index.cpp
@@ -99,7 +99,7 @@ bool test_reduce_with_index_impl(int init_method,
    size_t invariant_total_length = out.mDesc.GetElementSize();
    size_t reduce_total_length    = in.mDesc.GetElementSize() / invariant_total_length;

-    std::size_t num_thread = std::thread::hardware_concurrency();
+    std::size_t num_thread = 1;

    switch(init_method)
    {