Use single-threaded tensor generator (#161)

[ROCm/composable_kernel commit: f015c77687]
2026-05-21 21:39:15 +00:00 · 2022-03-31 11:28:30 +08:00
parent 297ef9795d
commit 8bb6c6e120
20 changed files with 26 additions and 22 deletions
--- a/profiler/include/profile_batched_gemm_impl.hpp
+++ b/profiler/include/profile_batched_gemm_impl.hpp
@@ -103,7 +103,7 @@ bool profile_batched_gemm_impl(int do_verification,
    std::cout << "b_g_k_n: " << b_g_k_n.mDesc << std::endl;
    std::cout << "c_g_m_n: " << c_g_m_n_host_result.mDesc << std::endl;

-    std::size_t num_thread = std::thread::hardware_concurrency();
+    std::size_t num_thread = 1;
    switch(init_method)
    {
    case 0: break;
--- a/profiler/include/profile_gemm_bias_2d_impl.hpp
+++ b/profiler/include/profile_gemm_bias_2d_impl.hpp
@@ -98,7 +98,7 @@ void profile_gemm_bias_2d_impl(int do_verification,
    std::cout << "c0_m_n: " << c0_m_n.mDesc << std::endl;
    std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;

-    std::size_t num_thread = std::thread::hardware_concurrency();
+    std::size_t num_thread = 1;
    switch(init_method)
    {
    case 0: break;
--- a/profiler/include/profile_gemm_bias_relu_impl.hpp
+++ b/profiler/include/profile_gemm_bias_relu_impl.hpp
@@ -83,7 +83,7 @@ void profile_gemm_bias_relu_impl(int do_verification,
    std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
    std::cout << "c0_n: " << c0_n.mDesc << std::endl;

-    std::size_t num_thread = std::thread::hardware_concurrency();
+    std::size_t num_thread = 1;
    switch(init_method)
    {
    case 0: break;
--- a/profiler/include/profile_gemm_impl.hpp
+++ b/profiler/include/profile_gemm_impl.hpp
@@ -120,7 +120,7 @@ void profile_gemm_impl(int do_verification,
    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
    std::cout << "c_m_n: " << c_m_n_device_result.mDesc << std::endl;

-    std::size_t num_thread = std::thread::hardware_concurrency();
+    std::size_t num_thread = 1;
    switch(init_method)
    {
    case 0: break;
@@ -408,6 +408,10 @@ void profile_gemm_impl(int do_verification,

        if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
        {
+            // re-init C to zero before profiling next kernel
+            c_m_n_device_result.GenerateTensorValue(GeneratorTensor_0<CDataType>{}, num_thread);
+            c_device_buf.ToDevice(c_m_n_device_result.mData.data());
+
            std::string gemm_name = gemm_ptr->GetTypeString();

            float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
--- a/profiler/include/profile_gemm_reduce_impl.hpp
+++ b/profiler/include/profile_gemm_reduce_impl.hpp
@@ -98,7 +98,7 @@ bool profile_gemm_reduce_impl(int do_verification,
    std::cout << "d0_m: " << d0_m_host_result.mDesc << std::endl;
    std::cout << "d1_m: " << d1_m_host_result.mDesc << std::endl;

-    std::size_t num_thread = std::thread::hardware_concurrency();
+    std::size_t num_thread = 1;
    switch(init_method)
    {
    case 0: break;
--- a/profiler/include/profile_grouped_gemm_impl.hpp
+++ b/profiler/include/profile_grouped_gemm_impl.hpp
@@ -95,7 +95,7 @@ void profile_grouped_gemm_impl(int do_verification,
                  << "]:" << b_k_n[i].mDesc << ", c_m_n_device_results[" << i
                  << "]:" << c_m_n_device_results[i].mDesc << std::endl;

-        std::size_t num_thread = std::thread::hardware_concurrency();
+        std::size_t num_thread = 1;
        switch(init_method)
        {
        case 0: break;
--- a/profiler/include/profile_reduce_impl.hpp
+++ b/profiler/include/profile_reduce_impl.hpp
@@ -242,7 +242,7 @@ void profile_reduce_impl_impl(bool do_verification,
        size_t invariant_total_length = out.mDesc.GetElementSize();
        size_t reduce_total_length    = in.mDesc.GetElementSize() / invariant_total_length;

-        std::size_t num_thread = std::thread::hardware_concurrency();
+        std::size_t num_thread = 1;

        if(do_verification)
        {