diff --git a/example/12_reduce/reduce_blockwise.cpp b/example/12_reduce/reduce_blockwise.cpp index e41a961103..b97799203b 100644 --- a/example/12_reduce/reduce_blockwise.cpp +++ b/example/12_reduce/reduce_blockwise.cpp @@ -261,7 +261,7 @@ int main(int argc, char* argv[]) float alpha = args.scales[0]; float beta = args.scales[1]; - std::size_t num_thread = std::thread::hardware_concurrency(); + std::size_t num_thread = 1; if(args.do_verification) { diff --git a/library/include/ck/library/host_tensor/host_reduction.hpp b/library/include/ck/library/host_tensor/host_reduction.hpp index fe9fba6121..4cc8f3fefd 100644 --- a/library/include/ck/library/host_tensor/host_reduction.hpp +++ b/library/include/ck/library/host_tensor/host_reduction.hpp @@ -277,7 +277,7 @@ struct ReductionHost out_indices[dst_offset] = accuIndex; }; - std::size_t num_thread = std::thread::hardware_concurrency(); + std::size_t num_thread = 1; std::size_t work_per_thread = (invariant_dim_indexes.size() + num_thread - 1) / num_thread; @@ -374,7 +374,7 @@ struct ReductionHost out_data[dst_offset] = type_convert(accuVal); }; - std::size_t num_thread = std::thread::hardware_concurrency(); + std::size_t num_thread = 1; std::size_t work_per_thread = (invariant_dim_indexes.size() + num_thread - 1) / num_thread; diff --git a/library/include/ck/library/host_tensor/host_tensor.hpp b/library/include/ck/library/host_tensor/host_tensor.hpp index 443e0f9e4c..17ecd4a9fb 100644 --- a/library/include/ck/library/host_tensor/host_tensor.hpp +++ b/library/include/ck/library/host_tensor/host_tensor.hpp @@ -163,7 +163,7 @@ struct ParallelTensorFunctor return indices; } - void operator()(std::size_t num_thread = std::thread::hardware_concurrency()) const + void operator()(std::size_t num_thread = 1) const { std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread; @@ -213,7 +213,7 @@ struct Tensor Tensor(const HostTensorDescriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpace()) {} template - void GenerateTensorValue(G g, std::size_t num_thread = std::thread::hardware_concurrency()) + void GenerateTensorValue(G g, std::size_t num_thread = 1) { switch(mDesc.GetNumOfDimension()) { diff --git a/library/src/obselete_driver_offline/conv_add_fwd_driver_offline_nchwc.cpp b/library/src/obselete_driver_offline/conv_add_fwd_driver_offline_nchwc.cpp index d818f3c950..9c09936a3b 100644 --- a/library/src/obselete_driver_offline/conv_add_fwd_driver_offline_nchwc.cpp +++ b/library/src/obselete_driver_offline/conv_add_fwd_driver_offline_nchwc.cpp @@ -302,7 +302,7 @@ int main(int argc, char* argv[]) print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w)); print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w)); - std::size_t num_thread = std::thread::hardware_concurrency(); + std::size_t num_thread = 1; switch(init_method) { diff --git a/library/src/obselete_driver_offline/conv_bwd_driver_offline.cpp b/library/src/obselete_driver_offline/conv_bwd_driver_offline.cpp index 7082f1050c..f350f7f071 100644 --- a/library/src/obselete_driver_offline/conv_bwd_driver_offline.cpp +++ b/library/src/obselete_driver_offline/conv_bwd_driver_offline.cpp @@ -317,7 +317,7 @@ int main(int argc, char* argv[]) print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w)); print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w)); - std::size_t num_thread = std::thread::hardware_concurrency(); + std::size_t num_thread = 1; switch(init_method) { diff --git a/library/src/obselete_driver_offline/conv_fwd_driver_offline.cpp b/library/src/obselete_driver_offline/conv_fwd_driver_offline.cpp index a6f47c5de5..9bdca437c9 100644 --- a/library/src/obselete_driver_offline/conv_fwd_driver_offline.cpp +++ b/library/src/obselete_driver_offline/conv_fwd_driver_offline.cpp @@ -319,7 +319,7 @@ int main(int argc, char* argv[]) print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w)); print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w)); - std::size_t num_thread = std::thread::hardware_concurrency(); + std::size_t num_thread = 1; switch(init_method) { diff --git a/library/src/obselete_driver_offline/conv_fwd_driver_offline_nchwc.cpp b/library/src/obselete_driver_offline/conv_fwd_driver_offline_nchwc.cpp index 6b34254c74..6f28af8bd3 100644 --- a/library/src/obselete_driver_offline/conv_fwd_driver_offline_nchwc.cpp +++ b/library/src/obselete_driver_offline/conv_fwd_driver_offline_nchwc.cpp @@ -282,7 +282,7 @@ int main(int argc, char* argv[]) print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w)); print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w)); - std::size_t num_thread = std::thread::hardware_concurrency(); + std::size_t num_thread = 1; switch(init_method) { diff --git a/library/src/obselete_driver_offline/conv_maxpool_fwd_driver_offline_nchwc.cpp b/library/src/obselete_driver_offline/conv_maxpool_fwd_driver_offline_nchwc.cpp index d8a22bda33..846ce94f91 100644 --- a/library/src/obselete_driver_offline/conv_maxpool_fwd_driver_offline_nchwc.cpp +++ b/library/src/obselete_driver_offline/conv_maxpool_fwd_driver_offline_nchwc.cpp @@ -300,7 +300,7 @@ int main(int argc, char* argv[]) print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w)); print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w)); - std::size_t num_thread = std::thread::hardware_concurrency(); + std::size_t num_thread = 1; switch(init_method) { diff --git a/library/src/obselete_driver_offline/conv_wrw_driver_offline.cpp b/library/src/obselete_driver_offline/conv_wrw_driver_offline.cpp index 0151fea9e5..253b5c2377 100644 --- a/library/src/obselete_driver_offline/conv_wrw_driver_offline.cpp +++ b/library/src/obselete_driver_offline/conv_wrw_driver_offline.cpp @@ -289,7 +289,7 @@ int main(int argc, char* argv[]) print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w)); print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w)); - std::size_t num_thread = std::thread::hardware_concurrency(); + std::size_t num_thread = 1; switch(init_method) { diff --git a/library/src/obselete_driver_offline/gemm_driver_offline.cpp b/library/src/obselete_driver_offline/gemm_driver_offline.cpp index 0c59bea620..8e281f71b1 100644 --- a/library/src/obselete_driver_offline/gemm_driver_offline.cpp +++ b/library/src/obselete_driver_offline/gemm_driver_offline.cpp @@ -313,7 +313,7 @@ int main(int argc, char* argv[]) ostream_HostTensorDescriptor(b.mDesc, std::cout << "b: "); ostream_HostTensorDescriptor(c_host.mDesc, std::cout << "c: "); - std::size_t num_thread = std::thread::hardware_concurrency(); + std::size_t num_thread = 1; switch(init_method) { diff --git a/profiler/include/profile_batched_gemm_impl.hpp b/profiler/include/profile_batched_gemm_impl.hpp index 07e687ebf6..7c39ce685c 100644 --- a/profiler/include/profile_batched_gemm_impl.hpp +++ b/profiler/include/profile_batched_gemm_impl.hpp @@ -103,7 +103,7 @@ bool profile_batched_gemm_impl(int do_verification, std::cout << "b_g_k_n: " << b_g_k_n.mDesc << std::endl; std::cout << "c_g_m_n: " << c_g_m_n_host_result.mDesc << std::endl; - std::size_t num_thread = std::thread::hardware_concurrency(); + std::size_t num_thread = 1; switch(init_method) { case 0: break; diff --git a/profiler/include/profile_gemm_bias_2d_impl.hpp b/profiler/include/profile_gemm_bias_2d_impl.hpp index 935725a808..4980726d96 100644 --- a/profiler/include/profile_gemm_bias_2d_impl.hpp +++ b/profiler/include/profile_gemm_bias_2d_impl.hpp @@ -98,7 +98,7 @@ void profile_gemm_bias_2d_impl(int do_verification, std::cout << "c0_m_n: " << c0_m_n.mDesc << std::endl; std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl; - std::size_t num_thread = std::thread::hardware_concurrency(); + std::size_t num_thread = 1; switch(init_method) { case 0: break; diff --git a/profiler/include/profile_gemm_bias_relu_impl.hpp b/profiler/include/profile_gemm_bias_relu_impl.hpp index e403a88d58..55b6e39064 100644 --- a/profiler/include/profile_gemm_bias_relu_impl.hpp +++ b/profiler/include/profile_gemm_bias_relu_impl.hpp @@ -83,7 +83,7 @@ void profile_gemm_bias_relu_impl(int do_verification, std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl; std::cout << "c0_n: " << c0_n.mDesc << std::endl; - std::size_t num_thread = std::thread::hardware_concurrency(); + std::size_t num_thread = 1; switch(init_method) { case 0: break; diff --git a/profiler/include/profile_gemm_impl.hpp b/profiler/include/profile_gemm_impl.hpp index 409293a22a..409c1fd43c 100644 --- a/profiler/include/profile_gemm_impl.hpp +++ b/profiler/include/profile_gemm_impl.hpp @@ -120,7 +120,7 @@ void profile_gemm_impl(int do_verification, std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; std::cout << "c_m_n: " << c_m_n_device_result.mDesc << std::endl; - std::size_t num_thread = std::thread::hardware_concurrency(); + std::size_t num_thread = 1; switch(init_method) { case 0: break; @@ -408,6 +408,10 @@ void profile_gemm_impl(int do_verification, if(gemm_ptr->IsSupportedArgument(argument_ptr.get())) { + // re-init C to zero before profiling next kernel + c_m_n_device_result.GenerateTensorValue(GeneratorTensor_0{}, num_thread); + c_device_buf.ToDevice(c_m_n_device_result.mData.data()); + std::string gemm_name = gemm_ptr->GetTypeString(); float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); diff --git a/profiler/include/profile_gemm_reduce_impl.hpp b/profiler/include/profile_gemm_reduce_impl.hpp index 8b3a85a208..e103aeff99 100644 --- a/profiler/include/profile_gemm_reduce_impl.hpp +++ b/profiler/include/profile_gemm_reduce_impl.hpp @@ -98,7 +98,7 @@ bool profile_gemm_reduce_impl(int do_verification, std::cout << "d0_m: " << d0_m_host_result.mDesc << std::endl; std::cout << "d1_m: " << d1_m_host_result.mDesc << std::endl; - std::size_t num_thread = std::thread::hardware_concurrency(); + std::size_t num_thread = 1; switch(init_method) { case 0: break; diff --git a/profiler/include/profile_grouped_gemm_impl.hpp b/profiler/include/profile_grouped_gemm_impl.hpp index 33ea11c341..4bdff7cbfc 100644 --- a/profiler/include/profile_grouped_gemm_impl.hpp +++ b/profiler/include/profile_grouped_gemm_impl.hpp @@ -95,7 +95,7 @@ void profile_grouped_gemm_impl(int do_verification, << "]:" << b_k_n[i].mDesc << ", c_m_n_device_results[" << i << "]:" << c_m_n_device_results[i].mDesc << std::endl; - std::size_t num_thread = std::thread::hardware_concurrency(); + std::size_t num_thread = 1; switch(init_method) { case 0: break; diff --git a/profiler/include/profile_reduce_impl.hpp b/profiler/include/profile_reduce_impl.hpp index c03f955ad3..54068e234e 100644 --- a/profiler/include/profile_reduce_impl.hpp +++ b/profiler/include/profile_reduce_impl.hpp @@ -242,7 +242,7 @@ void profile_reduce_impl_impl(bool do_verification, size_t invariant_total_length = out.mDesc.GetElementSize(); size_t reduce_total_length = in.mDesc.GetElementSize() / invariant_total_length; - std::size_t num_thread = std::thread::hardware_concurrency(); + std::size_t num_thread = 1; if(do_verification) { diff --git a/test/gemm_split_k/gemm_split_k.cpp b/test/gemm_split_k/gemm_split_k.cpp index 98a98b5518..a3d4f9b2ec 100644 --- a/test/gemm_split_k/gemm_split_k.cpp +++ b/test/gemm_split_k/gemm_split_k.cpp @@ -120,7 +120,7 @@ int test_gemm(const gemmArgs& args) f_host_tensor_descriptor(args.M, args.N, args.StrideC, c_row_major)); // init data - std::size_t num_thread = std::thread::hardware_concurrency(); + std::size_t num_thread = 1; a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); b_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); // set zero to c_device_buf diff --git a/test/reduce/reduce_no_index.cpp b/test/reduce/reduce_no_index.cpp index 099ee96018..e267dcc433 100644 --- a/test/reduce/reduce_no_index.cpp +++ b/test/reduce/reduce_no_index.cpp @@ -101,7 +101,7 @@ bool test_reduce_no_index_impl(int init_method, size_t invariant_total_length = out.mDesc.GetElementSize(); size_t reduce_total_length = in.mDesc.GetElementSize() / invariant_total_length; - std::size_t num_thread = std::thread::hardware_concurrency(); + std::size_t num_thread = 1; switch(init_method) { diff --git a/test/reduce/reduce_with_index.cpp b/test/reduce/reduce_with_index.cpp index 911f17d8f0..2ea13e831c 100644 --- a/test/reduce/reduce_with_index.cpp +++ b/test/reduce/reduce_with_index.cpp @@ -99,7 +99,7 @@ bool test_reduce_with_index_impl(int init_method, size_t invariant_total_length = out.mDesc.GetElementSize(); size_t reduce_total_length = in.mDesc.GetElementSize() / invariant_total_length; - std::size_t num_thread = std::thread::hardware_concurrency(); + std::size_t num_thread = 1; switch(init_method) {