diff --git a/example/12_reduce/reduce_blockwise.cpp b/example/12_reduce/reduce_blockwise.cpp
index e41a961103..b97799203b 100644
--- a/example/12_reduce/reduce_blockwise.cpp
+++ b/example/12_reduce/reduce_blockwise.cpp
@@ -261,7 +261,7 @@ int main(int argc, char* argv[])
     float alpha = args.scales[0];
     float beta  = args.scales[1];
 
-    std::size_t num_thread = std::thread::hardware_concurrency();
+    std::size_t num_thread = 1;
 
     if(args.do_verification)
     {
diff --git a/library/include/ck/library/host_tensor/host_reduction.hpp b/library/include/ck/library/host_tensor/host_reduction.hpp
index fe9fba6121..4cc8f3fefd 100644
--- a/library/include/ck/library/host_tensor/host_reduction.hpp
+++ b/library/include/ck/library/host_tensor/host_reduction.hpp
@@ -277,7 +277,7 @@ struct ReductionHost
                 out_indices[dst_offset] = accuIndex;
             };
 
-            std::size_t num_thread = std::thread::hardware_concurrency();
+            std::size_t num_thread = 1;
             std::size_t work_per_thread =
                 (invariant_dim_indexes.size() + num_thread - 1) / num_thread;
 
@@ -374,7 +374,7 @@ struct ReductionHost
                 out_data[dst_offset] = type_convert<OutDataType>(accuVal);
             };
 
-            std::size_t num_thread = std::thread::hardware_concurrency();
+            std::size_t num_thread = 1;
             std::size_t work_per_thread =
                 (invariant_dim_indexes.size() + num_thread - 1) / num_thread;
 
diff --git a/library/include/ck/library/host_tensor/host_tensor.hpp b/library/include/ck/library/host_tensor/host_tensor.hpp
index 443e0f9e4c..17ecd4a9fb 100644
--- a/library/include/ck/library/host_tensor/host_tensor.hpp
+++ b/library/include/ck/library/host_tensor/host_tensor.hpp
@@ -163,7 +163,7 @@ struct ParallelTensorFunctor
         return indices;
     }
 
-    void operator()(std::size_t num_thread = std::thread::hardware_concurrency()) const
+    void operator()(std::size_t num_thread = 1) const
     {
         std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread;
 
@@ -213,7 +213,7 @@ struct Tensor
     Tensor(const HostTensorDescriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpace()) {}
 
     template <typename G>
-    void GenerateTensorValue(G g, std::size_t num_thread = std::thread::hardware_concurrency())
+    void GenerateTensorValue(G g, std::size_t num_thread = 1)
     {
         switch(mDesc.GetNumOfDimension())
         {
diff --git a/library/src/obselete_driver_offline/conv_add_fwd_driver_offline_nchwc.cpp b/library/src/obselete_driver_offline/conv_add_fwd_driver_offline_nchwc.cpp
index d818f3c950..9c09936a3b 100644
--- a/library/src/obselete_driver_offline/conv_add_fwd_driver_offline_nchwc.cpp
+++ b/library/src/obselete_driver_offline/conv_add_fwd_driver_offline_nchwc.cpp
@@ -302,7 +302,7 @@ int main(int argc, char* argv[])
     print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
     print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
 
-    std::size_t num_thread = std::thread::hardware_concurrency();
+    std::size_t num_thread = 1;
 
     switch(init_method)
     {
diff --git a/library/src/obselete_driver_offline/conv_bwd_driver_offline.cpp b/library/src/obselete_driver_offline/conv_bwd_driver_offline.cpp
index 7082f1050c..f350f7f071 100644
--- a/library/src/obselete_driver_offline/conv_bwd_driver_offline.cpp
+++ b/library/src/obselete_driver_offline/conv_bwd_driver_offline.cpp
@@ -317,7 +317,7 @@ int main(int argc, char* argv[])
     print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
     print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
 
-    std::size_t num_thread = std::thread::hardware_concurrency();
+    std::size_t num_thread = 1;
 
     switch(init_method)
     {
diff --git a/library/src/obselete_driver_offline/conv_fwd_driver_offline.cpp b/library/src/obselete_driver_offline/conv_fwd_driver_offline.cpp
index a6f47c5de5..9bdca437c9 100644
--- a/library/src/obselete_driver_offline/conv_fwd_driver_offline.cpp
+++ b/library/src/obselete_driver_offline/conv_fwd_driver_offline.cpp
@@ -319,7 +319,7 @@ int main(int argc, char* argv[])
     print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
     print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
 
-    std::size_t num_thread = std::thread::hardware_concurrency();
+    std::size_t num_thread = 1;
 
     switch(init_method)
     {
diff --git a/library/src/obselete_driver_offline/conv_fwd_driver_offline_nchwc.cpp b/library/src/obselete_driver_offline/conv_fwd_driver_offline_nchwc.cpp
index 6b34254c74..6f28af8bd3 100644
--- a/library/src/obselete_driver_offline/conv_fwd_driver_offline_nchwc.cpp
+++ b/library/src/obselete_driver_offline/conv_fwd_driver_offline_nchwc.cpp
@@ -282,7 +282,7 @@ int main(int argc, char* argv[])
     print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
     print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
 
-    std::size_t num_thread = std::thread::hardware_concurrency();
+    std::size_t num_thread = 1;
 
     switch(init_method)
     {
diff --git a/library/src/obselete_driver_offline/conv_maxpool_fwd_driver_offline_nchwc.cpp b/library/src/obselete_driver_offline/conv_maxpool_fwd_driver_offline_nchwc.cpp
index d8a22bda33..846ce94f91 100644
--- a/library/src/obselete_driver_offline/conv_maxpool_fwd_driver_offline_nchwc.cpp
+++ b/library/src/obselete_driver_offline/conv_maxpool_fwd_driver_offline_nchwc.cpp
@@ -300,7 +300,7 @@ int main(int argc, char* argv[])
     print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
     print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
 
-    std::size_t num_thread = std::thread::hardware_concurrency();
+    std::size_t num_thread = 1;
 
     switch(init_method)
     {
diff --git a/library/src/obselete_driver_offline/conv_wrw_driver_offline.cpp b/library/src/obselete_driver_offline/conv_wrw_driver_offline.cpp
index 0151fea9e5..253b5c2377 100644
--- a/library/src/obselete_driver_offline/conv_wrw_driver_offline.cpp
+++ b/library/src/obselete_driver_offline/conv_wrw_driver_offline.cpp
@@ -289,7 +289,7 @@ int main(int argc, char* argv[])
     print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
     print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
 
-    std::size_t num_thread = std::thread::hardware_concurrency();
+    std::size_t num_thread = 1;
 
     switch(init_method)
     {
diff --git a/library/src/obselete_driver_offline/gemm_driver_offline.cpp b/library/src/obselete_driver_offline/gemm_driver_offline.cpp
index 0c59bea620..8e281f71b1 100644
--- a/library/src/obselete_driver_offline/gemm_driver_offline.cpp
+++ b/library/src/obselete_driver_offline/gemm_driver_offline.cpp
@@ -313,7 +313,7 @@ int main(int argc, char* argv[])
     ostream_HostTensorDescriptor(b.mDesc, std::cout << "b: ");
     ostream_HostTensorDescriptor(c_host.mDesc, std::cout << "c: ");
 
-    std::size_t num_thread = std::thread::hardware_concurrency();
+    std::size_t num_thread = 1;
 
     switch(init_method)
     {
diff --git a/profiler/include/profile_batched_gemm_impl.hpp b/profiler/include/profile_batched_gemm_impl.hpp
index 07e687ebf6..7c39ce685c 100644
--- a/profiler/include/profile_batched_gemm_impl.hpp
+++ b/profiler/include/profile_batched_gemm_impl.hpp
@@ -103,7 +103,7 @@ bool profile_batched_gemm_impl(int do_verification,
     std::cout << "b_g_k_n: " << b_g_k_n.mDesc << std::endl;
     std::cout << "c_g_m_n: " << c_g_m_n_host_result.mDesc << std::endl;
 
-    std::size_t num_thread = std::thread::hardware_concurrency();
+    std::size_t num_thread = 1;
     switch(init_method)
     {
     case 0: break;
diff --git a/profiler/include/profile_gemm_bias_2d_impl.hpp b/profiler/include/profile_gemm_bias_2d_impl.hpp
index 935725a808..4980726d96 100644
--- a/profiler/include/profile_gemm_bias_2d_impl.hpp
+++ b/profiler/include/profile_gemm_bias_2d_impl.hpp
@@ -98,7 +98,7 @@ void profile_gemm_bias_2d_impl(int do_verification,
     std::cout << "c0_m_n: " << c0_m_n.mDesc << std::endl;
     std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
 
-    std::size_t num_thread = std::thread::hardware_concurrency();
+    std::size_t num_thread = 1;
     switch(init_method)
     {
     case 0: break;
diff --git a/profiler/include/profile_gemm_bias_relu_impl.hpp b/profiler/include/profile_gemm_bias_relu_impl.hpp
index e403a88d58..55b6e39064 100644
--- a/profiler/include/profile_gemm_bias_relu_impl.hpp
+++ b/profiler/include/profile_gemm_bias_relu_impl.hpp
@@ -83,7 +83,7 @@ void profile_gemm_bias_relu_impl(int do_verification,
     std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
     std::cout << "c0_n: " << c0_n.mDesc << std::endl;
 
-    std::size_t num_thread = std::thread::hardware_concurrency();
+    std::size_t num_thread = 1;
     switch(init_method)
     {
     case 0: break;
diff --git a/profiler/include/profile_gemm_impl.hpp b/profiler/include/profile_gemm_impl.hpp
index 409293a22a..409c1fd43c 100644
--- a/profiler/include/profile_gemm_impl.hpp
+++ b/profiler/include/profile_gemm_impl.hpp
@@ -120,7 +120,7 @@ void profile_gemm_impl(int do_verification,
     std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
     std::cout << "c_m_n: " << c_m_n_device_result.mDesc << std::endl;
 
-    std::size_t num_thread = std::thread::hardware_concurrency();
+    std::size_t num_thread = 1;
     switch(init_method)
     {
     case 0: break;
@@ -408,6 +408,10 @@ void profile_gemm_impl(int do_verification,
 
         if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
         {
+            // re-init C to zero before profiling next kernel
+            c_m_n_device_result.GenerateTensorValue(GeneratorTensor_0<CDataType>{}, num_thread);
+            c_device_buf.ToDevice(c_m_n_device_result.mData.data());
+
             std::string gemm_name = gemm_ptr->GetTypeString();
 
             float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
diff --git a/profiler/include/profile_gemm_reduce_impl.hpp b/profiler/include/profile_gemm_reduce_impl.hpp
index 8b3a85a208..e103aeff99 100644
--- a/profiler/include/profile_gemm_reduce_impl.hpp
+++ b/profiler/include/profile_gemm_reduce_impl.hpp
@@ -98,7 +98,7 @@ bool profile_gemm_reduce_impl(int do_verification,
     std::cout << "d0_m: " << d0_m_host_result.mDesc << std::endl;
     std::cout << "d1_m: " << d1_m_host_result.mDesc << std::endl;
 
-    std::size_t num_thread = std::thread::hardware_concurrency();
+    std::size_t num_thread = 1;
     switch(init_method)
     {
     case 0: break;
diff --git a/profiler/include/profile_grouped_gemm_impl.hpp b/profiler/include/profile_grouped_gemm_impl.hpp
index 33ea11c341..4bdff7cbfc 100644
--- a/profiler/include/profile_grouped_gemm_impl.hpp
+++ b/profiler/include/profile_grouped_gemm_impl.hpp
@@ -95,7 +95,7 @@ void profile_grouped_gemm_impl(int do_verification,
                   << "]:" << b_k_n[i].mDesc << ", c_m_n_device_results[" << i
                   << "]:" << c_m_n_device_results[i].mDesc << std::endl;
 
-        std::size_t num_thread = std::thread::hardware_concurrency();
+        std::size_t num_thread = 1;
         switch(init_method)
         {
         case 0: break;
diff --git a/profiler/include/profile_reduce_impl.hpp b/profiler/include/profile_reduce_impl.hpp
index c03f955ad3..54068e234e 100644
--- a/profiler/include/profile_reduce_impl.hpp
+++ b/profiler/include/profile_reduce_impl.hpp
@@ -242,7 +242,7 @@ void profile_reduce_impl_impl(bool do_verification,
         size_t invariant_total_length = out.mDesc.GetElementSize();
         size_t reduce_total_length    = in.mDesc.GetElementSize() / invariant_total_length;
 
-        std::size_t num_thread = std::thread::hardware_concurrency();
+        std::size_t num_thread = 1;
 
         if(do_verification)
         {
diff --git a/test/gemm_split_k/gemm_split_k.cpp b/test/gemm_split_k/gemm_split_k.cpp
index 98a98b5518..a3d4f9b2ec 100644
--- a/test/gemm_split_k/gemm_split_k.cpp
+++ b/test/gemm_split_k/gemm_split_k.cpp
@@ -120,7 +120,7 @@ int test_gemm(const gemmArgs& args)
         f_host_tensor_descriptor(args.M, args.N, args.StrideC, c_row_major));
 
     // init data
-    std::size_t num_thread = std::thread::hardware_concurrency();
+    std::size_t num_thread = 1;
     a_m_k.GenerateTensorValue(GeneratorTensor_2<float>{-5, 5}, num_thread);
     b_k_n.GenerateTensorValue(GeneratorTensor_2<float>{-5, 5}, num_thread);
     // set zero to c_device_buf
diff --git a/test/reduce/reduce_no_index.cpp b/test/reduce/reduce_no_index.cpp
index 099ee96018..e267dcc433 100644
--- a/test/reduce/reduce_no_index.cpp
+++ b/test/reduce/reduce_no_index.cpp
@@ -101,7 +101,7 @@ bool test_reduce_no_index_impl(int init_method,
     size_t invariant_total_length = out.mDesc.GetElementSize();
     size_t reduce_total_length    = in.mDesc.GetElementSize() / invariant_total_length;
 
-    std::size_t num_thread = std::thread::hardware_concurrency();
+    std::size_t num_thread = 1;
 
     switch(init_method)
     {
diff --git a/test/reduce/reduce_with_index.cpp b/test/reduce/reduce_with_index.cpp
index 911f17d8f0..2ea13e831c 100644
--- a/test/reduce/reduce_with_index.cpp
+++ b/test/reduce/reduce_with_index.cpp
@@ -99,7 +99,7 @@ bool test_reduce_with_index_impl(int init_method,
     size_t invariant_total_length = out.mDesc.GetElementSize();
     size_t reduce_total_length    = in.mDesc.GetElementSize() / invariant_total_length;
 
-    std::size_t num_thread = std::thread::hardware_concurrency();
+    std::size_t num_thread = 1;
 
     switch(init_method)
     {