Mirror of https://github.com/ROCm/composable_kernel.git (synced 2026-05-14 02:02:46 +00:00)
use single threaded tensor generator (#161)
[ROCm/composable_kernel commit: f015c77687]
This commit is contained in:
@@ -261,7 +261,7 @@ int main(int argc, char* argv[])
|
||||
float alpha = args.scales[0];
|
||||
float beta = args.scales[1];
|
||||
|
||||
- std::size_t num_thread = std::thread::hardware_concurrency();
+ std::size_t num_thread = 1;
|
||||
|
||||
if(args.do_verification)
|
||||
{
|
||||
|
||||
@@ -277,7 +277,7 @@ struct ReductionHost
|
||||
out_indices[dst_offset] = accuIndex;
|
||||
};
|
||||
|
||||
- std::size_t num_thread = std::thread::hardware_concurrency();
+ std::size_t num_thread = 1;
|
||||
std::size_t work_per_thread =
|
||||
(invariant_dim_indexes.size() + num_thread - 1) / num_thread;
|
||||
|
||||
@@ -374,7 +374,7 @@ struct ReductionHost
|
||||
out_data[dst_offset] = type_convert<OutDataType>(accuVal);
|
||||
};
|
||||
|
||||
- std::size_t num_thread = std::thread::hardware_concurrency();
+ std::size_t num_thread = 1;
|
||||
std::size_t work_per_thread =
|
||||
(invariant_dim_indexes.size() + num_thread - 1) / num_thread;
|
||||
|
||||
|
||||
@@ -163,7 +163,7 @@ struct ParallelTensorFunctor
|
||||
return indices;
|
||||
}
|
||||
|
||||
- void operator()(std::size_t num_thread = std::thread::hardware_concurrency()) const
+ void operator()(std::size_t num_thread = 1) const
|
||||
{
|
||||
std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread;
|
||||
|
||||
@@ -213,7 +213,7 @@ struct Tensor
|
||||
Tensor(const HostTensorDescriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpace()) {}
|
||||
|
||||
template <typename G>
|
||||
- void GenerateTensorValue(G g, std::size_t num_thread = std::thread::hardware_concurrency())
+ void GenerateTensorValue(G g, std::size_t num_thread = 1)
|
||||
{
|
||||
switch(mDesc.GetNumOfDimension())
|
||||
{
|
||||
|
||||
@@ -302,7 +302,7 @@ int main(int argc, char* argv[])
|
||||
print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
|
||||
print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
|
||||
|
||||
- std::size_t num_thread = std::thread::hardware_concurrency();
+ std::size_t num_thread = 1;
|
||||
|
||||
switch(init_method)
|
||||
{
|
||||
|
||||
@@ -317,7 +317,7 @@ int main(int argc, char* argv[])
|
||||
print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
|
||||
print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
|
||||
|
||||
- std::size_t num_thread = std::thread::hardware_concurrency();
+ std::size_t num_thread = 1;
|
||||
|
||||
switch(init_method)
|
||||
{
|
||||
|
||||
@@ -319,7 +319,7 @@ int main(int argc, char* argv[])
|
||||
print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
|
||||
print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
|
||||
|
||||
- std::size_t num_thread = std::thread::hardware_concurrency();
+ std::size_t num_thread = 1;
|
||||
|
||||
switch(init_method)
|
||||
{
|
||||
|
||||
@@ -282,7 +282,7 @@ int main(int argc, char* argv[])
|
||||
print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
|
||||
print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
|
||||
|
||||
- std::size_t num_thread = std::thread::hardware_concurrency();
+ std::size_t num_thread = 1;
|
||||
|
||||
switch(init_method)
|
||||
{
|
||||
|
||||
@@ -300,7 +300,7 @@ int main(int argc, char* argv[])
|
||||
print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
|
||||
print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
|
||||
|
||||
- std::size_t num_thread = std::thread::hardware_concurrency();
+ std::size_t num_thread = 1;
|
||||
|
||||
switch(init_method)
|
||||
{
|
||||
|
||||
@@ -289,7 +289,7 @@ int main(int argc, char* argv[])
|
||||
print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
|
||||
print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
|
||||
|
||||
- std::size_t num_thread = std::thread::hardware_concurrency();
+ std::size_t num_thread = 1;
|
||||
|
||||
switch(init_method)
|
||||
{
|
||||
|
||||
@@ -313,7 +313,7 @@ int main(int argc, char* argv[])
|
||||
ostream_HostTensorDescriptor(b.mDesc, std::cout << "b: ");
|
||||
ostream_HostTensorDescriptor(c_host.mDesc, std::cout << "c: ");
|
||||
|
||||
- std::size_t num_thread = std::thread::hardware_concurrency();
+ std::size_t num_thread = 1;
|
||||
|
||||
switch(init_method)
|
||||
{
|
||||
|
||||
@@ -103,7 +103,7 @@ bool profile_batched_gemm_impl(int do_verification,
|
||||
std::cout << "b_g_k_n: " << b_g_k_n.mDesc << std::endl;
|
||||
std::cout << "c_g_m_n: " << c_g_m_n_host_result.mDesc << std::endl;
|
||||
|
||||
- std::size_t num_thread = std::thread::hardware_concurrency();
+ std::size_t num_thread = 1;
|
||||
switch(init_method)
|
||||
{
|
||||
case 0: break;
|
||||
|
||||
@@ -98,7 +98,7 @@ void profile_gemm_bias_2d_impl(int do_verification,
|
||||
std::cout << "c0_m_n: " << c0_m_n.mDesc << std::endl;
|
||||
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
|
||||
|
||||
- std::size_t num_thread = std::thread::hardware_concurrency();
+ std::size_t num_thread = 1;
|
||||
switch(init_method)
|
||||
{
|
||||
case 0: break;
|
||||
|
||||
@@ -83,7 +83,7 @@ void profile_gemm_bias_relu_impl(int do_verification,
|
||||
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
|
||||
std::cout << "c0_n: " << c0_n.mDesc << std::endl;
|
||||
|
||||
- std::size_t num_thread = std::thread::hardware_concurrency();
+ std::size_t num_thread = 1;
|
||||
switch(init_method)
|
||||
{
|
||||
case 0: break;
|
||||
|
||||
@@ -120,7 +120,7 @@ void profile_gemm_impl(int do_verification,
|
||||
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
|
||||
std::cout << "c_m_n: " << c_m_n_device_result.mDesc << std::endl;
|
||||
|
||||
- std::size_t num_thread = std::thread::hardware_concurrency();
+ std::size_t num_thread = 1;
|
||||
switch(init_method)
|
||||
{
|
||||
case 0: break;
|
||||
@@ -408,6 +408,10 @@ void profile_gemm_impl(int do_verification,
|
||||
|
||||
if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
+ // re-init C to zero before profiling next kernel
+ c_m_n_device_result.GenerateTensorValue(GeneratorTensor_0<CDataType>{}, num_thread);
+ c_device_buf.ToDevice(c_m_n_device_result.mData.data());
|
||||
|
||||
std::string gemm_name = gemm_ptr->GetTypeString();
|
||||
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
|
||||
|
||||
@@ -98,7 +98,7 @@ bool profile_gemm_reduce_impl(int do_verification,
|
||||
std::cout << "d0_m: " << d0_m_host_result.mDesc << std::endl;
|
||||
std::cout << "d1_m: " << d1_m_host_result.mDesc << std::endl;
|
||||
|
||||
- std::size_t num_thread = std::thread::hardware_concurrency();
+ std::size_t num_thread = 1;
|
||||
switch(init_method)
|
||||
{
|
||||
case 0: break;
|
||||
|
||||
@@ -95,7 +95,7 @@ void profile_grouped_gemm_impl(int do_verification,
|
||||
<< "]:" << b_k_n[i].mDesc << ", c_m_n_device_results[" << i
|
||||
<< "]:" << c_m_n_device_results[i].mDesc << std::endl;
|
||||
|
||||
- std::size_t num_thread = std::thread::hardware_concurrency();
+ std::size_t num_thread = 1;
|
||||
switch(init_method)
|
||||
{
|
||||
case 0: break;
|
||||
|
||||
@@ -242,7 +242,7 @@ void profile_reduce_impl_impl(bool do_verification,
|
||||
size_t invariant_total_length = out.mDesc.GetElementSize();
|
||||
size_t reduce_total_length = in.mDesc.GetElementSize() / invariant_total_length;
|
||||
|
||||
- std::size_t num_thread = std::thread::hardware_concurrency();
+ std::size_t num_thread = 1;
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
|
||||
@@ -120,7 +120,7 @@ int test_gemm(const gemmArgs& args)
|
||||
f_host_tensor_descriptor(args.M, args.N, args.StrideC, c_row_major));
|
||||
|
||||
// init data
|
||||
- std::size_t num_thread = std::thread::hardware_concurrency();
+ std::size_t num_thread = 1;
|
||||
a_m_k.GenerateTensorValue(GeneratorTensor_2<float>{-5, 5}, num_thread);
|
||||
b_k_n.GenerateTensorValue(GeneratorTensor_2<float>{-5, 5}, num_thread);
|
||||
// set zero to c_device_buf
|
||||
|
||||
@@ -101,7 +101,7 @@ bool test_reduce_no_index_impl(int init_method,
|
||||
size_t invariant_total_length = out.mDesc.GetElementSize();
|
||||
size_t reduce_total_length = in.mDesc.GetElementSize() / invariant_total_length;
|
||||
|
||||
- std::size_t num_thread = std::thread::hardware_concurrency();
+ std::size_t num_thread = 1;
|
||||
|
||||
switch(init_method)
|
||||
{
|
||||
|
||||
@@ -99,7 +99,7 @@ bool test_reduce_with_index_impl(int init_method,
|
||||
size_t invariant_total_length = out.mDesc.GetElementSize();
|
||||
size_t reduce_total_length = in.mDesc.GetElementSize() / invariant_total_length;
|
||||
|
||||
- std::size_t num_thread = std::thread::hardware_concurrency();
+ std::size_t num_thread = 1;
|
||||
|
||||
switch(init_method)
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user