Fix the slow CPU reference batched GEMM kernels. (#388)

* Fix the performance of the batched GEMM verification

* fix tabs

[ROCm/composable_kernel commit: 9061d39bd6]
This commit is contained in:
Illia Silin
2022-08-29 06:39:21 -07:00
committed by GitHub
parent 36415d87dc
commit 57c1172870

View File

@@ -83,8 +83,8 @@ struct ReferenceBatchedGemm : public device::BaseOperator
make_ParallelTensorFunctor(f_gmk_gkn_gmn,
arg.c_g_m_n_.mDesc.GetLengths()[0],
arg.c_g_m_n_.mDesc.GetLengths()[1],
-arg.c_g_m_n_.mDesc.GetLengths()[2])();
+arg.c_g_m_n_.mDesc.GetLengths()[2])(
+std::thread::hardware_concurrency());
return 0;
}