Separate NPKit CPU timestamp access from different blocks for AMD platform (#321)

Reference: https://github.com/ROCm/rccl/pull/1229
This commit is contained in:
Ziyue Yang
2024-07-02 19:36:48 +08:00
committed by GitHub
parent 0f796bbdf7
commit b5a48f836c
4 changed files with 47 additions and 2 deletions

View File

@@ -262,6 +262,14 @@ struct Executor::Impl {
static uint32_t flag = 0;
int nthreadblocks = context.deviceExecutionPlans.size();
#if defined(ENABLE_NPKIT)
#if defined(__HIP_PLATFORM_AMD__)
if (nthreadblocks > NPKIT_MAX_NUM_GPU_THREADBLOCKS) {
throw Error("Executor plan launching " + std::to_string(nthreadblocks) +
" thread blocks, exceeding NPKit support (" + std::to_string(NPKIT_MAX_NUM_GPU_THREADBLOCKS) +
")",
ErrorCode::ExecutorError);
}
#endif
size_t sharedMemSize = sizeof(DeviceExecutionPlan) + NPKIT_SHM_NUM_EVENTS * sizeof(NpKitEvent);
#else
size_t sharedMemSize = sizeof(DeviceExecutionPlan);