diff --git a/CMakeLists.txt b/CMakeLists.txt index 29e39f79..a12389ed 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,6 +21,7 @@ option(MSCCLPP_BUILD_APPS_NCCL "Build NCCL interfaces" ON) option(MSCCLPP_USE_CUDA "Use NVIDIA/CUDA." OFF) option(MSCCLPP_USE_ROCM "Use AMD/ROCm." OFF) option(MSCCLPP_BYPASS_GPU_CHECK "Bypass GPU check." OFF) +option(MSCCLPP_NPKIT_FLAGS "Enable NPKIT" OFF) if(MSCCLPP_BYPASS_GPU_CHECK) if(MSCCLPP_USE_CUDA) @@ -122,8 +123,8 @@ endif() if(MSCCLPP_ENABLE_TRACE) target_compile_definitions(mscclpp_obj PRIVATE MSCCLPP_ENABLE_TRACE) endif() -if(NPKIT_FLAGS) - target_compile_definitions(mscclpp_obj PRIVATE ${NPKIT_FLAGS}) +if(MSCCLPP_NPKIT_FLAGS) + target_compile_definitions(mscclpp_obj PRIVATE ${MSCCLPP_NPKIT_FLAGS}) endif() # libmscclpp diff --git a/apps/nccl/CMakeLists.txt b/apps/nccl/CMakeLists.txt index 7d8804bd..0377f009 100644 --- a/apps/nccl/CMakeLists.txt +++ b/apps/nccl/CMakeLists.txt @@ -19,7 +19,9 @@ if(MSCCLPP_USE_CUDA) elseif(MSCCLPP_USE_ROCM) target_compile_definitions(mscclpp_nccl_obj PRIVATE MSCCLPP_USE_ROCM) endif() - +if(MSCCLPP_NPKIT_FLAGS) + target_compile_definitions(mscclpp_nccl_obj PRIVATE ${MSCCLPP_NPKIT_FLAGS}) +endif() add_library(mscclpp_nccl SHARED) target_link_libraries(mscclpp_nccl PUBLIC mscclpp_obj mscclpp_nccl_obj) set_target_properties(mscclpp_nccl PROPERTIES VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION}) diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp index 4bd85897..479c971a 100644 --- a/apps/nccl/src/allreduce.hpp +++ b/apps/nccl/src/allreduce.hpp @@ -12,6 +12,10 @@ #include #include +#if defined(ENABLE_NPKIT) +#include +#endif + #include "common.hpp" template @@ -238,10 +242,40 @@ template __global__ void __launch_bounds__(1024, 1) allreduce7(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* smChannels, size_t channelDataOffset, size_t channelScratchOffset, int rank, int nRanksPerNode, int worldSize, - size_t nelems, uint32_t flag) { + size_t nelems, uint32_t flag +#if defined(ENABLE_NPKIT) + , + NpKitEventCollectContext* npKitEventCollectContexts, uint64_t* cpuTimestamp) { +#else + ) { +#endif // This version of allreduce only works for single nodes if (worldSize != nRanksPerNode) return; +#if defined(ENABLE_NPKIT) + extern __shared__ int4 NpkitSharedMem[]; + NpKitEvent* event_buffer = (NpKitEvent*)((char*)NpkitSharedMem); + uint64_t event_buffer_head = 0; +#if defined(ENABLE_NPKIT_EVENT_KERNEL_ALLREDUCE_ENTRY) && defined(ENABLE_NPKIT_EVENT_KERNEL_ALLREDUCE_EXIT) + uint64_t npkit_timestamp_entry = 0; + if (threadIdx.x == 0) { + npkit_timestamp_entry = NPKIT_GET_GPU_TIMESTAMP(); + } +#endif +#endif +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU) +#if defined(MSCCLPP_DEVICE_HIP) + NpKit::CollectGpuEventShm(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, NPKIT_LOAD_CPU_TIMESTAMP_PER_BLOCK(cpuTimestamp, blockIdx.x), +#else + NpKit::CollectGpuEventShm(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp, +#endif + event_buffer, &event_buffer_head); +#endif +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU) + NpKit::CollectGpuEventShm(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, NPKIT_GET_GPU_TIMESTAMP(), event_buffer, + &event_buffer_head); +#endif + if (sizeof(T) == 2) nelems = (nelems * sizeof(T) + sizeof(T)) / sizeof(int); else @@ -312,6 +346,16 @@ __global__ void __launch_bounds__(1024, 1) result[idx].x = data.x; result[idx].y = data.y; } +#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_KERNEL_ALLREDUCE_ENTRY) && \ + defined(ENABLE_NPKIT_EVENT_KERNEL_ALLREDUCE_EXIT) + NpKit::CollectGpuEventShm(NPKIT_EVENT_KERNEL_ALLREDUCE_ENTRY, 0, 0, npkit_timestamp_entry, event_buffer, + &event_buffer_head); + NpKit::CollectGpuEventShm(NPKIT_EVENT_KERNEL_ALLREDUCE_EXIT, 0, 0, NPKIT_GET_GPU_TIMESTAMP(), event_buffer, + &event_buffer_head); +#endif +#if defined(ENABLE_NPKIT) + NpKit::StoreGpuEventShm(npKitEventCollectContexts, event_buffer, event_buffer_head); +#endif } template @@ -470,9 +514,16 @@ cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle< nBlocks = 56; nThreadsPerBlock = (nelems <= 76800) ? 512 : 1024; } +#if defined(ENABLE_NPKIT) + size_t NpkitSharedMemSize = NPKIT_SHM_NUM_EVENTS * sizeof(NpKitEvent); + allreduce7<<>>(buff, scratch, resultBuff, smChannels, channelInOffset, + channelScratchOffset, rank, nRanksPerNode, worldSize, nelems, + flag++, NpKit::GetGpuEventCollectContexts(), NpKit::GetCpuTimestamp()); +#else allreduce7<<>>(buff, scratch, resultBuff, smChannels, channelInOffset, channelScratchOffset, rank, nRanksPerNode, worldSize, nelems, flag++); +#endif } else { int nBlocks = 35; int nThreadsPerBlock = 512; diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index 55562c22..802d399e 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -12,7 +12,9 @@ #include #include #include - +#if defined(ENABLE_NPKIT) +#include +#endif #include "allgather.hpp" #include "allreduce.hpp" #include "broadcast.hpp" @@ -427,6 +429,12 @@ NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueI } *comm = commPtr; +#if defined(ENABLE_NPKIT) + const char* npkitDumpDir = getenv("NPKIT_DUMP_DIR"); + if (npkitDumpDir != nullptr) { + NpKit::Init(rank); + } +#endif return ncclSuccess; } @@ -446,6 +454,13 @@ NCCL_API ncclResult_t ncclCommDestroy(ncclComm_t comm) { WARN("comm is nullptr"); return ncclInvalidArgument; } +#if defined(ENABLE_NPKIT) + const char* npkitDumpDir = getenv("NPKIT_DUMP_DIR"); + if (npkitDumpDir != nullptr) { + NpKit::Dump(npkitDumpDir); + NpKit::Shutdown(); + } +#endif delete comm; return ncclSuccess; } diff --git a/include/mscclpp/npkit/npkit_event.hpp b/include/mscclpp/npkit/npkit_event.hpp index 53e8335a..8be60525 100644 --- a/include/mscclpp/npkit/npkit_event.hpp +++ b/include/mscclpp/npkit/npkit_event.hpp @@ -37,7 +37,10 @@ #define NPKIT_EVENT_EXECUTOR_INIT_ENTRY 0x19 #define NPKIT_EVENT_EXECUTOR_INIT_EXIT 0x1A -#define NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY 0x1B -#define NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT 0x2E +#define NPKIT_EVENT_KERNEL_ALLREDUCE_ENTRY 0x1B +#define NPKIT_EVENT_KERNEL_ALLREDUCE_EXIT 0x1C + +#define NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY 0x1D +#define NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT 0x30 #endif