From 1c156cf42f128b611e77c2569dbac33a6c3c8ee3 Mon Sep 17 00:00:00 2001 From: v-xiaoxshi Date: Thu, 16 Feb 2023 00:49:14 +0000 Subject: [PATCH 1/8] not complete yet --- src/bootstrap/init.cc | 60 ++++++++++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 18 deletions(-) diff --git a/src/bootstrap/init.cc b/src/bootstrap/init.cc index 32c8592b..bbce5d85 100644 --- a/src/bootstrap/init.cc +++ b/src/bootstrap/init.cc @@ -154,32 +154,56 @@ mscclppResult_t mscclppConnect(mscclppComm_t comm, int rankRecv, int rankSend, v return mscclppSuccess; } +struct ipcMemHandleInfo { + cudaIpcMemHandle_t buffHandle; + cudaIpcMemHandle_t flagHandle; + int tag; + int valid; // indicates whether the handles are valid +}; + +mscclppResult_t mscclppP2pConnectionSetup(struct ipcMemHandleInfo* handleInfo /*output*/, struct mscclppConn* conn /*input*/){ + if (handleInfo == NULL || conn == NULL){ + WARN("ipcHandles or connection cannot be null"); + return mscclppInternalError; + } + CUDACHECK(cudaIpcGetMemHandle(&handleInfo->buffHandle, conn->buff)); + CUDACHECK(cudaIpcGetMemHandle(&handleInfo->flagHandle, conn->flag)); + handleInfo->tag = conn->tag; + handleInfo->valid = 1; + return mscclppSuccess; +} + + MSCCLPP_API(mscclppResult_t, mscclppConnectionSetup, mscclppComm_t comm); mscclppResult_t mscclppConnectionSetup(mscclppComm_t comm) { - struct ipcMemHandleInfo { - cudaIpcMemHandle_t handle_buff; - cudaIpcMemHandle_t handle_flag; - int tag; - int valid; - }; - size_t shmSize = MAXCONNECTIONS * sizeof(struct ipcMemHandleInfo); - int fd; - struct ipcMemHandleInfo *handleInfos; - std::string shmname = mscclppShmFileName(comm, comm->localRank); - MSCCLPPCHECK(mscclppShmutilsMapCreate(shmname.c_str(), shmSize, &fd, (void **)&handleInfos)); + struct ipcMemHandleInfo* handleInfos; + // this could potentially be very large, but it's OK since it is on the CPU + MSCCLPPCHECK(mscclppCalloc(&handleInfos, MAXCONNECTIONS*comm->nRanks)); + + + // size_t shmSize = MAXCONNECTIONS * sizeof(struct ipcMemHandleInfo); + // int fd; + // struct ipcMemHandleInfo *handleInfos; + // std::string shmname = mscclppShmFileName(comm, comm->localRank); + // MSCCLPPCHECK(mscclppShmutilsMapCreate(shmname.c_str(), shmSize, &fd, (void **)&handleInfos)); for (int i = 0; i < comm->nConns; ++i) { struct mscclppConn *conn = &comm->conns[i]; - CUDACHECK(cudaIpcGetMemHandle(&handleInfos[i].handle_buff, conn->buff)); - CUDACHECK(cudaIpcGetMemHandle(&handleInfos[i].handle_flag, conn->flag)); - handleInfos[i].tag = conn->tag; - handleInfos[i].valid = 1; + struct ipcMemHandleInfo* handle = &handleInfos[comm->rank+i]; + if (conn->transport == mscclppP2pConnectionSetup){ + MSCCPPCHECK(mscclppP2pConnectionSetup(handle, conn)); + } else { + WARN("Not implemented yet!"); + return mscclppInternalError; + } } - // Local intra-node barrier: wait for all local ranks to have written their memory handles - MSCCLPPCHECK(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0])); + MSCCLPPCHECK(bootstrapAllGather(comm->bootstrap, handleInfos, comm->nRanks*MAXCONNECTIONS*sizeof(struct ipcMemHandleInfo))); + + // // Local intra-node barrier: wait for all local ranks to have written their memory handles + // MSCCLPPCHECK(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0])); MSCCLPPCHECK(mscclppCudaHostCalloc(&comm->devConns, comm->nConns)); @@ -196,7 +220,7 @@ mscclppResult_t mscclppConnectionSetup(mscclppComm_t comm) if (handleInfos_r[i].valid != 1) { break; } - remoteHandles[handleInfos_r[i].tag] = std::make_pair(handleInfos_r[i].handle_buff, handleInfos_r[i].handle_flag); + remoteHandles[handleInfos_r[i].tag] = std::make_pair(handleInfos_r[i].buffHandle, handleInfos_r[i].flagHandle); } for (int i = 0; i < comm->nConns; ++i) { From 81baa738224be01b9df1c2f7117090654d050021 Mon Sep 17 00:00:00 2001 From: v-xiaoxshi Date: Thu, 16 Feb 2023 04:28:36 +0000 Subject: [PATCH 2/8] more progress --- src/bootstrap/init.cc | 129 ++++++++++++++++++++++++++++-------------- src/include/comm.h | 19 +++---- src/include/mscclpp.h | 14 ++++- tests/p2p_test_mpi.cu | 31 +++++----- 4 files changed, 122 insertions(+), 71 deletions(-) diff --git a/src/bootstrap/init.cc b/src/bootstrap/init.cc index bbce5d85..bd74d446 100644 --- a/src/bootstrap/init.cc +++ b/src/bootstrap/init.cc @@ -138,41 +138,59 @@ mscclppResult_t mscclppCommDestroy(mscclppComm_t comm){ MSCCLPP_API(mscclppResult_t, mscclppConnect, mscclppComm_t comm, int rankRecv, int rankSend, void *buff, int *flag, int tag, mscclppTransport_t transportType, const char *ibDev); -mscclppResult_t mscclppConnect(mscclppComm_t comm, int rankRecv, int rankSend, void *buff, int *flag, int tag, +mscclppResult_t mscclppConnect(mscclppComm_t comm, mscclppDevConn* devConnOut, int remoteRank, void* localBuff, int* localFlag, int tag, mscclppTransport_t transportType, const char *ibDev/*=NULL*/) { - if (comm->rank == rankRecv || comm->rank == rankSend) { - struct mscclppConn *conn = &comm->conns[comm->nConns++]; - conn->transport = transportType; - conn->localRank = comm->rank; - conn->ibDev = ibDev; - conn->tag = tag; - conn->buff = buff; - conn->flag = flag; - conn->remoteRank = (comm->rank == rankRecv) ? rankSend : rankRecv; + if (comm->nConns == MAXCONNECTIONS){ + WARN("Too many connections made"); + return mscclppInternalError; } + if (devConnOut == NULL){ + WARN("devConnOut is the output of this function and needs to be allocated by the user"); + return mscclppInvalidUsage; + } + struct mscclppConn *conn = &comm->conns[comm->nConns++]; + conn->transport = transportType; + conn->ibDev = ibDev; + conn->remoteRank = remoteRank; + conn->devConn = devConnOut; + conn->devConn->localBuff = localBuff; + conn->devConn->localFlag = localFlag; + conn->devConn->tag = tag; return mscclppSuccess; } struct ipcMemHandleInfo { cudaIpcMemHandle_t buffHandle; cudaIpcMemHandle_t flagHandle; + int remoteRank; int tag; int valid; // indicates whether the handles are valid }; -mscclppResult_t mscclppP2pConnectionSetup(struct ipcMemHandleInfo* handleInfo /*output*/, struct mscclppConn* conn /*input*/){ +mscclppResult_t mscclppP2pConnectionSetupStart(struct ipcMemHandleInfo* handleInfo /*output*/, struct mscclppConn* conn /*input*/){ if (handleInfo == NULL || conn == NULL){ WARN("ipcHandles or connection cannot be null"); return mscclppInternalError; } - CUDACHECK(cudaIpcGetMemHandle(&handleInfo->buffHandle, conn->buff)); - CUDACHECK(cudaIpcGetMemHandle(&handleInfo->flagHandle, conn->flag)); - handleInfo->tag = conn->tag; + CUDACHECK(cudaIpcGetMemHandle(&handleInfo->buffHandle, conn->devConn->localBuff)); + CUDACHECK(cudaIpcGetMemHandle(&handleInfo->flagHandle, conn->devConn->localFlag)); + handleInfo->remoteRank = conn->devConn->remoteRank; + handleInfo->tag = conn->devConn->tag; handleInfo->valid = 1; return mscclppSuccess; } +mscclppResult_t mscclppP2pConnectionSetupEnd(struct ipcMemHandleInfo* handleInfo /*input*/, struct mscclppConn* conn /*output*/){ + if (handleInfo == NULL || conn == NULL){ + WARN("ipcHandles or connection cannot be null"); + return mscclppInternalError; + } + CUDACHECK(cudaIpcOpenMemHandle(&conn->devConn->remoteBuff, handleInfo->buffHandle, cudaIpcMemLazyEnablePeerAccess)); + CUDACHECK(cudaIpcOpenMemHandle(&conn->devConn->remoteFlag, handleInfo->remoteFlag, cudaIpcMemLazyEnablePeerAccess)); + return mscclppSuccess; +} + MSCCLPP_API(mscclppResult_t, mscclppConnectionSetup, mscclppComm_t comm); mscclppResult_t mscclppConnectionSetup(mscclppComm_t comm) @@ -189,59 +207,84 @@ mscclppResult_t mscclppConnectionSetup(mscclppComm_t comm) // std::string shmname = mscclppShmFileName(comm, comm->localRank); // MSCCLPPCHECK(mscclppShmutilsMapCreate(shmname.c_str(), shmSize, &fd, (void **)&handleInfos)); + // this maps tag * nRanks + remoteRank to the index of local connection + std::map localHandles; for (int i = 0; i < comm->nConns; ++i) { struct mscclppConn *conn = &comm->conns[i]; struct ipcMemHandleInfo* handle = &handleInfos[comm->rank+i]; if (conn->transport == mscclppP2pConnectionSetup){ - MSCCPPCHECK(mscclppP2pConnectionSetup(handle, conn)); + MSCCPPCHECK(mscclppP2pConnectionSetupStart(handle, conn)); } else { WARN("Not implemented yet!"); return mscclppInternalError; } + localHandles[conn->devConn->tag * comm->nRanks + conn->remoteRank] = i; } - MSCCLPPCHECK(bootstrapAllGather(comm->bootstrap, handleInfos, comm->nRanks*MAXCONNECTIONS*sizeof(struct ipcMemHandleInfo))); + MSCCLPPCHECK(bootstrapAllGather(comm->bootstrap, handleInfos, MAXCONNECTIONS*sizeof(struct ipcMemHandleInfo))); // // Local intra-node barrier: wait for all local ranks to have written their memory handles // MSCCLPPCHECK(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0])); + - MSCCLPPCHECK(mscclppCudaHostCalloc(&comm->devConns, comm->nConns)); - - for (int r = 0; r < comm->localRanks; ++r) { - if (r == comm->localRank) + for (int r = 0; r < comm->nRanks; ++r) { + if (r == comm->rank) continue; - int fd_r; - struct ipcMemHandleInfo *handleInfos_r; - std::string shmname_r = mscclppShmFileName(comm, r); - MSCCLPPCHECK(mscclppShmutilsMapOpen(shmname_r.c_str(), shmSize, &fd_r, (void **)&handleInfos_r)); - - std::map> remoteHandles; - for (int i = 0; i < MAXCONNECTIONS; ++i) { - if (handleInfos_r[i].valid != 1) { + for (int i = 0; i < MAXCONNECTIONS; i++){ + struct ipcMemHandleInfo* handle = &handleInfos[r*MAXCONNECTIONS+i]; + if (handle->valid != 1){ break; } - remoteHandles[handleInfos_r[i].tag] = std::make_pair(handleInfos_r[i].buffHandle, handleInfos_r[i].flagHandle); - } - - for (int i = 0; i < comm->nConns; ++i) { - struct mscclppConn *conn = &comm->conns[i]; - auto it = remoteHandles.find(conn->tag); - if (it != remoteHandles.end()) { - comm->devConns[i].tag = conn->tag; - comm->devConns[i].localBuff = conn->buff; - comm->devConns[i].localFlag = conn->flag; - CUDACHECK(cudaIpcOpenMemHandle(&comm->devConns[i].remoteBuff, it->second.first, cudaIpcMemLazyEnablePeerAccess)); - CUDACHECK(cudaIpcOpenMemHandle((void **)&comm->devConns[i].remoteFlag, it->second.second, cudaIpcMemLazyEnablePeerAccess)); + if (handle->remoteRank != comm->rank){ + continue; + } + int key = handle->tag * comm->nRanks + r; + if (localHandles.find(key) == localHandles.end()){ + WARN("Cannot find a local connection on rank %d for remote connection rank %d with tag %d", comm->rank, r, handle->tag); + return mscclppInvalidUsage; + } + int localConnIdx = localHandles[key]; + struct mscclppConn *conn = &comm->conns[localConnIdx]; + if (conn->transport == mscclppP2pConnectionSetup){ + MSCCPPCHECK(mscclppP2pConnectionSetupEnd(handle, conn)); + } else { + WARN("Not implemented yet!"); + return mscclppInternalError; } } + free(handleInfos); + // int fd_r; + // struct ipcMemHandleInfo *handleInfos_r; + // std::string shmname_r = mscclppShmFileName(comm, r); + // MSCCLPPCHECK(mscclppShmutilsMapOpen(shmname_r.c_str(), shmSize, &fd_r, (void **)&handleInfos_r)); - MSCCLPPCHECK(mscclppShmutilsMapClose(shmname_r.c_str(), shmSize, fd_r, handleInfos_r)); + // std::map> remoteHandles; + // for (int i = 0; i < MAXCONNECTIONS; ++i) { + // if (handleInfos_r[i].valid != 1) { + // break; + // } + // remoteHandles[handleInfos_r[i].tag] = std::make_pair(handleInfos_r[i].buffHandle, handleInfos_r[i].flagHandle); + // } + + // for (int i = 0; i < comm->nConns; ++i) { + // struct mscclppConn *conn = &comm->conns[i]; + // auto it = remoteHandles.find(conn->tag); + // if (it != remoteHandles.end()) { + // comm->devConns[i].tag = conn->tag; + // comm->devConns[i].localBuff = conn->buff; + // comm->devConns[i].localFlag = conn->flag; + // CUDACHECK(cudaIpcOpenMemHandle(&comm->devConns[i].remoteBuff, it->second.first, cudaIpcMemLazyEnablePeerAccess)); + // CUDACHECK(cudaIpcOpenMemHandle((void **)&comm->devConns[i].remoteFlag, it->second.second, cudaIpcMemLazyEnablePeerAccess)); + // } + // } + + // MSCCLPPCHECK(mscclppShmutilsMapClose(shmname_r.c_str(), shmSize, fd_r, handleInfos_r)); } // Local intra-node barrier: wait for all local ranks to have read all memory handles - MSCCLPPCHECK(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0])); + // MSCCLPPCHECK(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0])); - MSCCLPPCHECK(mscclppShmutilsMapDestroy(shmname.c_str(), shmSize, fd, handleInfos)); + // MSCCLPPCHECK(mscclppShmutilsMapDestroy(shmname.c_str(), shmSize, fd, handleInfos)); return mscclppSuccess; } diff --git a/src/include/comm.h b/src/include/comm.h index 0d374ab9..5830a2c7 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -157,16 +157,14 @@ // } channels[MAXCHANNELS]; // }; -struct mscclppConn { - mscclppTransport_t transport; - int localRank; - int remoteRank; - const char* ibDev; - int tag; - void* buff; - int* flag; - struct mscclppDevConn *devConn; -}; +// struct mscclppConn { +// mscclppTransport_t transport; +// int remoteRank; +// const char* ibDev; +// int tag; +// void* buff; +// int* flag; +// }; struct mscclppComm { // struct mscclppMemoryStack memPermanent, memScoped; @@ -178,7 +176,6 @@ struct mscclppComm { // struct mscclppTopoSystem* topo; struct mscclppConn conns[MAXCONNECTIONS]; - struct mscclppDevConn *devConns; int nConns; // mscclppNet_t* mscclppNet; diff --git a/src/include/mscclpp.h b/src/include/mscclpp.h index 860d7cf3..bab06986 100644 --- a/src/include/mscclpp.h +++ b/src/include/mscclpp.h @@ -34,6 +34,18 @@ struct mscclppDevConn { // virtual void pullRemoteBuff(size_t srcOffset, size_t dstOffset, size_t size); }; +struct mscclppConn { + mscclppTransport_t transport; + // int localRank; + int remoteRank; + const char* ibDev; + // int tag; + // void* buff; + // int* flag; + mscclppDevConn* devConn; +}; + + typedef struct mscclppComm* mscclppComm_t; typedef struct mscclppDevConn* mscclppDevConn_t; @@ -102,7 +114,7 @@ mscclppResult_t mscclppBootStrapAllGather(mscclppComm_t comm, void* data, int si mscclppResult_t mscclppCommDestroy(mscclppComm_t comm); -mscclppResult_t mscclppConnect(mscclppComm_t comm, int rankRecv, int rankSend, void *buff, int *flag, int tag, +mscclppResult_t mscclppConnect(mscclppComm_t comm, mscclppDevConn* devConnOut, int remoteRank, void* localBuff, int* localFlag, int tag, mscclppTransport_t transportType, const char *ibDev=NULL); mscclppResult_t mscclppConnectionSetup(mscclppComm_t comm); diff --git a/tests/p2p_test_mpi.cu b/tests/p2p_test_mpi.cu index 975436a7..9cd5c032 100644 --- a/tests/p2p_test_mpi.cu +++ b/tests/p2p_test_mpi.cu @@ -13,7 +13,7 @@ } \ } while(false) -__global__ void kernel(mscclppDevConn_t devConns, int rank, int world_size) +__global__ void kernel(mscclppDevConn_t* devConns, int rank, int world_size) { int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid == 0) { @@ -76,36 +76,35 @@ int main(int argc, const char *argv[]) mscclppResult_t res; + mscclppDevConn_t devConns[world_size]; // Read from all other ranks for (int r = 0; r < world_size; ++r) { if (r == rank) continue; - int tag = rank * world_size + r; - res = mscclppConnect(comm, rank, r, data_d, flag_d, tag, mscclppTransportP2P); + int tag = 0; + res = mscclppConnect(comm, &devConn[r], r, data_d, flag_d, tag, mscclppTransportP2P); if (res != mscclppSuccess) { printf("mscclppConnect failed\n"); return -1; } } // Let others read from me - for (int r = 0; r < world_size; ++r) { - if (r == rank) continue; - int tag = r * world_size + rank; - res = mscclppConnect(comm, r, rank, data_d, flag_d, tag, mscclppTransportP2P); - if (res != mscclppSuccess) { - printf("mscclppConnect failed\n"); - return -1; - } - } + // for (int r = 0; r < world_size; ++r) { + // if (r == rank) continue; + // int tag = r * world_size + rank; + // res = mscclppConnect(comm, r, rank, data_d, flag_d, tag, mscclppTransportP2P); + // if (res != mscclppSuccess) { + // printf("mscclppConnect failed\n"); + // return -1; + // } + // } + res = mscclppConnectionSetup(comm); if (res != mscclppSuccess) { printf("mscclppConnectionSetup failed\n"); return -1; } - mscclppDevConn_t devConns; - mscclppGetDevConns(comm, &devConns); - - kernel<<<1, 1>>>(devConns, rank, world_size); + kernel<<<1, 1>>>(devConn, rank, world_size); CUDACHECK(cudaDeviceSynchronize()); int *buf = (int *)calloc(world_size, sizeof(int)); From ad3be20b1583382f91bfc260861c7f17c2bcb635 Mon Sep 17 00:00:00 2001 From: v-xiaoxshi Date: Thu, 16 Feb 2023 04:51:51 +0000 Subject: [PATCH 3/8] compiles now --- src/bootstrap/init.cc | 28 +++++++++++++--------------- src/include/comm.h | 18 ++++++++++-------- src/include/mscclpp.h | 16 ++-------------- tests/p2p_test_mpi.cu | 10 +++++----- 4 files changed, 30 insertions(+), 42 deletions(-) diff --git a/src/bootstrap/init.cc b/src/bootstrap/init.cc index bd74d446..cad25d4b 100644 --- a/src/bootstrap/init.cc +++ b/src/bootstrap/init.cc @@ -136,8 +136,6 @@ mscclppResult_t mscclppCommDestroy(mscclppComm_t comm){ return mscclppSuccess; } -MSCCLPP_API(mscclppResult_t, mscclppConnect, mscclppComm_t comm, int rankRecv, int rankSend, - void *buff, int *flag, int tag, mscclppTransport_t transportType, const char *ibDev); mscclppResult_t mscclppConnect(mscclppComm_t comm, mscclppDevConn* devConnOut, int remoteRank, void* localBuff, int* localFlag, int tag, mscclppTransport_t transportType, const char *ibDev/*=NULL*/) { @@ -175,7 +173,7 @@ mscclppResult_t mscclppP2pConnectionSetupStart(struct ipcMemHandleInfo* handleIn } CUDACHECK(cudaIpcGetMemHandle(&handleInfo->buffHandle, conn->devConn->localBuff)); CUDACHECK(cudaIpcGetMemHandle(&handleInfo->flagHandle, conn->devConn->localFlag)); - handleInfo->remoteRank = conn->devConn->remoteRank; + handleInfo->remoteRank = conn->remoteRank; handleInfo->tag = conn->devConn->tag; handleInfo->valid = 1; return mscclppSuccess; @@ -186,8 +184,8 @@ mscclppResult_t mscclppP2pConnectionSetupEnd(struct ipcMemHandleInfo* handleInfo WARN("ipcHandles or connection cannot be null"); return mscclppInternalError; } - CUDACHECK(cudaIpcOpenMemHandle(&conn->devConn->remoteBuff, handleInfo->buffHandle, cudaIpcMemLazyEnablePeerAccess)); - CUDACHECK(cudaIpcOpenMemHandle(&conn->devConn->remoteFlag, handleInfo->remoteFlag, cudaIpcMemLazyEnablePeerAccess)); + CUDACHECK(cudaIpcOpenMemHandle((void**)&conn->devConn->remoteBuff, handleInfo->buffHandle, cudaIpcMemLazyEnablePeerAccess)); + CUDACHECK(cudaIpcOpenMemHandle((void**)&conn->devConn->remoteFlag, handleInfo->flagHandle, cudaIpcMemLazyEnablePeerAccess)); return mscclppSuccess; } @@ -212,8 +210,8 @@ mscclppResult_t mscclppConnectionSetup(mscclppComm_t comm) for (int i = 0; i < comm->nConns; ++i) { struct mscclppConn *conn = &comm->conns[i]; struct ipcMemHandleInfo* handle = &handleInfos[comm->rank+i]; - if (conn->transport == mscclppP2pConnectionSetup){ - MSCCPPCHECK(mscclppP2pConnectionSetupStart(handle, conn)); + if (conn->transport == mscclppTransportP2P){ + MSCCLPPCHECK(mscclppP2pConnectionSetupStart(handle, conn)); } else { WARN("Not implemented yet!"); return mscclppInternalError; @@ -245,8 +243,8 @@ mscclppResult_t mscclppConnectionSetup(mscclppComm_t comm) } int localConnIdx = localHandles[key]; struct mscclppConn *conn = &comm->conns[localConnIdx]; - if (conn->transport == mscclppP2pConnectionSetup){ - MSCCPPCHECK(mscclppP2pConnectionSetupEnd(handle, conn)); + if (conn->transport == mscclppTransportP2P){ + MSCCLPPCHECK(mscclppP2pConnectionSetupEnd(handle, conn)); } else { WARN("Not implemented yet!"); return mscclppInternalError; @@ -289,9 +287,9 @@ mscclppResult_t mscclppConnectionSetup(mscclppComm_t comm) return mscclppSuccess; } -MSCCLPP_API(mscclppResult_t, mscclppGetDevConns, mscclppComm_t comm, mscclppDevConn_t* devConns); -mscclppResult_t mscclppGetDevConns(mscclppComm_t comm, mscclppDevConn_t* devConns) -{ - *devConns = comm->devConns; - return mscclppSuccess; -} +// MSCCLPP_API(mscclppResult_t, mscclppGetDevConns, mscclppComm_t comm, mscclppDevConn_t* devConns); +// mscclppResult_t mscclppGetDevConns(mscclppComm_t comm, mscclppDevConn_t* devConns) +// { +// *devConns = comm->devConns; +// return mscclppSuccess; +// } diff --git a/src/include/comm.h b/src/include/comm.h index 5830a2c7..91a02fd9 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -157,14 +157,16 @@ // } channels[MAXCHANNELS]; // }; -// struct mscclppConn { -// mscclppTransport_t transport; -// int remoteRank; -// const char* ibDev; -// int tag; -// void* buff; -// int* flag; -// }; +struct mscclppConn { + mscclppTransport_t transport; + // int localRank; + int remoteRank; + const char* ibDev; + // int tag; + // void* buff; + // int* flag; + struct mscclppDevConn* devConn; +}; struct mscclppComm { // struct mscclppMemoryStack memPermanent, memScoped; diff --git a/src/include/mscclpp.h b/src/include/mscclpp.h index bab06986..0a030657 100644 --- a/src/include/mscclpp.h +++ b/src/include/mscclpp.h @@ -34,20 +34,8 @@ struct mscclppDevConn { // virtual void pullRemoteBuff(size_t srcOffset, size_t dstOffset, size_t size); }; -struct mscclppConn { - mscclppTransport_t transport; - // int localRank; - int remoteRank; - const char* ibDev; - // int tag; - // void* buff; - // int* flag; - mscclppDevConn* devConn; -}; - - typedef struct mscclppComm* mscclppComm_t; -typedef struct mscclppDevConn* mscclppDevConn_t; +typedef struct mscclppDevConn mscclppDevConn_t; #define MSCCLPP_UNIQUE_ID_BYTES 128 typedef struct { char internal[MSCCLPP_UNIQUE_ID_BYTES]; } mscclppUniqueId; @@ -119,7 +107,7 @@ mscclppResult_t mscclppConnect(mscclppComm_t comm, mscclppDevConn* devConnOut, i mscclppResult_t mscclppConnectionSetup(mscclppComm_t comm); -mscclppResult_t mscclppGetDevConns(mscclppComm_t comm, mscclppDevConn_t* devConns); +// mscclppResult_t mscclppGetDevConns(mscclppComm_t comm, mscclppDevConn_t* devConns); #ifdef __cplusplus } // end extern "C" diff --git a/tests/p2p_test_mpi.cu b/tests/p2p_test_mpi.cu index 9cd5c032..51e00b92 100644 --- a/tests/p2p_test_mpi.cu +++ b/tests/p2p_test_mpi.cu @@ -13,7 +13,7 @@ } \ } while(false) -__global__ void kernel(mscclppDevConn_t* devConns, int rank, int world_size) +__global__ void kernel(mscclppDevConn_t devConns[8], int rank, int world_size) { int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid == 0) { @@ -26,7 +26,7 @@ __global__ void kernel(mscclppDevConn_t* devConns, int rank, int world_size) *flag = 1; for (int i = 0; i < (world_size - 1) * 2; ++i) { - mscclppDevConn_t devConn = &devConns[i]; + mscclppDevConn_t* devConn = &devConns[i]; int tag = devConn->tag; int rankRecv = tag / world_size; int rankSend = tag % world_size; @@ -76,12 +76,12 @@ int main(int argc, const char *argv[]) mscclppResult_t res; - mscclppDevConn_t devConns[world_size]; + mscclppDevConn_t devConns[8]; // Read from all other ranks for (int r = 0; r < world_size; ++r) { if (r == rank) continue; int tag = 0; - res = mscclppConnect(comm, &devConn[r], r, data_d, flag_d, tag, mscclppTransportP2P); + res = mscclppConnect(comm, &devConns[r], r, data_d, flag_d, tag, mscclppTransportP2P); if (res != mscclppSuccess) { printf("mscclppConnect failed\n"); return -1; @@ -104,7 +104,7 @@ int main(int argc, const char *argv[]) return -1; } - kernel<<<1, 1>>>(devConn, rank, world_size); + kernel<<<1, 1>>>(devConns, rank, world_size); CUDACHECK(cudaDeviceSynchronize()); int *buf = (int *)calloc(world_size, sizeof(int)); From a364a39d17c213c2d926a0d7891b1a9adf8e6561 Mon Sep 17 00:00:00 2001 From: v-xiaoxshi Date: Thu, 16 Feb 2023 05:25:17 +0000 Subject: [PATCH 4/8] compiles now --- src/bootstrap/init.cc | 2 ++ tests/p2p_test_mpi.cu | 2 ++ 2 files changed, 4 insertions(+) diff --git a/src/bootstrap/init.cc b/src/bootstrap/init.cc index cad25d4b..49740cd3 100644 --- a/src/bootstrap/init.cc +++ b/src/bootstrap/init.cc @@ -136,6 +136,8 @@ mscclppResult_t mscclppCommDestroy(mscclppComm_t comm){ return mscclppSuccess; } +MSCCLPP_API(mscclppResult_t, mscclppConnect, mscclppComm_t comm, mscclppDevConn* devConnOut, int remoteRank, void* localBuff, int* localFlag, int tag, + mscclppTransport_t transportType, const char *ibDev); mscclppResult_t mscclppConnect(mscclppComm_t comm, mscclppDevConn* devConnOut, int remoteRank, void* localBuff, int* localFlag, int tag, mscclppTransport_t transportType, const char *ibDev/*=NULL*/) { diff --git a/tests/p2p_test_mpi.cu b/tests/p2p_test_mpi.cu index 51e00b92..e9141099 100644 --- a/tests/p2p_test_mpi.cu +++ b/tests/p2p_test_mpi.cu @@ -63,6 +63,8 @@ int main(int argc, const char *argv[]) int world_size; MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &world_size); + CUDACHECK(cudaSetDevice(rank % 8)); + printf("Starting rank %d of %d\n", rank, world_size); mscclppComm_t comm; const char *ip_port = argv[1]; From 654dd5f1729f4d0f84512937d7f503041d166a31 Mon Sep 17 00:00:00 2001 From: v-xiaoxshi Date: Thu, 16 Feb 2023 07:07:13 +0000 Subject: [PATCH 5/8] works --- src/bootstrap/init.cc | 60 +++++++++++++++++++++---------------------- src/include/comm.h | 16 ++++++------ tests/p2p_test_mpi.cu | 33 ++++++++++++++++-------- 3 files changed, 60 insertions(+), 49 deletions(-) diff --git a/src/bootstrap/init.cc b/src/bootstrap/init.cc index 49740cd3..2c7b5d85 100644 --- a/src/bootstrap/init.cc +++ b/src/bootstrap/init.cc @@ -81,36 +81,36 @@ mscclppResult_t mscclppCommInitRank(mscclppComm_t* comm, int nranks, int rank, c MSCCLPPCHECKGOTO(mscclppCudaHostCalloc((uint32_t **)&_comm->abortFlag, 1), res, fail); MSCCLPPCHECK(bootstrapInit(&handle, _comm)); - _comm->maxLocalRanks = 8; - MSCCLPPCHECKGOTO(mscclppCalloc(&_comm->rankToNode, nranks), res, fail); - MSCCLPPCHECKGOTO(mscclppCalloc(&_comm->rankToLocalRank, nranks), res, fail); - MSCCLPPCHECKGOTO(mscclppCalloc(&_comm->localRankToRank, _comm->maxLocalRanks), res, fail); + // _comm->maxLocalRanks = 8; + // MSCCLPPCHECKGOTO(mscclppCalloc(&_comm->rankToNode, nranks), res, fail); + // MSCCLPPCHECKGOTO(mscclppCalloc(&_comm->rankToLocalRank, nranks), res, fail); + // MSCCLPPCHECKGOTO(mscclppCalloc(&_comm->localRankToRank, _comm->maxLocalRanks), res, fail); - MSCCLPPCHECKGOTO(mscclppCalloc(&hashes, nranks), res, fail); - hashes[rank] = hash; - MSCCLPPCHECK(bootstrapAllGather(_comm->bootstrap, hashes, sizeof(uint64_t))); + // MSCCLPPCHECKGOTO(mscclppCalloc(&hashes, nranks), res, fail); + // hashes[rank] = hash; + // MSCCLPPCHECK(bootstrapAllGather(_comm->bootstrap, hashes, sizeof(uint64_t))); - for (int i = 0; i < nranks; ++i) { - auto it = hashToNode.find(hashes[i]); - if (it == hashToNode.end()) { - _comm->nNodes++; - hashToNode[hashes[i]] = _comm->nNodes - 1; - _comm->rankToNode[i] = _comm->nNodes - 1; - } else { - _comm->rankToNode[i] = it->second; - } - if (hashes[i] == hash) { - _comm->rankToLocalRank[i] = _comm->localRanks++; - _comm->localRankToRank[_comm->rankToLocalRank[i]] = i; - } - } - if (_comm->localRanks > _comm->maxLocalRanks) { - WARN("Too many ranks on the same host: %d", _comm->localRanks); - res = mscclppInvalidUsage; - goto fail; - } - _comm->node = _comm->rankToNode[rank]; - _comm->localRank = _comm->rankToLocalRank[rank]; + // for (int i = 0; i < nranks; ++i) { + // auto it = hashToNode.find(hashes[i]); + // if (it == hashToNode.end()) { + // _comm->nNodes++; + // hashToNode[hashes[i]] = _comm->nNodes - 1; + // _comm->rankToNode[i] = _comm->nNodes - 1; + // } else { + // _comm->rankToNode[i] = it->second; + // } + // if (hashes[i] == hash) { + // _comm->rankToLocalRank[i] = _comm->localRanks++; + // _comm->localRankToRank[_comm->rankToLocalRank[i]] = i; + // } + // } + // if (_comm->localRanks > _comm->maxLocalRanks) { + // WARN("Too many ranks on the same host: %d", _comm->localRanks); + // res = mscclppInvalidUsage; + // goto fail; + // } + // _comm->node = _comm->rankToNode[rank]; + // _comm->localRank = _comm->rankToLocalRank[rank]; *comm = _comm; return res; @@ -211,7 +211,7 @@ mscclppResult_t mscclppConnectionSetup(mscclppComm_t comm) std::map localHandles; for (int i = 0; i < comm->nConns; ++i) { struct mscclppConn *conn = &comm->conns[i]; - struct ipcMemHandleInfo* handle = &handleInfos[comm->rank+i]; + struct ipcMemHandleInfo* handle = &handleInfos[comm->rank*MAXCONNECTIONS+i]; if (conn->transport == mscclppTransportP2P){ MSCCLPPCHECK(mscclppP2pConnectionSetupStart(handle, conn)); } else { @@ -252,7 +252,6 @@ mscclppResult_t mscclppConnectionSetup(mscclppComm_t comm) return mscclppInternalError; } } - free(handleInfos); // int fd_r; // struct ipcMemHandleInfo *handleInfos_r; // std::string shmname_r = mscclppShmFileName(comm, r); @@ -280,6 +279,7 @@ mscclppResult_t mscclppConnectionSetup(mscclppComm_t comm) // MSCCLPPCHECK(mscclppShmutilsMapClose(shmname_r.c_str(), shmSize, fd_r, handleInfos_r)); } + free(handleInfos); // Local intra-node barrier: wait for all local ranks to have read all memory handles // MSCCLPPCHECK(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0])); diff --git a/src/include/comm.h b/src/include/comm.h index 91a02fd9..f06da539 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -196,14 +196,14 @@ struct mscclppComm { // int64_t busId; // my PCI bus ID in int format // cpu_set_t cpuAffinity; // CPU affinity of the GPU - int node; - int nNodes; - int localRank; - int localRanks; - int maxLocalRanks; - int* rankToNode; - int* rankToLocalRank; - int* localRankToRank; + // int node; + // int nNodes; + // int localRank; + // int localRanks; + // int maxLocalRanks; + // int* rankToNode; + // int* rankToLocalRank; + // int* localRankToRank; // // localRanks and localRanktoRank for all nodes // struct mscclppNodeRanks* nodeRanks; diff --git a/tests/p2p_test_mpi.cu b/tests/p2p_test_mpi.cu index e9141099..217fedc3 100644 --- a/tests/p2p_test_mpi.cu +++ b/tests/p2p_test_mpi.cu @@ -13,34 +13,38 @@ } \ } while(false) -__global__ void kernel(mscclppDevConn_t devConns[8], int rank, int world_size) +__constant__ mscclppDevConn_t constDevConns[8]; + +__global__ void kernel(int rank, int world_size) { int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid == 0) { // Set my data - volatile int *data = (volatile int *)devConns[rank].localBuff; - volatile int *flag = (volatile int *)devConns[rank].localFlag; + volatile int *data = (volatile int *)constDevConns[(rank+1) % world_size].localBuff; + volatile int *flag = (volatile int *)constDevConns[(rank+1) % world_size].localFlag; data[rank] = rank; // Inform that the data is set *flag = 1; - for (int i = 0; i < (world_size - 1) * 2; ++i) { - mscclppDevConn_t* devConn = &devConns[i]; - int tag = devConn->tag; - int rankRecv = tag / world_size; - int rankSend = tag % world_size; + for (int i = 0; i < world_size; ++i) { + if (i == rank) continue; + mscclppDevConn_t* devConn = &constDevConns[i]; + // int tag = devConn->tag; + // int rankRecv = tag / world_size; + // int rankSend = tag % world_size; - if (rankRecv != rank) continue; + // if (rankRecv != rank) continue; volatile int *remoteData = (volatile int *)devConn->remoteBuff; volatile int *remoteFlag = (volatile int *)devConn->remoteFlag; + // printf("i = %d ptr1 %p, ptr2 %p\n", i,remoteData, remoteFlag); // Wait until the remote data is set while (*remoteFlag != 1) {} // Read remote data - data[rankSend] = remoteData[rankSend]; + data[i] = remoteData[i]; } } } @@ -78,6 +82,11 @@ int main(int argc, const char *argv[]) mscclppResult_t res; + // if (rank == 0) + // sleep(10); + // else + // sleep(10); + mscclppDevConn_t devConns[8]; // Read from all other ranks for (int r = 0; r < world_size; ++r) { @@ -89,6 +98,7 @@ int main(int argc, const char *argv[]) return -1; } } + // Let others read from me // for (int r = 0; r < world_size; ++r) { // if (r == rank) continue; @@ -106,7 +116,8 @@ int main(int argc, const char *argv[]) return -1; } - kernel<<<1, 1>>>(devConns, rank, world_size); + CUDACHECK(cudaMemcpyToSymbol(constDevConns, devConns, sizeof(mscclppDevConn_t) * world_size)); + kernel<<<1, 1>>>(rank, world_size); CUDACHECK(cudaDeviceSynchronize()); int *buf = (int *)calloc(world_size, sizeof(int)); From 71fbf283d70582cfaf2ebb33253920557eda055a Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Thu, 16 Feb 2023 07:08:38 +0000 Subject: [PATCH 6/8] works --- src/include/comm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/include/comm.h b/src/include/comm.h index f06da539..c005d555 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -203,7 +203,7 @@ struct mscclppComm { // int maxLocalRanks; // int* rankToNode; // int* rankToLocalRank; - // int* localRankToRank; + // int* localRankToRank; // // localRanks and localRanktoRank for all nodes // struct mscclppNodeRanks* nodeRanks; From 4f3418aa77acf4b0beb41f67790285d2f467ec62 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Thu, 16 Feb 2023 07:13:04 +0000 Subject: [PATCH 7/8] more clean up --- Makefile | 2 +- src/bootstrap/init.cc | 7 ++-- src/bootstrap/shmutils.cc | 80 --------------------------------------- src/include/shmutils.h | 11 ------ 4 files changed, 4 insertions(+), 96 deletions(-) delete mode 100644 src/bootstrap/shmutils.cc delete mode 100644 src/include/shmutils.h diff --git a/Makefile b/Makefile index 0ebc2796..e9d1f45c 100644 --- a/Makefile +++ b/Makefile @@ -93,7 +93,7 @@ OBJDIR := obj BINDIR := bin LIBSRCS := $(addprefix src/,debug.cc utils.cc param.cc) -LIBSRCS += $(addprefix src/bootstrap/,init.cc bootstrap.cc socket.cc proxy.cc shmutils.cc) +LIBSRCS += $(addprefix src/bootstrap/,init.cc bootstrap.cc socket.cc proxy.cc) LIBOBJS := $(patsubst %.cc,%.o,$(LIBSRCS)) LIBOBJTARGETS := $(LIBOBJS:%=$(BUILDDIR)/$(OBJDIR)/%) diff --git a/src/bootstrap/init.cc b/src/bootstrap/init.cc index 2c7b5d85..b8142ed7 100644 --- a/src/bootstrap/init.cc +++ b/src/bootstrap/init.cc @@ -1,7 +1,6 @@ #include "mscclpp.h" #include "bootstrap.h" #include "core.h" -#include "shmutils.h" #include #include @@ -65,9 +64,9 @@ MSCCLPP_API(mscclppResult_t, mscclppCommInitRank, mscclppComm_t* comm, int nrank mscclppResult_t mscclppCommInitRank(mscclppComm_t* comm, int nranks, int rank, const char* ip_port_pair){ mscclppResult_t res = mscclppSuccess; mscclppComm_t _comm = NULL; - uint64_t hash = getHostHash(); - uint64_t *hashes; - std::map hashToNode; + // uint64_t hash = getHostHash(); + // uint64_t *hashes; + // std::map hashToNode; MSCCLPPCHECKGOTO(mscclppCalloc(&_comm, 1), res, fail); _comm->rank = rank; diff --git a/src/bootstrap/shmutils.cc b/src/bootstrap/shmutils.cc deleted file mode 100644 index eebe4e52..00000000 --- a/src/bootstrap/shmutils.cc +++ /dev/null @@ -1,80 +0,0 @@ -#include "shmutils.h" -#include "debug.h" -#include -#include -#include -#include - -#define SHM_MODE 0666 - -// Open a shme file and create an mmap. -static mscclppResult_t shmutilsMapOpen(const char *name, size_t size, int *fd, void **map, int flag) -{ - int _fd = shm_open(name, flag, SHM_MODE); - if (_fd == -1) { - WARN("Failed to open shm file %s (flag: %d)", name, flag); - return mscclppInternalError; - } - void *_map = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, _fd, 0); - if (_map == MAP_FAILED) { - WARN("Failed to mmap shm file %s", name); - goto fail; - } - if (flag & O_CREAT) { - if (ftruncate(_fd, 0) == -1) { - WARN("Failed to ftruncate shm file %s", name); - goto fail; - } - } - if (ftruncate(_fd, size) == -1) { - WARN("Failed to ftruncate shm file %s", name); - goto fail; - } - *fd = _fd; - *map = _map; - return mscclppSuccess; -fail: - close(_fd); - shm_unlink(name); - return mscclppInternalError; -} - -// Open or create a shm file. -mscclppResult_t mscclppShmutilsMapCreate(const char *name, size_t size, int *fd, void **map) -{ - return shmutilsMapOpen(name, size, fd, map, O_CREAT | O_RDWR); -} - -// Open an existing shm file. -mscclppResult_t mscclppShmutilsMapOpen(const char *name, size_t size, int *fd, void **map) -{ - return shmutilsMapOpen(name, size, fd, map, O_RDWR); -} - -// Close a shm mmap. -mscclppResult_t mscclppShmutilsMapClose(const char *name, size_t size, int fd, void *map) -{ - int err = 0; - if (munmap(map, size) == -1) { - WARN("Failed to munmap shm file %s", name); - err = 1; - } - close(fd); - return err ? mscclppInternalError : mscclppSuccess; -} - -// Close a shm mmap and destroy a shm file. -mscclppResult_t mscclppShmutilsMapDestroy(const char *name, size_t size, int fd, void *map) -{ - int err = 0; - if (munmap(map, size) == -1) { - WARN("Failed to munmap shm file %s", name); - err = 1; - } - close(fd); - if (shm_unlink(name) == -1) { - WARN("Failed to unlink shm file %s: errno %d", name, errno); - err = 1; - } - return err ? mscclppInternalError : mscclppSuccess; -} diff --git a/src/include/shmutils.h b/src/include/shmutils.h deleted file mode 100644 index 21dfa011..00000000 --- a/src/include/shmutils.h +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef MSCCLPP_SHMUTILS_H_ -#define MSCCLPP_SHMUTILS_H_ - -#include "mscclpp.h" - -mscclppResult_t mscclppShmutilsMapCreate(const char *name, size_t size, int *fd, void **map); -mscclppResult_t mscclppShmutilsMapOpen(const char *name, size_t size, int *fd, void **map); -mscclppResult_t mscclppShmutilsMapClose(const char *name, size_t size, int fd, void *map); -mscclppResult_t mscclppShmutilsMapDestroy(const char *name, size_t size, int fd, void *map); - -#endif From ca89c17aaa3c3b362f2eb55caa798a6a4949b444 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Mon, 20 Feb 2023 00:23:24 +0000 Subject: [PATCH 8/8] more clean up --- tests/p2p_test_mpi.cu | 47 ++++++++++++++----------------------------- 1 file changed, 15 insertions(+), 32 deletions(-) diff --git a/tests/p2p_test_mpi.cu b/tests/p2p_test_mpi.cu index 217fedc3..4005d111 100644 --- a/tests/p2p_test_mpi.cu +++ b/tests/p2p_test_mpi.cu @@ -4,6 +4,15 @@ #include #include +#define MSCCLPPCHECK(call) do { \ + mscclppResult_t res = call; \ + if (res != mscclppSuccess && res != mscclppInProgress) { \ + /* Print the back trace*/ \ + printf("Failure at %s:%d -> %d", __FILE__, __LINE__, res); \ + return res; \ + } \ +} while (0); + // Check CUDA RT calls #define CUDACHECK(cmd) do { \ cudaError_t err = cmd; \ @@ -82,42 +91,20 @@ int main(int argc, const char *argv[]) mscclppResult_t res; - // if (rank == 0) - // sleep(10); - // else - // sleep(10); - mscclppDevConn_t devConns[8]; // Read from all other ranks for (int r = 0; r < world_size; ++r) { if (r == rank) continue; int tag = 0; - res = mscclppConnect(comm, &devConns[r], r, data_d, flag_d, tag, mscclppTransportP2P); - if (res != mscclppSuccess) { - printf("mscclppConnect failed\n"); - return -1; - } - } - - // Let others read from me - // for (int r = 0; r < world_size; ++r) { - // if (r == rank) continue; - // int tag = r * world_size + rank; - // res = mscclppConnect(comm, r, rank, data_d, flag_d, tag, mscclppTransportP2P); - // if (res != mscclppSuccess) { - // printf("mscclppConnect failed\n"); - // return -1; - // } - // } - - res = mscclppConnectionSetup(comm); - if (res != mscclppSuccess) { - printf("mscclppConnectionSetup failed\n"); - return -1; + MSCCLPPCHECK(mscclppConnect(comm, &devConns[r], r, data_d, flag_d, tag, mscclppTransportP2P)); } + MSCCLPPCHECK(mscclppConnectionSetup(comm)); CUDACHECK(cudaMemcpyToSymbol(constDevConns, devConns, sizeof(mscclppDevConn_t) * world_size)); + + kernel<<<1, 1>>>(rank, world_size); + CUDACHECK(cudaDeviceSynchronize()); int *buf = (int *)calloc(world_size, sizeof(int)); @@ -134,11 +121,7 @@ int main(int argc, const char *argv[]) } } - res = mscclppCommDestroy(comm); - if (res != mscclppSuccess) { - printf("mscclppDestroy failed\n"); - return -1; - } + MSCCLPPCHECK(mscclppCommDestroy(comm)); MPI_Finalize();