From 0898214f0aaa726879e9aa82dcd60d97075258aa Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Fri, 24 Mar 2023 22:57:14 +0000 Subject: [PATCH] added mscclppGetErrorString --- src/include/core.h | 31 ------------------------------- src/include/mscclpp.h | 10 ++++++++++ src/init.cc | 14 ++++++++++++++ tests/allgather_test.cu | 6 +++--- 4 files changed, 27 insertions(+), 34 deletions(-) diff --git a/src/include/core.h b/src/include/core.h index eba530e1..8e6fc902 100644 --- a/src/include/core.h +++ b/src/include/core.h @@ -35,35 +35,4 @@ ret func(args) #endif // end PROFAPI -/* -static __inline__ int mscclppTypeSize(mscclppDataType_t type) { - switch (type) { - case mscclppInt8: - case mscclppUint8: - return 1; - case mscclppFloat16: -#if defined(__CUDA_BF16_TYPES_EXIST__) - case mscclppBfloat16: -#endif - return 2; - case mscclppInt32: - case mscclppUint32: - case mscclppFloat32: - return 4; - case mscclppInt64: - case mscclppUint64: - case mscclppFloat64: - return 8; - default: - return -1; - } -} - -#include "debug.h" -#include "checks.h" -#include "cudawrap.h" -#include "utils.h" -#include "nvtx.h" -*/ - #endif // end include guard diff --git a/src/include/mscclpp.h b/src/include/mscclpp.h index 6120f120..ff74f5ff 100644 --- a/src/include/mscclpp.h +++ b/src/include/mscclpp.h @@ -198,6 +198,16 @@ mscclppResult_t mscclppBootstrapAllGather(mscclppComm_t comm, void* data, int si */ mscclppResult_t mscclppCommDestroy(mscclppComm_t comm); +/* Return the string for the given error code. + * + * Output: + * returns the string + * + * Inputs: + * result: the error code that this function needs to translate + */ +const char* mscclppGetErrorString(mscclppResult_t result); + /* Connect to a remote rank. This function only prepares metadata for connection. The actual connection * is made by a following call of mscclppConnectionSetup(). Note that this function is two-way and a connection * from rank i to remote rank j needs to have a counterpart from rank j to rank i. 
diff --git a/src/init.cc b/src/init.cc index e6bb327c..4481c5a2 100644 --- a/src/init.cc +++ b/src/init.cc @@ -255,6 +255,20 @@ mscclppResult_t mscclppCommDestroy(mscclppComm_t comm){ return mscclppSuccess; } +MSCCLPP_API(const char*, mscclppGetErrorString, mscclppResult_t code); +const char* mscclppGetErrorString(mscclppResult_t code) { /* Translate a result code into a fixed, human-readable description; every branch returns a string literal. */ + switch (code) { + case mscclppSuccess : return "no error"; + case mscclppUnhandledCudaError : return "unhandled cuda error"; + case mscclppSystemError : return "unhandled system error"; + case mscclppInternalError : return "internal error"; + case mscclppInvalidArgument : return "invalid argument"; + case mscclppInvalidUsage : return "invalid usage"; + case mscclppRemoteError : return "remote process exited or there was a network error"; + case mscclppInProgress : return "MSCCL++ operation in progress"; + default : return "unknown result code"; /* any code without an explicit case above */ + } +} MSCCLPP_API(mscclppResult_t, mscclppGetDeviceConnection, mscclppComm_t comm, int remoteRank, int tag, mscclppDevConn_t** devConn); mscclppResult_t mscclppGetDeviceConnection(mscclppComm_t comm, int remoteRank, int tag, mscclppDevConn_t** devConn){ diff --git a/tests/allgather_test.cu b/tests/allgather_test.cu index 095a5dbd..a3c0c9ca 100644 --- a/tests/allgather_test.cu +++ b/tests/allgather_test.cu @@ -18,9 +18,9 @@ static int nranksPerNode = 8; #define MSCCLPPCHECK(call) do { \ mscclppResult_t res = call; \ if (res != mscclppSuccess && res != mscclppInProgress) { \ - /* Print the back trace*/ \ - printf("Failure at %s:%d -> %d\n", __FILE__, __LINE__, res); \ - return res; \ + /* Print the back trace*/ \ + printf("Failure at %s:%d -> %s\n", __FILE__, __LINE__, mscclppGetErrorString(res)); \ + return res; /* hand the failing code back to the caller of the enclosing function */ \ } \ } while (0)