added mscclppGetErrorString

This commit is contained in:
Saeed Maleki
2023-03-24 22:57:14 +00:00
parent 0f31dafed5
commit 0898214f0a
4 changed files with 27 additions and 34 deletions

View File

@@ -35,35 +35,4 @@
ret func(args)
#endif // end PROFAPI
/*
static __inline__ int mscclppTypeSize(mscclppDataType_t type) {
switch (type) {
case mscclppInt8:
case mscclppUint8:
return 1;
case mscclppFloat16:
#if defined(__CUDA_BF16_TYPES_EXIST__)
case mscclppBfloat16:
#endif
return 2;
case mscclppInt32:
case mscclppUint32:
case mscclppFloat32:
return 4;
case mscclppInt64:
case mscclppUint64:
case mscclppFloat64:
return 8;
default:
return -1;
}
}
#include "debug.h"
#include "checks.h"
#include "cudawrap.h"
#include "utils.h"
#include "nvtx.h"
*/
#endif // end include guard

View File

@@ -198,6 +198,16 @@ mscclppResult_t mscclppBootstrapAllGather(mscclppComm_t comm, void* data, int si
*/
mscclppResult_t mscclppCommDestroy(mscclppComm_t comm);
/* Return a human-readable description of the given result code.
 *
 * Output:
 * returns a pointer to a constant, null-terminated description string.
 * The implementation returns string literals, so the caller must not
 * modify or free the returned pointer. A code with no dedicated
 * description yields "unknown result code".
 *
 * Inputs:
 * result: the result code that this function needs to translate
 */
const char* mscclppGetErrorString(mscclppResult_t result);
/* Connect to a remote rank. This function only prepares metadata for connection. The actual connection
* is made by a following call of mscclppConnectionSetup(). Note that this function is two-way and a connection
* from rank i to remote rank j needs to have a counterpart from rank j to rank i.

View File

@@ -255,6 +255,20 @@ mscclppResult_t mscclppCommDestroy(mscclppComm_t comm){
return mscclppSuccess;
}
MSCCLPP_API(const char*, mscclppGetErrorString, mscclppResult_t code);
/* Translate a result code into a constant, human-readable description.
 *
 * code: the result code to describe.
 *
 * Returns a pointer to a string literal; the caller must not modify or free
 * it. Any code without a dedicated description maps to "unknown result code".
 */
const char* mscclppGetErrorString(mscclppResult_t code) {
  /* Start from the fallback text; a recognised code overrides it below. */
  const char* description = "unknown result code";

  switch (code) {
    case mscclppSuccess:
      description = "no error";
      break;
    case mscclppUnhandledCudaError:
      description = "unhandled cuda error";
      break;
    case mscclppSystemError:
      description = "unhandled system error";
      break;
    case mscclppInternalError:
      description = "internal error";
      break;
    case mscclppInvalidArgument:
      description = "invalid argument";
      break;
    case mscclppInvalidUsage:
      description = "invalid usage";
      break;
    case mscclppRemoteError:
      description = "remote process exited or there was a network error";
      break;
    case mscclppInProgress:
      description = "MSCCL++ operation in progress";
      break;
    default:
      break;
  }

  return description;
}
MSCCLPP_API(mscclppResult_t, mscclppGetDeviceConnection, mscclppComm_t comm, int remoteRank, int tag, mscclppDevConn_t** devConn);
mscclppResult_t mscclppGetDeviceConnection(mscclppComm_t comm, int remoteRank, int tag, mscclppDevConn_t** devConn){

View File

@@ -18,9 +18,9 @@ static int nranksPerNode = 8;
/* Evaluate `call` exactly once and check its mscclppResult_t result.
 *
 * If the result is neither mscclppSuccess nor mscclppInProgress, print the
 * file and line where the check failed together with the readable error
 * string from mscclppGetErrorString(), then return the result from the
 * ENCLOSING function. The macro can therefore only be used inside a function
 * whose return type accepts an mscclppResult_t value.
 */
#define MSCCLPPCHECK(call) do { \
  mscclppResult_t res = (call); \
  if (res != mscclppSuccess && res != mscclppInProgress) { \
    /* Report where the failure was detected and what it was */ \
    printf("Failure at %s:%d -> %s\n", __FILE__, __LINE__, mscclppGetErrorString(res)); \
    return res; \
  } \
} while (0)