This commit is contained in:
Changho Hwang
2023-04-12 09:25:35 +00:00
parent 63a5be6953
commit dd0883b84f
5 changed files with 74 additions and 69 deletions

View File

@@ -20,12 +20,12 @@
} \
} while (false)
#define CUDACHECKNORET(cmd) \
#define CUDACHECKNORET(cmd) \
do { \
cudaError_t err = cmd; \
if (err != cudaSuccess) { \
WARN("Cuda failure '%s'", cudaGetErrorString(err)); \
return; \
return; \
} \
} while (false)

View File

@@ -16,8 +16,6 @@
#define MAXCONNECTIONS 64
struct mscclppConn
{
int connId;
@@ -42,8 +40,8 @@ struct mscclppComm
void* bootstrap;
uint64_t
magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches.
// Magic number for all network communication. Not a security key -- only goal is to detect mismatches.
uint64_t magic;
int rank; // my rank in the communicator
int nRanks; // number of GPUs in communicator

View File

@@ -29,7 +29,6 @@ struct alignas(16) mscclppDevConnSignalEpochId
uint64_t proxy;
};
/***************************************************************************************************************
* A mscclppDevConn provides a zero-copy connection between two GPUs connected via P2P NVLink or InfiniBand.
* The communication API is one-sided meaning that for every single data transfer, only one side
@@ -183,11 +182,11 @@ struct mscclppDevConn
// my remote peer's buffer. only non-NULL with gpu's direct access
// gpu can directly write into it
void* remoteBuff;
};
// Host interface for mscclppDevConn functionality
struct mscclppHostConn{
struct mscclppHostConn
{
virtual ~mscclppHostConn() = default;
virtual void put(uint64_t dstDataOffset, uint64_t srcDataOffset, uint64_t dataSize) = 0;
virtual void signal() = 0;