diff --git a/src/core/atomicadd_kernel.cu b/src/core/atomicadd_kernel.cu index 41dd2a33..779c6c1b 100644 --- a/src/core/atomicadd_kernel.cu +++ b/src/core/atomicadd_kernel.cu @@ -24,6 +24,14 @@ void CudaIpcStream::atomicAdd(uint64_t* dst, int64_t value) { // The CUDA runtime uses a per-context lock; the main thread holds it while waiting for // the test kernel, and the proxy thread needs it to launch the atomicAdd kernel. // A separate CUDA context avoids this contention. + // + // TODO(#796): `dst` is a CUDA-IPC mapping registered in the primary/runtime context, so + // launching this kernel from `proxyAtomicCtx_` is technically UB (device pointers are + // context-scoped). It works in practice on current drivers because the IPC handle aliases + // the same physical allocation, but a correct fix would either (a) avoid the separate + // context (e.g. break the deadlock differently) or (b) re-open the IPC mapping inside + // `proxyAtomicCtx_`. Carried over from the DeepEP `chhwang/dev-atomic-add-cleanup` + // cherry-pick; revisit before this lands on `main`. if (!proxyAtomicCtx_) { CUdevice cuDevice; CUresult res = cuDeviceGet(&cuDevice, deviceId_); diff --git a/src/core/context.cc b/src/core/context.cc index 4913da06..84ba4190 100644 --- a/src/core/context.cc +++ b/src/core/context.cc @@ -63,6 +63,14 @@ void CudaIpcStream::sync() { // operations that complete asynchronously on the GPU. Syncing them here would deadlock // because sync() is called from the proxy thread while the main thread may hold the // device context via cudaStreamSynchronize() on the test kernel's stream. + // + // TODO(#796): As a side effect, `Connection::flush()` does not order/complete pending + // remote `atomicAdd` operations on the CUDA-IPC transport, so PortChannel flush no + // longer guarantees that a peer kernel sees the updated value. EP currently relies on + // higher-level signaling (PortChannel signal/wait, FIFO drain) for ordering, but a + // correct fix needs a deadlock-free way to drain `proxyAtomicStream_` here. Carried + // over from the DeepEP `chhwang/dev-atomic-add-cleanup` cherry-pick; revisit before + // this lands on `main`. } IbCtx* Context::Impl::getIbContext(Transport ibTransport) {