mscclpp/test/mp_unit/ib_tests.cu

// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#include <mpi.h>

#include <mscclpp/cuda_utils.hpp>

#include "infiniband/verbs.h"
#include "mp_unit_tests.hpp"

void IbTestBase::SetUp() {
  MSCCLPP_CUDATHROW(cudaGetDeviceCount(&cudaDevNum));
  cudaDevId = (gEnv->rank % gEnv->nRanksPerNode) % cudaDevNum;
  MSCCLPP_CUDATHROW(cudaSetDevice(cudaDevId));

  int ibDevId = (gEnv->rank % gEnv->nRanksPerNode) / mscclpp::getIBDeviceCount();
  ibDevName = mscclpp::getIBDeviceName(ibIdToTransport(ibDevId));
}

void IbPeerToPeerTest::SetUp() {
  IbTestBase::SetUp();

  mscclpp::UniqueId id;

  if (gEnv->rank < 2) {
    // This test needs only two ranks
    bootstrap = std::make_shared<mscclpp::TcpBootstrap>(gEnv->rank, 2);
    if (bootstrap->rank() == 0) id = bootstrap->createUniqueId();
  }
  MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);
  if (gEnv->rank >= 2) {
    // This test needs only two ranks
    return;
  }

  bootstrap->initialize(id);

  ibCtx = std::make_shared<mscclpp::IbCtx>(ibDevName);
  qp = ibCtx->createQp(1024, 1, 8192, 0, 64);

  qpInfo[gEnv->rank] = qp->getInfo();
  bootstrap->allGather(qpInfo.data(), sizeof(mscclpp::IbQpInfo));
}

void IbPeerToPeerTest::registerBufferAndConnect(void* buf, size_t size) {
  bufSize = size;
  mr = ibCtx->registerMr(buf, size);
  mrInfo[gEnv->rank] = mr->getInfo();
  bootstrap->allGather(mrInfo.data(), sizeof(mscclpp::IbMrInfo));

  for (int i = 0; i < bootstrap->size(); ++i) {
    if (i == gEnv->rank) continue;
    qp->rtr(qpInfo[i]);
    qp->rts();
    break;
  }
  bootstrap->barrier();
}

void IbPeerToPeerTest::stageSend(uint32_t size, uint64_t wrId, uint64_t srcOffset, uint64_t dstOffset, bool signaled) {
  const mscclpp::IbMrInfo& remoteMrInfo = mrInfo[(gEnv->rank == 1) ? 0 : 1];
  qp->stageSend(mr, remoteMrInfo, size, wrId, srcOffset, dstOffset, signaled);
}

void IbPeerToPeerTest::stageAtomicAdd(uint64_t wrId, uint64_t dstOffset, uint64_t addVal, bool signaled) {
  const mscclpp::IbMrInfo& remoteMrInfo = mrInfo[(gEnv->rank == 1) ? 0 : 1];
  qp->stageAtomicAdd(mr, remoteMrInfo, wrId, dstOffset, addVal, signaled);
}

void IbPeerToPeerTest::stageSendWithImm(uint32_t size, uint64_t wrId, uint64_t srcOffset, uint64_t dstOffset,
                                        bool signaled, unsigned int immData) {
  const mscclpp::IbMrInfo& remoteMrInfo = mrInfo[(gEnv->rank == 1) ? 0 : 1];
  qp->stageSendWithImm(mr, remoteMrInfo, size, wrId, srcOffset, dstOffset, signaled, immData);
}

TEST_F(IbPeerToPeerTest, SimpleSendRecv) {
  if (gEnv->rank >= 2) {
    // This test needs only two ranks
    return;
  }

  mscclpp::Timer timeout(3);

  const int maxIter = 100000;
  const int nelem = 1;
  auto data = mscclpp::allocUniqueCuda<int>(nelem);

  registerBufferAndConnect(data.get(), sizeof(int) * nelem);

  if (gEnv->rank == 1) {
    mscclpp::Timer timer;
    for (int iter = 0; iter < maxIter; ++iter) {
      stageSend(sizeof(int) * nelem, 0, 0, 0, true);
      qp->postSend();
      bool waiting = true;
      int spin = 0;
      while (waiting) {
        int wcNum = qp->pollCq();
        ASSERT_GE(wcNum, 0);
        for (int i = 0; i < wcNum; ++i) {
          const ibv_wc* wc = qp->getWc(i);
          EXPECT_EQ(wc->status, IBV_WC_SUCCESS);
          waiting = false;
          break;
        }
        if (spin++ > 1000000) {
          FAIL() << "Polling is stuck.";
        }
      }
    }
    float us = (float)timer.elapsed();
    std::cout << "IbPeerToPeerTest.SimpleSendRecv: " << us / maxIter << " us/iter" << std::endl;
  }
  bootstrap->barrier();
}

__global__ void kernelMemoryConsistency(uint64_t* data, volatile uint64_t* curIter, volatile int* result,
                                        uint64_t nelem, uint64_t maxIter) {
  if (blockIdx.x != 0) return;

  constexpr int FlagWrong = 1;
  constexpr int FlagAbort = 2;

  volatile uint64_t* ptr = data;
  for (uint64_t iter = 1; iter < maxIter + 1; ++iter) {
    int err = 0;

    if (threadIdx.x == 0) {
      *curIter = iter;

      // Wait for the first element arrival (expect equal to iter). Expect that the first element is delivered in
      // a special way that guarantees all other elements are completely delivered.
      uint64_t spin = 0;
      while (ptr[0] != iter) {
        if (spin++ == 1000000) {
          // Assume the program is stuck. Set the abort flag and escape the loop.
          *result |= FlagAbort;
          err = 1;
          break;
        }
      }
    }
    __syncthreads();

    // Check results (expect equal to iter) in backward that is more likely to see the wrong result.
    for (size_t i = nelem - 1 + threadIdx.x; i >= blockDim.x; i -= blockDim.x) {
      if (data[i - blockDim.x] != iter) {
#if 1
        *result |= FlagWrong;
        err = 1;
        break;
#else
        // For debugging purposes: try waiting for the correct result.
        uint64_t spin = 0;
        while (ptr[i - blockDim.x] != iter) {
          if (spin++ == 1000000) {
            *result |= FlagAbort;
            err = 1;
            break;
          }
        }
        if (spin >= 1000000) {
          break;
        }
#endif
      }
    }
    __threadfence();
    __syncthreads();

    // Shuffle err
    for (int i = 16; i > 0; i /= 2) {
      err += __shfl_xor_sync(0xffffffff, err, i);
    }

    if (err > 0) {
      // Exit if any error is detected.
      return;
    }
  }
  if (threadIdx.x == 0) {
    *curIter = maxIter + 1;
  }
}

TEST_F(IbPeerToPeerTest, MemoryConsistency) {
  if (gEnv->rank >= 2) {
    // This test needs only two ranks
    return;
  }

  const uint64_t signalPeriod = 1024;
  const uint64_t maxIter = 10000;
  const uint64_t nelem = 65536 + 1;
  auto data = mscclpp::allocUniqueCuda<uint64_t>(nelem);

  registerBufferAndConnect(data.get(), sizeof(uint64_t) * nelem);

  uint64_t res = 0;
  uint64_t iter = 0;

  if (gEnv->rank == 0) {
    // Receiver
    auto curIter = mscclpp::makeUniqueCudaHost<uint64_t>(0);
    auto result = mscclpp::makeUniqueCudaHost<int>(0);

    volatile uint64_t* ptrCurIter = (volatile uint64_t*)curIter.get();
    volatile int* ptrResult = (volatile int*)result.get();

    ASSERT_EQ(*ptrCurIter, 0);
    ASSERT_EQ(*ptrResult, 0);

    kernelMemoryConsistency<<<1, 1024>>>(data.get(), ptrCurIter, ptrResult, nelem, maxIter);
    MSCCLPP_CUDATHROW(cudaGetLastError());

    for (iter = 1; iter < maxIter + 1; ++iter) {
      mscclpp::Timer timeout(5);

      while (*ptrCurIter != iter + 1) {
        res = *ptrResult;
        if (res != 0) break;
      }

      // Send the result to the sender
      res = *ptrResult;
      uint64_t tmp[2];
      tmp[0] = res;
      bootstrap->allGather(tmp, sizeof(uint64_t));

      if (res != 0) break;
    }

    MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
  } else if (gEnv->rank == 1) {
    // Sender
    std::vector<uint64_t> hostBuffer(nelem, 0);

    for (iter = 1; iter < maxIter + 1; ++iter) {
      mscclpp::Timer timeout(5);

      // Set data
      for (uint64_t i = 0; i < nelem; i++) {
        hostBuffer[i] = iter;
      }
      mscclpp::memcpyCuda<uint64_t>(data.get(), hostBuffer.data(), nelem, cudaMemcpyHostToDevice);

      // Need to signal from time to time to empty the IB send queue
      bool signaled = (iter % signalPeriod == 0);

      // Send from the second element to the last
      stageSend(sizeof(uint64_t) * (nelem - 1), 0, sizeof(uint64_t), sizeof(uint64_t), signaled);
      qp->postSend();

#if 0
      // Send the first element using a normal send. This should occasionally see the wrong result.
      stageSend(sizeof(uint64_t), 0, 0, 0, false);
      qp->postSend();
#else
      // For reference: send the first element using AtomicAdd. This should see the correct result.
      stageAtomicAdd(0, 0, 1, false);
      qp->postSend();
#endif

      if (signaled) {
        int wcNum = qp->pollCq();
        while (wcNum == 0) {
          wcNum = qp->pollCq();
        }
        ASSERT_EQ(wcNum, 1);
        const ibv_wc* wc = qp->getWc(0);
        ASSERT_EQ(wc->status, IBV_WC_SUCCESS);
      }

      // Get the result from the receiver
      uint64_t tmp[2];
      bootstrap->allGather(tmp, sizeof(uint64_t));
      res = tmp[0];

      if (res != 0) break;
    }
  }

  if (res & 2) {
    FAIL() << "The receiver is stuck at iteration " << iter << ".";
  } else if (res != 0 && res != 1) {
    FAIL() << "Unknown error is detected at iteration " << iter << ". res =" << res;
  }

  EXPECT_EQ(res, 0);
}

TEST_F(IbPeerToPeerTest, SimpleAtomicAdd) {
  if (gEnv->rank >= 2) {
    // This test needs only two ranks
    return;
  }

  mscclpp::Timer timeout(3);

  const int maxIter = 100000;
  const int nelem = 1;
  auto data = mscclpp::allocUniqueCuda<int>(nelem);

  registerBufferAndConnect(data.get(), sizeof(int) * nelem);

  if (gEnv->rank == 1) {
    mscclpp::Timer timer;
    for (int iter = 0; iter < maxIter; ++iter) {
      stageAtomicAdd(0, 0, 1, true);
      qp->postSend();
      bool waiting = true;
      int spin = 0;
      while (waiting) {
        int wcNum = qp->pollCq();
        ASSERT_GE(wcNum, 0);
        for (int i = 0; i < wcNum; ++i) {
          const ibv_wc* wc = qp->getWc(i);
          EXPECT_EQ(wc->status, IBV_WC_SUCCESS);
          waiting = false;
          break;
        }
        if (spin++ > 1000000) {
          FAIL() << "Polling is stuck.";
        }
      }
    }
    float us = (float)timer.elapsed();
    std::cout << "IbPeerToPeerTest.SimpleAtomicAdd: " << us / maxIter << " us/iter" << std::endl;
  }
  bootstrap->barrier();
}