From 22a977e730148080f0774071c525dce69c506406 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Thu, 30 Mar 2023 06:29:38 +0000 Subject: [PATCH 01/23] init --- Makefile | 4 +- tests/allgather_test3.cu | 93 +++++++++++++++++ tests/common.cu | 212 +++++++++++++++++++++++++++++++++++++++ tests/common.h | 152 +++++++++++++++++++++++++++- 4 files changed, 457 insertions(+), 4 deletions(-) create mode 100644 tests/allgather_test3.cu create mode 100644 tests/common.cu diff --git a/Makefile b/Makefile index 8a2fd327..2793b7bb 100644 --- a/Makefile +++ b/Makefile @@ -129,7 +129,7 @@ LIBSONAME := $(LIBNAME).$(MSCCLPP_MAJOR) LIBTARGET := $(BUILDDIR)/$(LIBDIR)/$(LIBNAME).$(MSCCLPP_MAJOR).$(MSCCLPP_MINOR).$(MSCCLPP_PATCH) TESTSDIR := tests -TESTSSRCS := $(addprefix $(TESTSDIR)/,bootstrap_test.cc allgather_test.cu) +TESTSSRCS := $(addprefix $(TESTSDIR)/,bootstrap_test.cc allgather_test.cu common.cu) TESTSOBJS := $(patsubst %.cc,%.o,$(TESTSSRCS)) $(patsubst %.cu,%.o,$(TESTSSRCS)) TESTSOBJTARGETS := $(TESTSOBJS:%=$(BUILDDIR)/$(OBJDIR)/%) TESTSBINS := $(patsubst %.o,$(BUILDDIR)/$(BINDIR)/%,$(TESTSOBJS)) @@ -179,7 +179,7 @@ $(BUILDDIR)/$(OBJDIR)/$(TESTSDIR)/%.o: $(TESTSDIR)/%.cc $(INCTARGETS) # Compile .cu tests $(BUILDDIR)/$(OBJDIR)/$(TESTSDIR)/%.o: $(TESTSDIR)/%.cu $(INCTARGETS) @mkdir -p $(@D) - $(NVCC) -o $@ -I$(BUILDDIR)/$(INCDIR) $(MPI_INC) $(NVCUFLAGS) -c $< $(MPI_MACRO) + $(NVCC) -o $@ -I$(BUILDDIR)/$(INCDIR) $(MPI_INC) $(NVCUFLAGS) $(INCLUDE) -c $< $(MPI_MACRO) # Test bins $(BUILDDIR)/$(BINDIR)/%: $(BUILDDIR)/$(OBJDIR)/%.o $(LIBTARGET) diff --git a/tests/allgather_test3.cu b/tests/allgather_test3.cu new file mode 100644 index 00000000..ce75864b --- /dev/null +++ b/tests/allgather_test3.cu @@ -0,0 +1,93 @@ +/************************************************************************* + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "cuda_runtime.h" +#include "common.h" + +#define ALIGN 4 + +void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + size_t base = (count/(ALIGN*nranks))*ALIGN; + *sendcount = base; + *recvcount = base*nranks; + *sendInplaceOffset = base; + *recvInplaceOffset = 0; + *paramcount = base; +} + +testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + for (int i=0; inGpus; i++) { + CUDACHECK(cudaSetDevice(args->gpus[i])); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0)); + for (int j=0; jexpected[i] + args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0)); + } + CUDACHECK(cudaDeviceSynchronize()); + } + return testSuccess; +} + +void AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize * nranks) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = ((double)(nranks - 1))/((double)nranks); + *busBw = baseBw * factor; +} + +testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + NCCLCHECK(ncclAllGather(sendbuff, recvbuff, count, type, comm, stream)); + return testSuccess; +} + +struct testColl allGatherTest = { + "AllGather", + AllGatherGetCollByteCount, + AllGatherInitData, + AllGatherGetBw, + AllGatherRunColl +}; + +void AllGatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + AllGatherGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t AllGatherRunTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &allGatherTest; + ncclDataType_t *run_types; + const char **run_typenames; + int type_count; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + for (int i=0; i +#include +#include + +#include +#include +#include + +int is_main_proc = 0; +thread_local int is_main_thread = 0; + +// Command line parameter defaults +static int nThreads = 1; +static int nGpus = 1; +static size_t minBytes = 32*1024*1024; +static size_t maxBytes = 32*1024*1024; +static size_t stepBytes = 1*1024*1024; +static size_t stepFactor = 1; +static int datacheck = 1; +static int warmup_iters = 5; +static int iters = 20; +static int timeout = 0; +static int report_cputime = 0; +// Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX) +static int average = 1; + +#define NUM_BLOCKS 32 + +static double parsesize(const char *value) { + long long int units; + double size; + char size_lit; + + int count = sscanf(value, "%lf %1s", &size, &size_lit); + + switch (count) { + case 2: + switch (size_lit) { + case 'G': + case 'g': + units = 1024*1024*1024; + break; + case 'M': + case 'm': + units = 1024*1024; + break; + case 'K': + case 'k': + units = 1024; + break; + default: + return -1.0; + }; + break; + case 1: + units = 1; + break; + default: + return -1.0; + } + + return size * units; +} + +testResult_t run(); // Main function + +int main(int argc, char* argv[]) { + // Make sure everyline is flushed so that we see the progress of the test + setlinebuf(stdout); + + // Parse args + double parsed; + int longindex; + static struct option longopts[] = { + {"nthreads", required_argument, 0, 't'}, + {"ngpus", required_argument, 0, 'g'}, + {"minbytes", required_argument, 0, 'b'}, + {"maxbytes", required_argument, 0, 'e'}, + {"stepbytes", required_argument, 0, 'i'}, + {"stepfactor", required_argument, 0, 'f'}, + {"iters", required_argument, 0, 'n'}, + {"agg_iters", required_argument, 0, 'm'}, + {"warmup_iters", required_argument, 0, 'w'}, + {"check", required_argument, 0, 'c'}, + {"timeout", required_argument, 0, 'T'}, + {"report_cputime", required_argument, 0, 'C'}, + {"average", required_argument, 0, 'a'}, + {"help", no_argument, 0, 'h'}, + {} + }; + + while(1) { + int c; + c = getopt_long(argc, argv, "t:g:b:e:i:f:n:w:c:o:T:h:C:a:", longopts, &longindex); + + if (c == -1) + break; + + switch(c) { + case 't': + nThreads = strtol(optarg, NULL, 0); + break; + case 'g': + nGpus = strtol(optarg, NULL, 0); + break; + case 'b': + parsed = parsesize(optarg); + if (parsed < 0) { + fprintf(stderr, "invalid size specified for 'minbytes'\n"); + return -1; + } + minBytes = (size_t)parsed; + break; + case 'e': + parsed = parsesize(optarg); + if (parsed < 0) { + fprintf(stderr, "invalid size specified for 'maxbytes'\n"); + return -1; + } + maxBytes = (size_t)parsed; + break; + case 'i': + stepBytes = strtol(optarg, NULL, 0); + break; + case 'f': + stepFactor = strtol(optarg, NULL, 0); + break; + case 'n': + iters = (int)strtol(optarg, NULL, 0); + break; + case 'w': + warmup_iters = (int)strtol(optarg, NULL, 0); + break; + case 'c': + datacheck = (int)strtol(optarg, NULL, 0); + break; + case 'T': + timeout = strtol(optarg, NULL, 0); + break; + case 'C': + report_cputime = strtol(optarg, NULL, 0); + break; + case 'a': + average = (int)strtol(optarg, NULL, 0); + break; + case 'h': + default: + if (c != 'h') printf("invalid option '%c'\n", c); + printf("USAGE: %s \n\t" + "[-t,--nthreads ] \n\t" + "[-g,--ngpus ] \n\t" + "[-b,--minbytes ] \n\t" + "[-e,--maxbytes ] \n\t" + "[-i,--stepbytes ] \n\t" + "[-f,--stepfactor ] \n\t" + "[-n,--iters ] \n\t" + "[-w,--warmup_iters ] \n\t" + "[-c,--check <0/1>] \n\t" + "[-T,--timeout