diff --git a/include/mscclpp/channel.hpp b/include/mscclpp/channel.hpp index 474244ce..76248ea9 100644 --- a/include/mscclpp/channel.hpp +++ b/include/mscclpp/channel.hpp @@ -94,6 +94,22 @@ struct DeviceChannel { put(dst, offset, src, offset, size); } + __forceinline__ __device__ void putDirect(void* dst, void* src, uint64_t dstOffset, uint64_t srcOffset, uint64_t size, + uint32_t threadId, uint32_t numThreads) { + // assume the memory is aligned to 8 bytes + uint64_t* srcAddr = (uint64_t*)((char*)src + srcOffset); + uint64_t* dstAddr = (uint64_t*)((char*)dst + dstOffset); + uint64_t ele; + size_t nElem = size % sizeof(uint64_t) ? (size + sizeof(uint64_t)) / sizeof(uint64_t) : size / sizeof(uint64_t); + for (size_t i = threadId; i < nElem; i += numThreads) { + // load to register first + ele = srcAddr[i]; + dstAddr[i] = ele; + } + } + + __forceinline__ __device__ void signalDirect() { epoch_.signalDirect(); } + __forceinline__ __device__ void signal() { epochIncrement(); fifo_.push(ChannelTrigger(TriggerFlag, 0, 0, 0, 0, 1, channelId_).value); @@ -212,6 +228,9 @@ struct SimpleDeviceChannel { SimpleDeviceChannel(DeviceChannel devChan, MemoryId dst, MemoryId src) : devChan_(devChan), dst_(dst), src_(src) {} + SimpleDeviceChannel(DeviceChannel devChan, void* dstPtr, void* srcPtr) + : devChan_(devChan), srcPtr_(srcPtr), dstPtr_(dstPtr) {} + SimpleDeviceChannel(const SimpleDeviceChannel& other) = default; SimpleDeviceChannel& operator=(SimpleDeviceChannel& other) = default; @@ -224,8 +243,14 @@ struct SimpleDeviceChannel { __forceinline__ __device__ void put(uint64_t offset, uint64_t size) { put(offset, offset, size); } + __forceinline__ __device__ void putDirect(uint64_t offset, uint64_t size, uint32_t threadId, uint32_t numThreads) { + devChan_.putDirect(dstPtr_, srcPtr_, offset, offset, size, threadId, numThreads); + } + __forceinline__ __device__ void signal() { devChan_.signal(); } + __forceinline__ __device__ void signalDirect() { devChan_.signalDirect(); } + __forceinline__ __device__ void putWithSignal(uint64_t dstOffset, uint64_t srcOffset, uint64_t size) { devChan_.putWithSignal(dst_, dstOffset, src_, srcOffset, size); } @@ -251,6 +276,10 @@ struct SimpleDeviceChannel { DeviceChannel devChan_; MemoryId dst_; MemoryId src_; + + // these are used for direct copy + void* srcPtr_; + void* dstPtr_; }; } // namespace channel diff --git a/include/mscclpp/concurrency.hpp b/include/mscclpp/concurrency.hpp new file mode 100644 index 00000000..f734d171 --- /dev/null +++ b/include/mscclpp/concurrency.hpp @@ -0,0 +1,45 @@ +#ifndef MSCCLPP_CONCURRENCY_HPP_ +#define MSCCLPP_CONCURRENCY_HPP_ + +namespace mscclpp { +struct DeviceSyncer { + public: + DeviceSyncer() = default; + ~DeviceSyncer() = default; + +#ifdef __CUDACC__ + // Synchronize multiple thread blocks inside a kernel. Guarantee that all + // previous work of all threads in cooperating blocks is finished. + __forceinline__ __device__ void sync(int blockNum) { + int maxOldCnt = blockNum - 1; + __syncthreads(); + if (threadIdx.x == 0) { + int tmpIsAdd = isAdd_ ^ 1; + if (tmpIsAdd) { + if (atomicAdd(&count_, 1) == maxOldCnt) { + flag_ = 1; + } + while (!flag_) { + } + } else { + if (atomicSub(&count_, 1) == 1) { + flag_ = 0; + } + while (flag_) { + } + } + isAdd_ = tmpIsAdd; + } + // We need sync here because only a single thread is checking whether + // the flag is flipped. + __syncthreads(); + } +#endif + + private: + volatile int flag_; + int count_; + int isAdd_; +}; +} // namespace mscclpp +#endif // MSCCLPP_CONCURRENCY_HPP_ diff --git a/include/mscclpp/epoch.hpp b/include/mscclpp/epoch.hpp index 42908376..28fad52e 100644 --- a/include/mscclpp/epoch.hpp +++ b/include/mscclpp/epoch.hpp @@ -17,9 +17,9 @@ class BaseEpoch { private: std::shared_ptr connection_; RegisteredMemory localEpochIdsRegMem_; - NonblockingFuture remoteEpochIdsRegMem_; protected: + NonblockingFuture remoteEpochIdsRegMem_; std::unique_ptr> epochIds_; std::unique_ptr> expectedInboundEpochId_; @@ -56,9 +56,18 @@ class DeviceEpoch : BaseEpoch { } __forceinline__ __device__ void epochIncrement() { *(volatile uint64_t*)&(epochIds->outbound) += 1; } + + __forceinline__ __device__ void signalDirect() { + // This fence ensures that the writes from a preceding putDirect() are visible on the peer GPU before the + // incremented epoch id is visible. + __threadfence_system(); + epochIncrement(); + *(volatile uint64_t*)&(remoteEpochIds->inboundReplica) = epochIds->outbound; + } #endif // __CUDACC__ EpochIds* epochIds; + EpochIds* remoteEpochIds; uint64_t* expectedInboundEpochId; }; diff --git a/src/include/utils.hpp b/include/mscclpp/utils.hpp similarity index 64% rename from src/include/utils.hpp rename to include/mscclpp/utils.hpp index 536d1d29..7c2da2e1 100644 --- a/src/include/utils.hpp +++ b/include/mscclpp/utils.hpp @@ -1,9 +1,12 @@ #ifndef MSCCLPP_UTILS_HPP_ #define MSCCLPP_UTILS_HPP_ -#include +#include #include +#include +#include +#include namespace mscclpp { @@ -35,6 +38,18 @@ struct ScopedTimer { ~ScopedTimer() { timer.print(name); } }; +inline std::string getHostName(int maxlen, const char delim) { + std::string hostname(maxlen + 1, '\0'); + if (gethostname(const_cast(hostname.data()), maxlen) != 0) { + std::strncpy(const_cast(hostname.data()), "unknown", maxlen); + throw; + } + int i = 0; + while ((hostname[i] != delim) && (hostname[i] != '\0') && (i < maxlen - 1)) i++; + hostname[i] = '\0'; + return hostname; +} + } // namespace mscclpp #endif // MSCCLPP_UTILS_HPP_ diff --git a/src/connection.cc b/src/connection.cc index 237c1305..68802384 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -1,13 +1,13 @@ #include "connection.hpp" #include +#include #include "checks_internal.hpp" #include "debug.h" #include "infiniband/verbs.h" #include "npkit/npkit.h" #include "registered_memory.hpp" -#include "utils.hpp" namespace mscclpp { diff --git a/src/epoch.cc b/src/epoch.cc index 0f73f9cb..3dd4cb84 100644 --- a/src/epoch.cc +++ b/src/epoch.cc @@ -13,6 +13,7 @@ MSCCLPP_API_CPP void DeviceEpoch::signal() { BaseEpoch::signal(); } MSCCLPP_API_CPP DeviceEpoch::DeviceHandle DeviceEpoch::deviceHandle() { DeviceEpoch::DeviceHandle device; + device.remoteEpochIds = reinterpret_cast(remoteEpochIdsRegMem_.get().data()); device.epochIds = epochIds_.get(); device.expectedInboundEpochId = expectedInboundEpochId_.get(); return device; diff --git a/src/proxy.cc b/src/proxy.cc index 8a066279..cfffb4be 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -1,11 +1,11 @@ #include #include #include +#include #include #include "api.h" #include "utils.h" -#include "utils.hpp" namespace mscclpp { diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 7d3959f7..54fe31b6 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -22,3 +22,6 @@ add_executable(unit_tests) target_link_libraries(unit_tests GTest::gtest_main GTest::gmock_main mscclpp CUDA::cudart CUDA::cuda_driver) add_subdirectory(unit) # This adds the sources to the mscclpp target gtest_discover_tests(unit_tests DISCOVERY_MODE PRE_TEST) + +# Msccclpp_test +add_subdirectory(mscclpp-test) diff --git a/test/common.cu b/test/common.cu deleted file mode 100644 index 35d82c74..00000000 --- a/test/common.cu +++ /dev/null @@ -1,692 +0,0 @@ -/************************************************************************* - * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "common.h" -#include "cuda.h" -#include "mscclpp.h" - -#include -#include -#include -#include -#include -#include - -#include -#include - -#define NUM_BLOCKS 32 - -int is_main_proc = 0; -thread_local int is_main_thread = 0; - -namespace { -class timer -{ - std::uint64_t t0; - -public: - timer(); - double elapsed() const; - double reset(); -}; - -std::uint64_t now() -{ - using clock = std::chrono::steady_clock; - return std::chrono::duration_cast(clock::now().time_since_epoch()).count(); -} - -// Command line parameter defaults -size_t minBytes = 32 * 1024 * 1024; -size_t maxBytes = 32 * 1024 * 1024; -size_t stepBytes = 1 * 1024 * 1024; -size_t stepFactor = 1; -int datacheck = 1; -int warmup_iters = 10; -int iters = 100; -int timeout = 0; -int report_cputime = 0; -// Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX) -int average = 1; -int kernel_num = 0; -int cudaGraphLaunches = 15; - -double parsesize(const char* value) -{ - long long int units; - double size; - char size_lit; - - int count = sscanf(value, "%lf %1s", &size, &size_lit); - - switch (count) { - case 2: - switch (size_lit) { - case 'G': - case 'g': - units = 1024 * 1024 * 1024; - break; - case 'M': - case 'm': - units = 1024 * 1024; - break; - case 'K': - case 'k': - units = 1024; - break; - default: - return -1.0; - }; - break; - case 1: - units = 1; - break; - default: - return -1.0; - } - - return size * units; -} - -inline testResult_t Barrier(struct testArgs* args) -{ - int tmp[16]; - // A simple barrier - MSCCLPPCHECK(mscclppBootstrapAllGather(args->comm, tmp, sizeof(int))); - return testSuccess; -} -} // namespace - -timer::timer() -{ - t0 = now(); -} - -double timer::elapsed() const -{ - std::uint64_t t1 = now(); - return 1.e-9 * (t1 - t0); -} - -double timer::reset() -{ - std::uint64_t t1 = now(); - double ans = 1.e-9 * (t1 - t0); - t0 = t1; - return ans; -} - -testResult_t AllocateBuffs(void** sendbuff, size_t sendBytes, void** recvbuff, size_t recvBytes, void** expected, - size_t nbytes) -{ - CUDACHECK(cudaMalloc(sendbuff, nbytes)); - CUDACHECK(cudaMalloc(recvbuff, nbytes)); - if (datacheck) - CUDACHECK(cudaMalloc(expected, recvBytes)); - return testSuccess; -} - -testResult_t startColl(struct testArgs* args, int in_place, int iter) -{ - size_t count = args->nbytes; - - // Try to change offset for each iteration so that we avoid cache effects and catch race conditions in ptrExchange - size_t totalnbytes = max(args->sendBytes, args->expectedBytes); - size_t steps = totalnbytes ? args->maxbytes / totalnbytes : 1; - size_t shift = totalnbytes * (iter % steps); - - int rank = args->proc; - char* recvBuff = ((char*)args->recvbuff) + shift; - char* sendBuff = ((char*)args->sendbuff) + shift; - - TESTCHECK(args->collTest->runColl((void*)(in_place ? recvBuff + args->sendInplaceOffset * rank : sendBuff), - (void*)(in_place ? recvBuff + args->recvInplaceOffset * rank : recvBuff), - args->nranksPerNode, count, args->comm, args->stream, args->kernel_num)); - return testSuccess; -} - -testResult_t testStreamSynchronize(cudaStream_t stream) -{ - cudaError_t cudaErr; - timer tim; - - while (true) { - cudaErr = cudaStreamQuery(stream); - if (cudaErr == cudaSuccess) { - break; - } - - if (cudaErr != cudaErrorNotReady) - CUDACHECK(cudaErr); - - double delta = tim.elapsed(); - if (delta > timeout && timeout > 0) { - char hostname[1024]; - getHostName(hostname, 1024); - printf("%s: Test timeout (%ds) %s:%d\n", hostname, timeout, __FILE__, __LINE__); - return testTimeout; - } - - // We might want to let other threads (including MSCCLPP threads) use the CPU. - sched_yield(); - } - return testSuccess; -} - -testResult_t completeColl(struct testArgs* args) -{ - TESTCHECK(testStreamSynchronize(args->stream)); - return testSuccess; -} - -// Inter process barrier+allreduce. The quality of the return value -// for average=0 is just value itself. -// Inter process barrier+allreduce. The quality of the return value -// for average=0 is just value itself. -template void Allreduce(struct testArgs* args, T* value, int average) -{ - T accumulator = *value; - -#ifdef MSCCLPP_USE_MPI_FOR_TESTS - if (average != 0) { - static_assert(std::is_same::value || std::is_same::value, - "Allreduce only for T in {long long, double}"); - MPI_Datatype ty = std::is_same::value ? MPI_LONG_LONG - : std::is_same::value ? MPI_DOUBLE - : MPI_Datatype(); - MPI_Op op = average == 1 ? MPI_SUM - : average == 2 ? MPI_MIN - : average == 3 ? MPI_MAX - : average == 4 ? MPI_SUM - : MPI_Op(); - MPI_Allreduce(MPI_IN_PLACE, (void*)&accumulator, 1, ty, op, MPI_COMM_WORLD); - } -#endif - - if (average == 1) - accumulator /= args->totalProcs; - *value = accumulator; -} - -testResult_t CheckData(struct testArgs* args, int64_t* wrongElts) -{ - size_t count = args->expectedBytes / sizeof(int); - - int* dataHostRecv = new int[count]; - int* dataHostExpected = new int[count]; - CUDACHECK(cudaMemcpy(dataHostRecv, args->recvbuff, args->expectedBytes, cudaMemcpyDeviceToHost)); - CUDACHECK(cudaMemcpy(dataHostExpected, args->expected, args->expectedBytes, cudaMemcpyDeviceToHost)); - - for (size_t i = 0; i < count; i++) { - if (dataHostRecv[i] != dataHostExpected[i]) { - // PRINT("Error: dataHostRecv[%ld] = %d, dataHostExpected[%ld] = %d\n", i, dataHostRecv[i], i, - // dataHostExpected[i]); - *wrongElts += 1; - } - } - if (args->reportErrors && *wrongElts) { - (args->error)++; - } - return testSuccess; -} - -testResult_t BenchTime(struct testArgs* args, int in_place) -{ - size_t count = args->nbytes; - - TESTCHECK(args->collTest->initData(args, in_place)); - // Sync - TESTCHECK(startColl(args, in_place, 0)); - TESTCHECK(completeColl(args)); - - TESTCHECK(Barrier(args)); - - // Performance Benchmark - cudaGraph_t graph; - cudaGraphExec_t graphExec; - CUDACHECK(cudaStreamBeginCapture(args->stream, cudaStreamCaptureModeGlobal)); - timer tim; - for (int iter = 0; iter < iters; iter++) { - TESTCHECK(startColl(args, in_place, iter)); - } - CUDACHECK(cudaStreamEndCapture(args->stream, &graph)); - CUDACHECK(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0)); - - // Launch the graph - TESTCHECK(Barrier(args)); - tim.reset(); - for (int l = 0; l < cudaGraphLaunches; ++l) { - CUDACHECK(cudaGraphLaunch(graphExec, args->stream)); - } - - double cputimeSec = tim.elapsed() / (iters); - TESTCHECK(completeColl(args)); - - double deltaSec = tim.elapsed(); - deltaSec = deltaSec / (iters) / (cudaGraphLaunches); - Allreduce(args, &deltaSec, average); - - CUDACHECK(cudaGraphExecDestroy(graphExec)); - CUDACHECK(cudaGraphDestroy(graph)); - - double algBw, busBw; - args->collTest->getBw(count, 1, deltaSec, &algBw, &busBw, args->totalProcs); - TESTCHECK(Barrier(args)); - - int64_t wrongElts = 0; - if (datacheck) { - // Initialize sendbuffs, recvbuffs and expected - TESTCHECK(args->collTest->initData(args, in_place)); - // Begin cuda graph capture for data check - CUDACHECK(cudaStreamBeginCapture(args->stream, cudaStreamCaptureModeGlobal)); - // test validation in single itertion, should ideally be included into the multi-iteration run - TESTCHECK(startColl(args, in_place, 0)); - // End cuda graph capture - CUDACHECK(cudaStreamEndCapture(args->stream, &graph)); - // Instantiate cuda graph - CUDACHECK(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0)); - // Launch cuda graph - CUDACHECK(cudaGraphLaunch(graphExec, args->stream)); - - TESTCHECK(completeColl(args)); - - // destroy cuda graph - CUDACHECK(cudaGraphExecDestroy(graphExec)); - CUDACHECK(cudaGraphDestroy(graph)); - - TESTCHECK(CheckData(args, &wrongElts)); - - // aggregate delta from all threads and procs - long long wrongElts1 = wrongElts; - Allreduce(args, &wrongElts1, /*sum*/ 4); - wrongElts = wrongElts1; - } - - double timeUsec = (report_cputime ? cputimeSec : deltaSec) * 1.0E6; - char timeStr[100]; - if (timeUsec >= 10000.0) { - sprintf(timeStr, "%7.0f", timeUsec); - } else if (timeUsec >= 100.0) { - sprintf(timeStr, "%7.1f", timeUsec); - } else { - sprintf(timeStr, "%7.2f", timeUsec); - } - if (!in_place) { - PRINT(" "); - } - if (args->reportErrors) { - PRINT(" %7s %6.2f %6.2f %5g", timeStr, algBw, busBw, (double)wrongElts); - } else { - PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A"); - } - - args->bw += busBw; - args->bw_count++; - return testSuccess; -} - -testResult_t setupArgsAndInit(size_t size, struct testArgs* args) -{ - int nranks = args->totalProcs; - size_t count, sendCount, recvCount, paramCount, sendInplaceOffset, recvInplaceOffset; - - // TODO: support more data types - int typeSize = sizeof(int); - count = size / typeSize; - args->collTest->getCollByteCount(&sendCount, &recvCount, ¶mCount, &sendInplaceOffset, &recvInplaceOffset, - (size_t)count, (size_t)nranks); - - args->nbytes = paramCount * typeSize; - args->sendBytes = sendCount * typeSize; - args->expectedBytes = recvCount * typeSize; - args->sendInplaceOffset = sendInplaceOffset * typeSize; - args->recvInplaceOffset = recvInplaceOffset * typeSize; - - return args->collTest->initColl(); -} - -testResult_t TimeTest(struct testArgs* args) -{ - // Sync to avoid first-call timeout - TESTCHECK(Barrier(args)); - - // Warm-up for large size - TESTCHECK(setupArgsAndInit(args->maxbytes, args)); - TESTCHECK(args->collTest->initData(args, 1)); - for (int iter = 0; iter < warmup_iters; iter++) { - TESTCHECK(startColl(args, 1, iter)); - } - TESTCHECK(completeColl(args)); - - // Warm-up for small size - TESTCHECK(setupArgsAndInit(args->minbytes, args)); - for (int iter = 0; iter < warmup_iters; iter++) { - TESTCHECK(startColl(args, 1, iter)); - } - TESTCHECK(completeColl(args)); - - PRINT("#\n"); - PRINT("# %10s %12s in-place out-of-place \n", "", ""); - PRINT("# %10s %12s %7s %6s %6s %6s %7s %6s %6s %6s\n", "size", "count", "time", "algbw", "busbw", "#wrong", - "time", "algbw", "busbw", "#wrong"); - PRINT("# %10s %12s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "(us)", "(GB/s)", "(GB/s)", "", - "(us)", "(GB/s)", "(GB/s)", ""); - // Benchmark - for (size_t size = args->minbytes; size <= args->maxbytes; - size = ((args->stepfactor > 1) ? size * args->stepfactor : size + args->stepbytes)) { - TESTCHECK(setupArgsAndInit(size, args)); - PRINT("%12li %12li", max(args->sendBytes, args->expectedBytes), args->nbytes / sizeof(int)); - TESTCHECK(BenchTime(args, args->in_place)); - PRINT("\n"); - } - return testSuccess; -} - -testResult_t setupMscclppConnections(int rank, int worldSize, int ranksPerNode, mscclppComm_t comm, void* dataDst, - size_t dataSize) -{ - int thisNode = rank / ranksPerNode; - int localRank = rank % ranksPerNode; - std::string ibDevStr = "mlx5_ib" + std::to_string(localRank); - - for (int r = 0; r < worldSize; ++r) { - if (r == rank) - continue; - mscclppTransport_t transportType; - const char* ibDev = ibDevStr.c_str(); - if (r / ranksPerNode == thisNode) { - ibDev = NULL; - transportType = mscclppTransportP2P; - } else { - transportType = mscclppTransportIB; - } - // Connect with all other ranks - MSCCLPPCHECK(mscclppConnect(comm, r, 0, dataDst, dataSize, transportType, ibDev)); - } - - MSCCLPPCHECK(mscclppConnectionSetup(comm)); - - return testSuccess; -} - -testResult_t runTests(struct testArgs* args) -{ - PRINT("# Setting up the connection in MSCCL++\n"); - if (mscclppTestEngine.setupMscclppConnections != nullptr) { - TESTCHECK(mscclppTestEngine.setupMscclppConnections(args)); - } else { - TESTCHECK(setupMscclppConnections(args->proc, args->totalProcs, args->nranksPerNode, args->comm, args->recvbuff, - args->maxbytes)); - } - PRINT("# Launching MSCCL++ proxy threads\n"); - MSCCLPPCHECK(mscclppProxyLaunch(args->comm)); - TESTCHECK(mscclppTestEngine.runTest(args)); - PRINT("Stopping MSCCL++ proxy threads\n"); - MSCCLPPCHECK(mscclppProxyStop(args->comm)); - if (mscclppTestEngine.teardownMscclppConnections != nullptr) { - TESTCHECK(mscclppTestEngine.teardownMscclppConnections()); - } - return testSuccess; -} - -testResult_t run(); // Main function - -int main(int argc, char* argv[]) -{ - // Make sure everyline is flushed so that we see the progress of the test - setlinebuf(stdout); - - // Parse args - double parsed; - int longindex; - static struct option longopts[] = {{"minbytes", required_argument, 0, 'b'}, - {"maxbytes", required_argument, 0, 'e'}, - {"stepbytes", required_argument, 0, 'i'}, - {"stepfactor", required_argument, 0, 'f'}, - {"iters", required_argument, 0, 'n'}, - {"warmup_iters", required_argument, 0, 'w'}, - {"check", required_argument, 0, 'c'}, - {"timeout", required_argument, 0, 'T'}, - {"cudagraph", required_argument, 0, 'G'}, - {"report_cputime", required_argument, 0, 'C'}, - {"average", required_argument, 0, 'a'}, - {"kernel_num", required_argument, 0, 'k'}, - {"help", no_argument, 0, 'h'}, - {}}; - - while (1) { - int c; - c = getopt_long(argc, argv, "b:e:i:f:n:w:c:T:G:C:a:P:k:h:", longopts, &longindex); - - if (c == -1) - break; - - switch (c) { - case 'b': - parsed = parsesize(optarg); - if (parsed < 0) { - fprintf(stderr, "invalid size specified for 'minbytes'\n"); - return -1; - } - minBytes = (size_t)parsed; - break; - case 'e': - parsed = parsesize(optarg); - if (parsed < 0) { - fprintf(stderr, "invalid size specified for 'maxbytes'\n"); - return -1; - } - maxBytes = (size_t)parsed; - break; - case 'i': - stepBytes = strtol(optarg, NULL, 0); - break; - case 'f': - stepFactor = strtol(optarg, NULL, 0); - break; - case 'n': - iters = (int)strtol(optarg, NULL, 0); - break; - case 'w': - warmup_iters = (int)strtol(optarg, NULL, 0); - break; - case 'c': - datacheck = (int)strtol(optarg, NULL, 0); - break; - case 'T': - timeout = strtol(optarg, NULL, 0); - break; - case 'G': - cudaGraphLaunches = strtol(optarg, NULL, 0); - if (cudaGraphLaunches <= 0) { - fprintf(stderr, "invalid number for 'cudaGraphLaunches'\n"); - return -1; - } - break; - case 'C': - report_cputime = strtol(optarg, NULL, 0); - break; - case 'a': - average = (int)strtol(optarg, NULL, 0); - break; - case 'k': - kernel_num = (int)strtol(optarg, NULL, 0); - break; - case 'h': - default: - if (c != 'h') - printf("invalid option '%c'\n", c); - printf("USAGE: %s \n\t" - "[-b,--minbytes ] \n\t" - "[-e,--maxbytes ] \n\t" - "[-i,--stepbytes ] \n\t" - "[-f,--stepfactor ] \n\t" - "[-n,--iters ] \n\t" - "[-w,--warmup_iters ] \n\t" - "[-c,--check <0/1>] \n\t" - "[-T,--timeout