mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-05 22:22:49 +00:00
Support uint8 data type for Allreduce (#736)
Support uint8 data type for Allreduce.

Current limitation: uint8 is not supported for NVLS.

Performance results with RCCL-tests with MSCCLPP on MI300X:

type = half

| size (B) | count (elements) | type | redop | root | out-of-place time (us) | out-of-place algbw (GB/s) | out-of-place busbw (GB/s) | out-of-place #wrong | in-place time (us) | in-place algbw (GB/s) | in-place busbw (GB/s) | in-place #wrong |
| -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- |
| 1024 | 512 | half | sum | -1 | 5.39 | 0.19 | 0.33 | 0 | 5.45 | 0.19 | 0.33 | 0 |
| 2048 | 1024 | half | sum | -1 | 5.53 | 0.37 | 0.65 | 0 | 5.63 | 0.36 | 0.64 | 0 |
| 4096 | 2048 | half | sum | -1 | 5.55 | 0.74 | 1.29 | 0 | 5.56 | 0.74 | 1.29 | 0 |
| 8192 | 4096 | half | sum | -1 | 5.8 | 1.41 | 2.47 | 0 | 5.84 | 1.4 | 2.46 | 0 |
| 16384 | 8192 | half | sum | -1 | 6.57 | 2.49 | 4.36 | 0 | 6.56 | 2.5 | 4.37 | 0 |
| 32768 | 16384 | half | sum | -1 | 8.02 | 4.09 | 7.15 | 0 | 8.06 | 4.07 | 7.11 | 0 |
| 65536 | 32768 | half | sum | -1 | 8.77 | 7.47 | 13.07 | 0 | 8.82 | 7.43 | 13 | 0 |
| 131072 | 65536 | half | sum | -1 | 9.61 | 13.64 | 23.87 | 0 | 9.78 | 13.4 | 23.45 | 0 |
| 262144 | 131072 | half | sum | -1 | 11.68 | 22.44 | 39.27 | 0 | 12.1 | 21.67 | 37.93 | 0 |
| 524288 | 262144 | half | sum | -1 | 13.77 | 38.08 | 66.64 | 0 | 13.87 | 37.79 | 66.13 | 0 |
| 1048576 | 524288 | half | sum | -1 | 19.11 | 54.87 | 96.03 | 0 | 19.27 | 54.42 | 95.24 | 0 |
| 2097152 | 1048576 | half | sum | -1 | 24.1 | 87 | 152.26 | 0 | 24.24 | 86.52 | 151.41 | 0 |
| 4194304 | 2097152 | half | sum | -1 | 37.16 | 112.87 | 197.52 | 0 | 37.44 | 112.03 | 196.06 | 0 |
| 8388608 | 4194304 | half | sum | -1 | 61.53 | 136.33 | 238.58 | 0 | 61.68 | 135.99 | 237.99 | 0 |
| 16777216 | 8388608 | half | sum | -1 | 108.8 | 154.22 | 269.88 | 0 | 109.2 | 153.6 | 268.79 | 0 |
| 33554432 | 16777216 | half | sum | -1 | 197.8 | 169.68 | 296.94 | 0 | 198.6 | 168.92 | 295.61 | 0 |
| 67108864 | 33554432 | half | sum | -1 | 384.6 | 174.51 | 305.39 | 0 | 385.1 | 174.27 | 304.98 | 0 |
| 134217728 | 67108864 | half | sum | -1 | 754.1 | 177.99 | 311.48 | 0 | 754.9 | 177.78 | 311.12 | 0 |
| 268435456 | 134217728 | half | sum | -1 | 1491.8 | 179.94 | 314.89 | 0 | 1493.2 | 179.77 | 314.6 | 0 |
| 536870912 | 268435456 | half | sum | -1 | 2979.6 | 180.18 | 315.31 | 0 | 2983.9 | 179.92 | 314.87 | 0 |

type = fp8_e4m3

| size (B) | count (elements) | type | redop | root | out-of-place time (us) | out-of-place algbw (GB/s) | out-of-place busbw (GB/s) | out-of-place #wrong | in-place time (us) | in-place algbw (GB/s) | in-place busbw (GB/s) | in-place #wrong |
| -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- |
| 1024 | 1024 | fp8_e4m3 | sum | -1 | 5.4 | 0.19 | 0.33 | 0 | 5.45 | 0.19 | 0.33 | 0 |
| 2048 | 2048 | fp8_e4m3 | sum | -1 | 5.5 | 0.37 | 0.65 | 0 | 5.6 | 0.37 | 0.64 | 0 |
| 4096 | 4096 | fp8_e4m3 | sum | -1 | 5.61 | 0.73 | 1.28 | 0 | 5.68 | 0.72 | 1.26 | 0 |
| 8192 | 8192 | fp8_e4m3 | sum | -1 | 5.96 | 1.38 | 2.41 | 0 | 5.98 | 1.37 | 2.4 | 0 |
| 16384 | 16384 | fp8_e4m3 | sum | -1 | 6.49 | 2.52 | 4.42 | 0 | 6.58 | 2.49 | 4.36 | 0 |
| 32768 | 32768 | fp8_e4m3 | sum | -1 | 8.09 | 4.05 | 7.09 | 0 | 8.15 | 4.02 | 7.03 | 0 |
| 65536 | 65536 | fp8_e4m3 | sum | -1 | 8.58 | 7.64 | 13.37 | 0 | 8.7 | 7.53 | 13.18 | 0 |
| 131072 | 131072 | fp8_e4m3 | sum | -1 | 9.44 | 13.88 | 24.29 | 0 | 9.62 | 13.63 | 23.85 | 0 |
| 262144 | 262144 | fp8_e4m3 | sum | -1 | 10.12 | 25.9 | 45.32 | 0 | 10.37 | 25.27 | 44.22 | 0 |
| 524288 | 524288 | fp8_e4m3 | sum | -1 | 13.73 | 38.19 | 66.82 | 0 | 13.89 | 37.74 | 66.04 | 0 |
| 1048576 | 1048576 | fp8_e4m3 | sum | -1 | 18.66 | 56.2 | 98.34 | 0 | 18.92 | 55.41 | 96.97 | 0 |
| 2097152 | 2097152 | fp8_e4m3 | sum | -1 | 24.54 | 85.46 | 149.56 | 0 | 24.63 | 85.16 | 149.03 | 0 |
| 4194304 | 4194304 | fp8_e4m3 | sum | -1 | 37.79 | 110.98 | 194.21 | 0 | 38.05 | 110.22 | 192.88 | 0 |
| 8388608 | 8388608 | fp8_e4m3 | sum | -1 | 62.22 | 134.82 | 235.94 | 0 | 62.63 | 133.94 | 234.4 | 0 |
| 16777216 | 16777216 | fp8_e4m3 | sum | -1 | 109.9 | 152.62 | 267.09 | 0 | 110.4 | 151.9 | 265.83 | 0 |
| 33554432 | 33554432 | fp8_e4m3 | sum | -1 | 201.1 | 166.82 | 291.94 | 0 | 202.3 | 165.84 | 290.22 | 0 |
| 67108864 | 67108864 | fp8_e4m3 | sum | -1 | 390 | 172.06 | 301.11 | 0 | 390.2 | 171.99 | 300.99 | 0 |
| 134217728 | 134217728 | fp8_e4m3 | sum | -1 | 763.9 | 175.7 | 307.47 | 0 | 764.2 | 175.62 | 307.34 | 0 |
| 268435456 | 268435456 | fp8_e4m3 | sum | -1 | 1509.5 | 177.83 | 311.2 | 0 | 1510.1 | 177.76 | 311.08 | 0 |
| 536870912 | 536870912 | fp8_e4m3 | sum | -1 | 3010.2 | 178.35 | 312.11 | 0 | 3014.2 | 178.11 | 311.7 | 0 |

type = fp8_e5m2

| size (B) | count (elements) | type | redop | root | out-of-place time (us) | out-of-place algbw (GB/s) | out-of-place busbw (GB/s) | out-of-place #wrong | in-place time (us) | in-place algbw (GB/s) | in-place busbw (GB/s) | in-place #wrong |
| -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- |
| 1024 | 1024 | fp8_e5m2 | sum | -1 | 5.41 | 0.19 | 0.33 | 0 | 5.44 | 0.19 | 0.33 | 0 |
| 2048 | 2048 | fp8_e5m2 | sum | -1 | 5.5 | 0.37 | 0.65 | 0 | 5.67 | 0.36 | 0.63 | 0 |
| 4096 | 4096 | fp8_e5m2 | sum | -1 | 5.61 | 0.73 | 1.28 | 0 | 5.69 | 0.72 | 1.26 | 0 |
| 8192 | 8192 | fp8_e5m2 | sum | -1 | 5.96 | 1.37 | 2.4 | 0 | 6 | 1.36 | 2.39 | 0 |
| 16384 | 16384 | fp8_e5m2 | sum | -1 | 6.63 | 2.47 | 4.32 | 0 | 6.59 | 2.49 | 4.35 | 0 |
| 32768 | 32768 | fp8_e5m2 | sum | -1 | 8.07 | 4.06 | 7.1 | 0 | 8.16 | 4.02 | 7.03 | 0 |
| 65536 | 65536 | fp8_e5m2 | sum | -1 | 8.62 | 7.61 | 13.31 | 0 | 8.73 | 7.51 | 13.14 | 0 |
| 131072 | 131072 | fp8_e5m2 | sum | -1 | 9.43 | 13.9 | 24.33 | 0 | 9.6 | 13.66 | 23.9 | 0 |
| 262144 | 262144 | fp8_e5m2 | sum | -1 | 10.11 | 25.94 | 45.39 | 0 | 10.38 | 25.26 | 44.21 | 0 |
| 524288 | 524288 | fp8_e5m2 | sum | -1 | 13.73 | 38.19 | 66.84 | 0 | 13.87 | 37.79 | 66.13 | 0 |
| 1048576 | 1048576 | fp8_e5m2 | sum | -1 | 18.65 | 56.22 | 98.39 | 0 | 18.93 | 55.38 | 96.92 | 0 |
| 2097152 | 2097152 | fp8_e5m2 | sum | -1 | 24.54 | 85.47 | 149.57 | 0 | 24.63 | 85.16 | 149.03 | 0 |
| 4194304 | 4194304 | fp8_e5m2 | sum | -1 | 37.84 | 110.83 | 193.96 | 0 | 38.01 | 110.36 | 193.12 | 0 |
| 8388608 | 8388608 | fp8_e5m2 | sum | -1 | 62.32 | 134.61 | 235.58 | 0 | 62.55 | 134.12 | 234.71 | 0 |
| 16777216 | 16777216 | fp8_e5m2 | sum | -1 | 110 | 152.58 | 267.01 | 0 | 110.3 | 152.12 | 266.21 | 0 |
| 33554432 | 33554432 | fp8_e5m2 | sum | -1 | 201.1 | 166.9 | 292.07 | 0 | 201.8 | 166.26 | 290.96 | 0 |
| 67108864 | 67108864 | fp8_e5m2 | sum | -1 | 390 | 172.07 | 301.12 | 0 | 390.5 | 171.87 | 300.78 | 0 |
| 134217728 | 134217728 | fp8_e5m2 | sum | -1 | 763.9 | 175.69 | 307.46 | 0 | 764.5 | 175.56 | 307.23 | 0 |
| 268435456 | 268435456 | fp8_e5m2 | sum | -1 | 1509.4 | 177.84 | 311.22 | 0 | 1509.8 | 177.8 | 311.14 | 0 |
| 536870912 | 536870912 | fp8_e5m2 | sum | -1 | 3013 | 178.18 | 311.82 | 0 | 3018 | 177.89 | 311.31 | 0 |

type = uint8

| size (B) | count (elements) | type | redop | root | out-of-place time (us) | out-of-place algbw (GB/s) | out-of-place busbw (GB/s) | out-of-place #wrong | in-place time (us) | in-place algbw (GB/s) | in-place busbw (GB/s) | in-place #wrong |
| -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- |
| 1024 | 1024 | uint8 | sum | -1 | 5.46 | 0.19 | 0.33 | 0 | 5.46 | 0.19 | 0.33 | 0 |
| 2048 | 2048 | uint8 | sum | -1 | 5.54 | 0.37 | 0.65 | 0 | 5.63 | 0.36 | 0.64 | 0 |
| 4096 | 4096 | uint8 | sum | -1 | 5.61 | 0.73 | 1.28 | 0 | 5.63 | 0.73 | 1.27 | 0 |
| 8192 | 8192 | uint8 | sum | -1 | 5.9 | 1.39 | 2.43 | 0 | 5.9 | 1.39 | 2.43 | 0 |
| 16384 | 16384 | uint8 | sum | -1 | 6.6 | 2.48 | 4.35 | 0 | 6.64 | 2.47 | 4.32 | 0 |
| 32768 | 32768 | uint8 | sum | -1 | 8.99 | 3.65 | 6.38 | 0 | 8.99 | 3.64 | 6.38 | 0 |
| 65536 | 65536 | uint8 | sum | -1 | 9.44 | 6.94 | 12.15 | 0 | 9.58 | 6.84 | 11.98 | 0 |
| 131072 | 131072 | uint8 | sum | -1 | 11.72 | 11.18 | 19.57 | 0 | 11.83 | 11.08 | 19.4 | 0 |
| 262144 | 262144 | uint8 | sum | -1 | 12.29 | 21.32 | 37.31 | 0 | 12.45 | 21.05 | 36.84 | 0 |
| 524288 | 524288 | uint8 | sum | -1 | 13.87 | 37.8 | 66.15 | 0 | 13.93 | 37.64 | 65.88 | 0 |
| 1048576 | 1048576 | uint8 | sum | -1 | 19.11 | 54.88 | 96.04 | 0 | 19.3 | 54.33 | 95.08 | 0 |
| 2097152 | 2097152 | uint8 | sum | -1 | 24.38 | 86.01 | 150.51 | 0 | 24.52 | 85.53 | 149.67 | 0 |
| 4194304 | 4194304 | uint8 | sum | -1 | 37.52 | 111.78 | 195.61 | 0 | 37.76 | 111.08 | 194.39 | 0 |
| 8388608 | 8388608 | uint8 | sum | -1 | 62.4 | 134.44 | 235.26 | 0 | 62.56 | 134.1 | 234.67 | 0 |
| 16777216 | 16777216 | uint8 | sum | -1 | 110.2 | 152.22 | 266.39 | 0 | 110.3 | 152.04 | 266.08 | 0 |
| 33554432 | 33554432 | uint8 | sum | -1 | 199.8 | 167.94 | 293.9 | 0 | 197.5 | 169.88 | 297.29 | 0 |
| 67108864 | 67108864 | uint8 | sum | -1 | 386.3 | 173.73 | 304.03 | 0 | 378.4 | 177.37 | 310.39 | 0 |
| 134217728 | 134217728 | uint8 | sum | -1 | 758 | 177.07 | 309.87 | 0 | 741.1 | 181.12 | 316.95 | 0 |
| 268435456 | 268435456 | uint8 | sum | -1 | 1500.1 | 178.95 | 313.16 | 0 | 1466.2 | 183.09 | 320.4 | 0 |
| 536870912 | 536870912 | uint8 | sum | -1 | 2991.7 | 179.45 | 314.04 | 0 | 2924.8 | 183.56 | 321.23 | 0 |

---------

Co-authored-by: Qinghua Zhou <qinghuahzhou@microsoft.com>
This commit is contained in:
@@ -32,6 +32,17 @@ void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, vo
|
||||
NpKit::GetGpuEventCollectContexts(), NpKit::GetCpuTimestamp());
|
||||
#else
|
||||
);
|
||||
#endif
|
||||
break;
|
||||
case DataType::UINT8:
|
||||
executionKernel<uint8_t, PacketType, ReuseScratch><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(
|
||||
rank, (uint8_t*)src, (uint8_t*)dst, (uint8_t*)scratch, scratchOffset, scratchChunkSize, plan, semaphores,
|
||||
localMemoryIdBegin, flag
|
||||
#if defined(ENABLE_NPKIT)
|
||||
,
|
||||
NpKit::GetGpuEventCollectContexts(), NpKit::GetCpuTimestamp());
|
||||
#else
|
||||
);
|
||||
#endif
|
||||
break;
|
||||
case DataType::FLOAT16:
|
||||
|
||||
Reference in New Issue
Block a user