mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-13 09:46:00 +00:00
Improve all reduce performance for single node. New number: | n_ctx | size | target latency (us) | allreduce5 | allreduce6 | |---------|---------|----------------|------------|------------| | 1 | 24.0kB | 7.7 | | 7.23| | 2 | 48.0kB | 7.7 | | 7.69| | 4 | 96.0kB | 8 | | 8.34| | 8 | 192.0kB | 12.6 | | 9.75| | 12 | 288.0kB | 13 | | 11.34| | 16 | 384.0kB | 13.3 | | 12.99| | 768 | 18.0MB | 158.7 | 160.3| | | 896 | 21.0MB | 184.5 | 183.8| | | 1024 | 24.0MB | 209.5 | 207.5| | | 1152 | 27.0MB | 234.3 | 231.9| | | 1280 | 30.0MB | 260 | 255.6| | | 1408 | 33.0MB | 284.9 | 278.7| | | 1536 | 36.0MB | 310.3 | 302.0| | | 1664 | 39.0MB | 336.2 | 325.3| | | 1792 | 42.0MB | 361.4 | 348.8| | | 1920 | 45.0MB | 384.6 | 372.2| | | 2048 | 48.0MB | 409.1 | 395.4| | --------- Co-authored-by: Changho Hwang <changhohwang@microsoft.com>
20 lines
2.8 KiB
JSON
20 lines
2.8 KiB
JSON
{"name":"allgather", "kernel":1, "ranks":8, "ranksPerNode":8, "algBw":291.52, "busBw":255.08, "size":1073741824, "time":3683.13, "target":"throughput"}
|
|
{"name":"allgather", "kernel":2, "ranks":16,"ranksPerNode":8, "algBw":244.61, "busBw":229.33, "size":3221225472, "time":13168.31,"target":"throughput"}
|
|
{"name":"allgather", "kernel":3, "ranks":8, "ranksPerNode":8, "algBw":0.1112, "busBw":0.0973, "size":8192, "time":73.63, "target":"latency"}
|
|
{"name":"allreduce", "kernel":1, "ranks":8, "ranksPerNode":8, "algBw":139.41, "busBw":243.96, "size":1073741824, "time":7701.98, "target":"throughput"}
|
|
{"name":"allreduce", "kernel":2, "ranks":8, "ranksPerNode":8, "algBw":1.25, "busBw":2.19, "size":8192, "time":6.51, "target":"latency"}
|
|
{"name":"allreduce", "kernel":2, "ranks":16,"ranksPerNode":8, "algBw":0.51, "busBw":0.96, "size":8192, "time":15.96, "target":"latency"}
|
|
{"name":"allreduce", "kernel":3, "ranks":8, "ranksPerNode":8, "algBw":139.08, "busBw":243.40, "size":1073741824, "time":7719.85, "target":"throughput"}
|
|
{"name":"allreduce", "kernel":4, "ranks":8, "ranksPerNode":8, "algBw":106.98, "busBw":187.22, "size":16777216, "time":156.81, "target":"throughput"}
|
|
{"name":"allreduce", "kernel":4, "ranks":8, "ranksPerNode":8, "algBw":116.24, "busBw":203.42, "size":33554432, "time":288.65, "target":"throughput"}
|
|
{"name":"allreduce", "kernel":5, "ranks":8, "ranksPerNode":8, "algBw":126.52,"busBw":221.418,"size":50331648, "time":397.79, "target":"throughput"}
|
|
{"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":3.3919,"busBw":5.9359, "size":24576, "time":7.24, "target":"latency"}
|
|
{"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":6.21, "busBw":10.87, "size":49152, "time":7.91, "target":"latency"}
|
|
{"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":8.90, "busBw":15.57, "size":73728, "time":8.28, "target":"latency"}
|
|
{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":84.55, "busBw":158.53, "size":25165824, "time":297.64, "target":"throughput"}
|
|
{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":99.43, "busBw":186.44, "size":50331648, "time":506.16, "target":"throughput"}
|
|
{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":124.60, "busBw":233.64, "size":3221225472, "time":25850.67,"target":"throughput"}
|
|
{"name":"allreduce", "kernel":3, "ranks":16,"ranksPerNode":8, "algBw":119.5, "busBw":224.06, "size":3221225472, "time":26955.85,"target":"throughput"}
|
|
{"name":"alltoall", "kernel":0, "ranks":16,"ranksPerNode":8, "algBw":46.53, "busBw":43.63, "size":1073741824, "time":23071.5, "target":"throughput"}
|
|
{"name":"alltoall", "kernel":1, "ranks":8, "ranksPerNode":8, "algBw":276.17, "busBw":241.65, "size":1073741824, "time":3887.87, "target":"throughput"}
|