Files
mscclpp/test/deploy/perf_ndmv4.jsonl
Binyang2014 952f2da9cc Improve single node allreduce performance (#169)
Improve allreduce performance on a single node.
New numbers:
|   n_ctx | size    |  target latency (us) | allreduce5 | allreduce6 |
|---------|---------|----------------|------------|------------|
|       1 | 24.0kB  |            7.7 |            |        7.23|
|       2 | 48.0kB  |            7.7 |            |        7.69|
|       4 | 96.0kB  |            8   |            |        8.34|
|       8 | 192.0kB |           12.6 |            |        9.75|
|      12 | 288.0kB |           13   |            |       11.34|
|      16 | 384.0kB |           13.3 |            |       12.99|
|     768 | 18.0MB  |          158.7 |       160.3|            |
|     896 | 21.0MB  |          184.5 |       183.8|            |
|    1024 | 24.0MB  |          209.5 |       207.5|            |
|    1152 | 27.0MB  |          234.3 |       231.9|            |
|    1280 | 30.0MB  |          260   |       255.6|            |
|    1408 | 33.0MB  |          284.9 |       278.7|            |
|    1536 | 36.0MB  |          310.3 |       302.0|            |
|    1664 | 39.0MB  |          336.2 |       325.3|            |
|    1792 | 42.0MB  |          361.4 |       348.8|            |
|    1920 | 45.0MB  |          384.6 |       372.2|            |
|    2048 | 48.0MB  |          409.1 |       395.4|            |

---------

Co-authored-by: Changho Hwang <changhohwang@microsoft.com>
2023-09-13 14:30:08 +00:00

20 lines
2.8 KiB
JSON

{"name":"allgather", "kernel":1, "ranks":8, "ranksPerNode":8, "algBw":291.52, "busBw":255.08, "size":1073741824, "time":3683.13, "target":"throughput"}
{"name":"allgather", "kernel":2, "ranks":16,"ranksPerNode":8, "algBw":244.61, "busBw":229.33, "size":3221225472, "time":13168.31,"target":"throughput"}
{"name":"allgather", "kernel":3, "ranks":8, "ranksPerNode":8, "algBw":0.1112, "busBw":0.0973, "size":8192, "time":73.63, "target":"latency"}
{"name":"allreduce", "kernel":1, "ranks":8, "ranksPerNode":8, "algBw":139.41, "busBw":243.96, "size":1073741824, "time":7701.98, "target":"throughput"}
{"name":"allreduce", "kernel":2, "ranks":8, "ranksPerNode":8, "algBw":1.25, "busBw":2.19, "size":8192, "time":6.51, "target":"latency"}
{"name":"allreduce", "kernel":2, "ranks":16,"ranksPerNode":8, "algBw":0.51, "busBw":0.96, "size":8192, "time":15.96, "target":"latency"}
{"name":"allreduce", "kernel":3, "ranks":8, "ranksPerNode":8, "algBw":139.08, "busBw":243.40, "size":1073741824, "time":7719.85, "target":"throughput"}
{"name":"allreduce", "kernel":4, "ranks":8, "ranksPerNode":8, "algBw":106.98, "busBw":187.22, "size":16777216, "time":156.81, "target":"throughput"}
{"name":"allreduce", "kernel":4, "ranks":8, "ranksPerNode":8, "algBw":116.24, "busBw":203.42, "size":33554432, "time":288.65, "target":"throughput"}
{"name":"allreduce", "kernel":5, "ranks":8, "ranksPerNode":8, "algBw":126.52,"busBw":221.418,"size":50331648, "time":397.79, "target":"throughput"}
{"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":3.3919,"busBw":5.9359, "size":24576, "time":7.24, "target":"latency"}
{"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":6.21, "busBw":10.87, "size":49152, "time":7.91, "target":"latency"}
{"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":8.90, "busBw":15.57, "size":73728, "time":8.28, "target":"latency"}
{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":84.55, "busBw":158.53, "size":25165824, "time":297.64, "target":"throughput"}
{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":99.43, "busBw":186.44, "size":50331648, "time":506.16, "target":"throughput"}
{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":124.60, "busBw":233.64, "size":3221225472, "time":25850.67,"target":"throughput"}
{"name":"allreduce", "kernel":3, "ranks":16,"ranksPerNode":8, "algBw":119.5, "busBw":224.06, "size":3221225472, "time":26955.85,"target":"throughput"}
{"name":"alltoall", "kernel":0, "ranks":16,"ranksPerNode":8, "algBw":46.53, "busBw":43.63, "size":1073741824, "time":23071.5, "target":"throughput"}
{"name":"alltoall", "kernel":1, "ranks":8, "ranksPerNode":8, "algBw":276.17, "busBw":241.65, "size":1073741824, "time":3887.87, "target":"throughput"}