mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-12 01:10:22 +00:00
This PR contains the following changes:
Python side:
- Channel-based DSL implementation: decouple channels from chunks.
- Users create channels explicitly; only local_rank, remote_rank, and channel_type are needed.
- Adjust the executor JSON file format: add remote_buffer fields so that different ops can use different combinations of channels and remote buffers.
- Reimplement operation fusion and the data-dependency check mechanism.
- Add new ops such as semaphore and pipeline.
- Clean up the code and improve the documentation.
C++ side:
- Support the new execution-file JSON format.
- Support semaphore and pipeline operations.
- Clean up the code and support the non-zero-copy scenario.
---------
Co-authored-by: Caio Rocha <caiorocha@microsoft.com>
Co-authored-by: Changho Hwang <changhohwang@microsoft.com>
602 lines
20 KiB
JSON
602 lines
20 KiB
JSON
{
|
|
"name": "allreduce_nvls",
|
|
"collective": "allreduce",
|
|
"protocol": "Simple",
|
|
"inplace": true,
|
|
"reuse_resources": true,
|
|
"gpus": [
|
|
{
|
|
"id": 0,
|
|
"input_chunks": 4,
|
|
"output_chunks": 0,
|
|
"scratch_chunks": 4,
|
|
"threadblocks": [
|
|
{
|
|
"id": 0,
|
|
"ops": [
|
|
{
|
|
"name": "pipeline",
|
|
"iter_context": {"unit_size": 524288, "num_chunks": 1},
|
|
"ops": [
|
|
{"name": "sem_acquire", "semaphore_ids": [2]},
|
|
{"name": "nop"},
|
|
{
|
|
"name": "copy",
|
|
"src_buff": [ {"type": "i", "index": 0, "size": 1} ],
|
|
"dst_buff": [ {"type": "s", "index": 0, "size": 1} ]
|
|
},
|
|
{
|
|
"name": "copy",
|
|
"src_buff": [ {"type": "i", "index": 1, "size": 1} ],
|
|
"dst_buff": [ {"type": "s", "index": 1, "size": 1} ]
|
|
},
|
|
{
|
|
"name": "copy",
|
|
"src_buff": [ {"type": "i", "index": 2, "size": 1} ],
|
|
"dst_buff": [ {"type": "s", "index": 2, "size": 1} ]
|
|
},
|
|
{
|
|
"name": "copy",
|
|
"src_buff": [ {"type": "i", "index": 3, "size": 1} ],
|
|
"dst_buff": [ {"type": "s", "index": 3, "size": 1} ]
|
|
},
|
|
{"name": "nop"},
|
|
{
|
|
"name": "signal",
|
|
"channel_ids": [0, 1, 2],
|
|
"channel_type": "memory"
|
|
},
|
|
{
|
|
"name": "wait",
|
|
"channel_ids": [0, 1, 2],
|
|
"channel_type": "memory"
|
|
},
|
|
{"name": "nop"},
|
|
{"name": "sem_release", "semaphore_ids": [0]}
|
|
]
|
|
}
|
|
],
|
|
"channels": [
|
|
{ "channel_type": "memory", "channel_ids": [0, 1, 2] }
|
|
]
|
|
},
|
|
{
|
|
"id": 1,
|
|
"ops": [
|
|
{
|
|
"name": "pipeline",
|
|
"iter_context": {"unit_size": 524288, "num_chunks": 1},
|
|
"ops": [
|
|
{"name": "sem_acquire", "semaphore_ids": [0]},
|
|
{"name": "nop"},
|
|
{
|
|
"name": "glres",
|
|
"channel_ids": [0],
|
|
"src_buff": [
|
|
{"switch_channel_id": 0, "index": 0, "size": 1}
|
|
],
|
|
"dst_buff": [
|
|
{"switch_channel_id": 0, "index": 0, "size": 1}
|
|
],
|
|
"channel_type": "switch"
|
|
},
|
|
{"name": "nop"},
|
|
{"name": "sem_release", "semaphore_ids": [1]}
|
|
]
|
|
}
|
|
],
|
|
"channels": [
|
|
{ "buff": "s", "channel_type": "switch", "channel_ids": [0] }
|
|
]
|
|
},
|
|
{
|
|
"id": 2,
|
|
"ops": [
|
|
{
|
|
"name": "pipeline",
|
|
"iter_context": {"unit_size": 524288, "num_chunks": 1},
|
|
"ops": [
|
|
{"name": "sem_acquire", "semaphore_ids": [1]},
|
|
{"name": "nop"},
|
|
{
|
|
"name": "signal",
|
|
"channel_ids": [0, 1, 2],
|
|
"channel_type": "memory"
|
|
},
|
|
{
|
|
"name": "wait",
|
|
"channel_ids": [0, 1, 2],
|
|
"channel_type": "memory"
|
|
},
|
|
{"name": "nop"},
|
|
{
|
|
"name": "copy",
|
|
"src_buff": [ {"type": "s", "index": 0, "size": 1} ],
|
|
"dst_buff": [ {"type": "i", "index": 0, "size": 1} ]
|
|
},
|
|
{
|
|
"name": "copy",
|
|
"src_buff": [ {"type": "s", "index": 1, "size": 1} ],
|
|
"dst_buff": [ {"type": "i", "index": 1, "size": 1} ]
|
|
},
|
|
{
|
|
"name": "copy",
|
|
"src_buff": [ {"type": "s", "index": 2, "size": 1} ],
|
|
"dst_buff": [ {"type": "i", "index": 2, "size": 1} ]
|
|
},
|
|
{
|
|
"name": "copy",
|
|
"src_buff": [ {"type": "s", "index": 3, "size": 1} ],
|
|
"dst_buff": [ {"type": "i", "index": 3, "size": 1} ]
|
|
},
|
|
{"name": "nop"},
|
|
{"name": "sem_release", "semaphore_ids": [2]}
|
|
]
|
|
}
|
|
],
|
|
"channels": [
|
|
{ "channel_type": "memory", "channel_ids": [3, 4, 5] }
|
|
]
|
|
}
|
|
],
|
|
"channels": [
|
|
{
|
|
"buff": "s",
|
|
"channel_type": "switch",
|
|
"rank_groups": [
|
|
{ "size": 4, "ranks": [0, 1, 2, 3] }
|
|
]
|
|
},
|
|
{ "channel_type": "memory", "connected_to": [1, 2, 3, 1, 2, 3] }
|
|
],
|
|
"remote_buffers": [],
|
|
"semaphores": [ {"init_value": 0}, {"init_value": 0}, {"init_value": 32} ]
|
|
},
|
|
{
|
|
"id": 1,
|
|
"input_chunks": 4,
|
|
"output_chunks": 0,
|
|
"scratch_chunks": 4,
|
|
"threadblocks": [
|
|
{
|
|
"id": 0,
|
|
"ops": [
|
|
{
|
|
"name": "pipeline",
|
|
"iter_context": {"unit_size": 524288, "num_chunks": 1},
|
|
"ops": [
|
|
{"name": "sem_acquire", "semaphore_ids": [2]},
|
|
{"name": "nop"},
|
|
{
|
|
"name": "copy",
|
|
"src_buff": [ {"type": "i", "index": 0, "size": 1} ],
|
|
"dst_buff": [ {"type": "s", "index": 0, "size": 1} ]
|
|
},
|
|
{
|
|
"name": "copy",
|
|
"src_buff": [ {"type": "i", "index": 1, "size": 1} ],
|
|
"dst_buff": [ {"type": "s", "index": 1, "size": 1} ]
|
|
},
|
|
{
|
|
"name": "copy",
|
|
"src_buff": [ {"type": "i", "index": 2, "size": 1} ],
|
|
"dst_buff": [ {"type": "s", "index": 2, "size": 1} ]
|
|
},
|
|
{
|
|
"name": "copy",
|
|
"src_buff": [ {"type": "i", "index": 3, "size": 1} ],
|
|
"dst_buff": [ {"type": "s", "index": 3, "size": 1} ]
|
|
},
|
|
{"name": "nop"},
|
|
{
|
|
"name": "signal",
|
|
"channel_ids": [0, 1, 2],
|
|
"channel_type": "memory"
|
|
},
|
|
{
|
|
"name": "wait",
|
|
"channel_ids": [0, 1, 2],
|
|
"channel_type": "memory"
|
|
},
|
|
{"name": "nop"},
|
|
{"name": "sem_release", "semaphore_ids": [0]}
|
|
]
|
|
}
|
|
],
|
|
"channels": [
|
|
{ "channel_type": "memory", "channel_ids": [0, 1, 2] }
|
|
]
|
|
},
|
|
{
|
|
"id": 1,
|
|
"ops": [
|
|
{
|
|
"name": "pipeline",
|
|
"iter_context": {"unit_size": 524288, "num_chunks": 1},
|
|
"ops": [
|
|
{"name": "sem_acquire", "semaphore_ids": [0]},
|
|
{"name": "nop"},
|
|
{
|
|
"name": "glres",
|
|
"channel_ids": [0],
|
|
"src_buff": [
|
|
{"switch_channel_id": 0, "index": 1, "size": 1}
|
|
],
|
|
"dst_buff": [
|
|
{"switch_channel_id": 0, "index": 1, "size": 1}
|
|
],
|
|
"channel_type": "switch"
|
|
},
|
|
{"name": "nop"},
|
|
{"name": "sem_release", "semaphore_ids": [1]}
|
|
]
|
|
}
|
|
],
|
|
"channels": [
|
|
{ "buff": "s", "channel_type": "switch", "channel_ids": [0] }
|
|
]
|
|
},
|
|
{
|
|
"id": 2,
|
|
"ops": [
|
|
{
|
|
"name": "pipeline",
|
|
"iter_context": {"unit_size": 524288, "num_chunks": 1},
|
|
"ops": [
|
|
{"name": "sem_acquire", "semaphore_ids": [1]},
|
|
{"name": "nop"},
|
|
{
|
|
"name": "signal",
|
|
"channel_ids": [0, 1, 2],
|
|
"channel_type": "memory"
|
|
},
|
|
{
|
|
"name": "wait",
|
|
"channel_ids": [0, 1, 2],
|
|
"channel_type": "memory"
|
|
},
|
|
{"name": "nop"},
|
|
{
|
|
"name": "copy",
|
|
"src_buff": [ {"type": "s", "index": 0, "size": 1} ],
|
|
"dst_buff": [ {"type": "i", "index": 0, "size": 1} ]
|
|
},
|
|
{
|
|
"name": "copy",
|
|
"src_buff": [ {"type": "s", "index": 1, "size": 1} ],
|
|
"dst_buff": [ {"type": "i", "index": 1, "size": 1} ]
|
|
},
|
|
{
|
|
"name": "copy",
|
|
"src_buff": [ {"type": "s", "index": 2, "size": 1} ],
|
|
"dst_buff": [ {"type": "i", "index": 2, "size": 1} ]
|
|
},
|
|
{
|
|
"name": "copy",
|
|
"src_buff": [ {"type": "s", "index": 3, "size": 1} ],
|
|
"dst_buff": [ {"type": "i", "index": 3, "size": 1} ]
|
|
},
|
|
{"name": "nop"},
|
|
{"name": "sem_release", "semaphore_ids": [2]}
|
|
|
|
]
|
|
}
|
|
],
|
|
"channels": [
|
|
{ "channel_type": "memory", "channel_ids": [3, 4, 5] }
|
|
]
|
|
}
|
|
],
|
|
"channels": [
|
|
{
|
|
"buff": "s",
|
|
"channel_type": "switch",
|
|
"rank_groups": [
|
|
{ "size": 4, "ranks": [0, 1, 2, 3] }
|
|
]
|
|
},
|
|
{ "channel_type": "memory", "connected_to": [0, 2, 3, 0, 2, 3] }
|
|
],
|
|
"remote_buffers": [],
|
|
"semaphores": [ {"init_value": 0}, {"init_value": 0}, {"init_value": 32} ]
|
|
},
|
|
{
|
|
"id": 2,
|
|
"input_chunks": 4,
|
|
"output_chunks": 0,
|
|
"scratch_chunks": 4,
|
|
"threadblocks": [
|
|
{
|
|
"id": 0,
|
|
"ops": [
|
|
{
|
|
"name": "pipeline",
|
|
"iter_context": {"unit_size": 524288, "num_chunks": 1},
|
|
"ops": [
|
|
{"name": "sem_acquire", "semaphore_ids": [2]},
|
|
{"name": "nop"},
|
|
{
|
|
"name": "copy",
|
|
"src_buff": [ {"type": "i", "index": 0, "size": 1} ],
|
|
"dst_buff": [ {"type": "s", "index": 0, "size": 1} ]
|
|
},
|
|
{
|
|
"name": "copy",
|
|
"src_buff": [ {"type": "i", "index": 1, "size": 1} ],
|
|
"dst_buff": [ {"type": "s", "index": 1, "size": 1} ]
|
|
},
|
|
{
|
|
"name": "copy",
|
|
"src_buff": [ {"type": "i", "index": 2, "size": 1} ],
|
|
"dst_buff": [ {"type": "s", "index": 2, "size": 1} ]
|
|
},
|
|
{
|
|
"name": "copy",
|
|
"src_buff": [ {"type": "i", "index": 3, "size": 1} ],
|
|
"dst_buff": [ {"type": "s", "index": 3, "size": 1} ]
|
|
},
|
|
{"name": "nop"},
|
|
{
|
|
"name": "signal",
|
|
"channel_ids": [0, 1, 2],
|
|
"channel_type": "memory"
|
|
},
|
|
{
|
|
"name": "wait",
|
|
"channel_ids": [0, 1, 2],
|
|
"channel_type": "memory"
|
|
},
|
|
{"name": "nop"},
|
|
{"name": "sem_release", "semaphore_ids": [0]}
|
|
]
|
|
}
|
|
],
|
|
"channels": [
|
|
{ "channel_type": "memory", "channel_ids": [0, 1, 2] }
|
|
]
|
|
},
|
|
{
|
|
"id": 1,
|
|
"ops": [
|
|
{
|
|
"name": "pipeline",
|
|
"iter_context": {"unit_size": 524288, "num_chunks": 1},
|
|
"ops": [
|
|
{"name": "sem_acquire", "semaphore_ids": [0]},
|
|
{"name": "nop"},
|
|
{
|
|
"name": "glres",
|
|
"channel_ids": [0],
|
|
"src_buff": [
|
|
{"switch_channel_id": 0, "index": 2, "size": 1}
|
|
],
|
|
"dst_buff": [
|
|
{"switch_channel_id": 0, "index": 2, "size": 1}
|
|
],
|
|
"channel_type": "switch"
|
|
},
|
|
{"name": "nop"},
|
|
{"name": "sem_release", "semaphore_ids": [1]}
|
|
]
|
|
}
|
|
],
|
|
"channels": [
|
|
{ "buff": "s", "channel_type": "switch", "channel_ids": [0] }
|
|
]
|
|
},
|
|
{
|
|
"id": 2,
|
|
"ops": [
|
|
{
|
|
"name": "pipeline",
|
|
"iter_context": {"unit_size": 524288, "num_chunks": 1},
|
|
"ops": [
|
|
{"name": "sem_acquire", "semaphore_ids": [1]},
|
|
{"name": "nop"},
|
|
{
|
|
"name": "signal",
|
|
"channel_ids": [0, 1, 2],
|
|
"channel_type": "memory"
|
|
},
|
|
{
|
|
"name": "wait",
|
|
"channel_ids": [0, 1, 2],
|
|
"channel_type": "memory"
|
|
},
|
|
{"name": "nop"},
|
|
{
|
|
"name": "copy",
|
|
"src_buff": [ {"type": "s", "index": 0, "size": 1} ],
|
|
"dst_buff": [ {"type": "i", "index": 0, "size": 1} ]
|
|
},
|
|
{
|
|
"name": "copy",
|
|
"src_buff": [ {"type": "s", "index": 1, "size": 1} ],
|
|
"dst_buff": [ {"type": "i", "index": 1, "size": 1} ]
|
|
},
|
|
{
|
|
"name": "copy",
|
|
"src_buff": [ {"type": "s", "index": 2, "size": 1} ],
|
|
"dst_buff": [ {"type": "i", "index": 2, "size": 1} ]
|
|
},
|
|
{
|
|
"name": "copy",
|
|
"src_buff": [ {"type": "s", "index": 3, "size": 1} ],
|
|
"dst_buff": [ {"type": "i", "index": 3, "size": 1} ]
|
|
},
|
|
{"name": "nop"},
|
|
{"name": "sem_release", "semaphore_ids": [2]}
|
|
|
|
]
|
|
}
|
|
],
|
|
"channels": [
|
|
{ "channel_type": "memory", "channel_ids": [3, 4, 5] }
|
|
]
|
|
}
|
|
],
|
|
"channels": [
|
|
{
|
|
"buff": "s",
|
|
"channel_type": "switch",
|
|
"rank_groups": [
|
|
{ "size": 4, "ranks": [0, 1, 2, 3] }
|
|
]
|
|
},
|
|
{ "channel_type": "memory", "connected_to": [0, 1, 3, 0, 1, 3] }
|
|
],
|
|
"remote_buffers": [],
|
|
"semaphores": [ {"init_value": 0}, {"init_value": 0}, {"init_value": 32} ]
|
|
},
|
|
{
|
|
"id": 3,
|
|
"input_chunks": 4,
|
|
"output_chunks": 0,
|
|
"scratch_chunks": 4,
|
|
"threadblocks": [
|
|
{
|
|
"id": 0,
|
|
"ops": [
|
|
{
|
|
"name": "pipeline",
|
|
"iter_context": {"unit_size": 524288, "num_chunks": 1},
|
|
"ops": [
|
|
{"name": "sem_acquire", "semaphore_ids": [2]},
|
|
{"name": "nop"},
|
|
{
|
|
"name": "copy",
|
|
"src_buff": [ {"type": "i", "index": 0, "size": 1} ],
|
|
"dst_buff": [ {"type": "s", "index": 0, "size": 1} ]
|
|
},
|
|
{
|
|
"name": "copy",
|
|
"src_buff": [ {"type": "i", "index": 1, "size": 1} ],
|
|
"dst_buff": [ {"type": "s", "index": 1, "size": 1} ]
|
|
},
|
|
{
|
|
"name": "copy",
|
|
"src_buff": [ {"type": "i", "index": 2, "size": 1} ],
|
|
"dst_buff": [ {"type": "s", "index": 2, "size": 1} ]
|
|
},
|
|
{
|
|
"name": "copy",
|
|
"src_buff": [ {"type": "i", "index": 3, "size": 1} ],
|
|
"dst_buff": [ {"type": "s", "index": 3, "size": 1} ]
|
|
},
|
|
{"name": "nop"},
|
|
{
|
|
"name": "signal",
|
|
"channel_ids": [0, 1, 2],
|
|
"channel_type": "memory"
|
|
},
|
|
{
|
|
"name": "wait",
|
|
"channel_ids": [0, 1, 2],
|
|
"channel_type": "memory"
|
|
},
|
|
{"name": "nop"},
|
|
{"name": "sem_release", "semaphore_ids": [0]}
|
|
]
|
|
}
|
|
],
|
|
"channels": [
|
|
{ "channel_type": "memory", "channel_ids": [0, 1, 2] }
|
|
]
|
|
},
|
|
{
|
|
"id": 1,
|
|
"ops": [
|
|
{
|
|
"name": "pipeline",
|
|
"iter_context": {"unit_size": 524288, "num_chunks": 1},
|
|
"ops": [
|
|
{"name": "sem_acquire", "semaphore_ids": [0]},
|
|
{"name": "nop"},
|
|
{
|
|
"name": "glres",
|
|
"channel_ids": [0],
|
|
"src_buff": [
|
|
{"switch_channel_id": 0, "index": 3, "size": 1}
|
|
],
|
|
"dst_buff": [
|
|
{"switch_channel_id": 0, "index": 3, "size": 1}
|
|
],
|
|
"channel_type": "switch"
|
|
},
|
|
{"name": "nop"},
|
|
{"name": "sem_release", "semaphore_ids": [1]}
|
|
]
|
|
}
|
|
],
|
|
"channels": [
|
|
{ "buff": "s", "channel_type": "switch", "channel_ids": [0] }
|
|
]
|
|
},
|
|
{
|
|
"id": 2,
|
|
"ops": [
|
|
{
|
|
"name": "pipeline",
|
|
"iter_context": {"unit_size": 524288, "num_chunks": 1},
|
|
"ops": [
|
|
{"name": "sem_acquire", "semaphore_ids": [1]},
|
|
{"name": "nop"},
|
|
{
|
|
"name": "signal",
|
|
"channel_ids": [0, 1, 2],
|
|
"channel_type": "memory"
|
|
},
|
|
{
|
|
"name": "wait",
|
|
"channel_ids": [0, 1, 2],
|
|
"channel_type": "memory"
|
|
},
|
|
{"name": "nop"},
|
|
{
|
|
"name": "copy",
|
|
"src_buff": [ {"type": "s", "index": 0, "size": 1} ],
|
|
"dst_buff": [ {"type": "i", "index": 0, "size": 1} ]
|
|
},
|
|
{
|
|
"name": "copy",
|
|
"src_buff": [ {"type": "s", "index": 1, "size": 1} ],
|
|
"dst_buff": [ {"type": "i", "index": 1, "size": 1} ]
|
|
},
|
|
{
|
|
"name": "copy",
|
|
"src_buff": [ {"type": "s", "index": 2, "size": 1} ],
|
|
"dst_buff": [ {"type": "i", "index": 2, "size": 1} ]
|
|
},
|
|
{
|
|
"name": "copy",
|
|
"src_buff": [ {"type": "s", "index": 3, "size": 1} ],
|
|
"dst_buff": [ {"type": "i", "index": 3, "size": 1} ]
|
|
},
|
|
{"name": "nop"},
|
|
{"name": "sem_release", "semaphore_ids": [2]}
|
|
]
|
|
}
|
|
],
|
|
"channels": [
|
|
{ "channel_type": "memory", "channel_ids": [3, 4, 5] }
|
|
]
|
|
}
|
|
],
|
|
"channels": [
|
|
{
|
|
"buff": "s",
|
|
"channel_type": "switch",
|
|
"rank_groups": [
|
|
{ "size": 4, "ranks": [0, 1, 2, 3] }
|
|
]
|
|
},
|
|
{ "channel_type": "memory", "connected_to": [0, 1, 2, 0, 1, 2] }
|
|
],
|
|
"remote_buffers": [],
|
|
"semaphores": [ {"init_value": 0}, {"init_value": 0}, {"init_value": 32} ]
|
|
}
|
|
],
|
|
"num_threads_per_block": 1024,
|
|
"use_double_scratch_buffer": false
|
|
}
|