Files
mscclpp/test/execution-files/allreduce_packet.json
Binyang Li be6a941fba New DSL implementation (#579)
The PR contains following changes:
Python side:
- Channel-based DSL implementation: decouple channels from chunks.
- Users create channels explicitly; only local_rank, remote_rank, and
channel_type are required.
- Adjust the executor JSON file format: add remote_buffer fields so that
different ops can use different combinations of channels and remote buffers.
- Reimplement operation fusion and the data dependency check mechanism.
- Add new ops such as semaphore and pipeline.
- Clean up code and improve documentation.
C++ side: 
- Support the new execution file JSON format.
- Support semaphore and pipeline operation
- Clean up code and support the non-zero-copy scenario.

---------

Co-authored-by: Caio Rocha <caiorocha@microsoft.com>
Co-authored-by: Changho Hwang <changhohwang@microsoft.com>
2025-08-09 00:36:20 -07:00

337 lines
7.4 KiB
JSON

{
"name": "allreduce_pkt",
"collective": "allreduce",
"protocol": "LL",
"inplace": true,
"reuse_resources": false,
"gpus": [
{
"id": 0,
"input_chunks": 4,
"output_chunks": 4,
"scratch_chunks": 4,
"threadblocks": [
{
"id": 0,
"ops": [
{
"name": "ppkt",
"src_buff": [
{
"type": "i",
"index": 2,
"size": 2
}
],
"dst_buff": [
{
"buffer_id": 0,
"index": 0,
"size": 2
}
],
"channel_type": "memory"
},
{
"name": "respkt",
"src_buff": [
{
"type": "i",
"index": 0,
"size": 1
},
{
"type": "s",
"index": 0,
"size": 1
}
],
"dst_buff": [
{
"type": "i",
"index": 0,
"size": 1
},
{
"buffer_id": 0,
"index": 2,
"size": 1
}
],
"channel_type": "memory",
"reduce_op": "sum"
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 1,
"ops": [
{
"name": "respkt",
"src_buff": [
{
"type": "i",
"index": 1,
"size": 1
},
{
"type": "s",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"type": "i",
"index": 1,
"size": 1
},
{
"buffer_id": 0,
"index": 3,
"size": 1
}
],
"channel_type": "memory",
"reduce_op": "sum"
},
{
"name": "upkt",
"src_buff": [
{
"type": "s",
"index": 2,
"size": 2
}
],
"dst_buff": [
{
"type": "i",
"index": 2,
"size": 2
}
]
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
}
],
"channels": [
{
"channel_type": "memory",
"connected_to": [
1
]
}
],
"remote_buffers": [
{
"rank": 1,
"type": "s",
"access_channel_types": [
"memory"
]
}
],
"semaphores": []
},
{
"id": 1,
"input_chunks": 4,
"output_chunks": 4,
"scratch_chunks": 4,
"threadblocks": [
{
"id": 0,
"ops": [
{
"name": "ppkt",
"src_buff": [
{
"type": "i",
"index": 0,
"size": 2
}
],
"dst_buff": [
{
"buffer_id": 0,
"index": 0,
"size": 2
}
],
"channel_type": "memory"
},
{
"name": "respkt",
"src_buff": [
{
"type": "i",
"index": 2,
"size": 1
},
{
"type": "s",
"index": 0,
"size": 1
}
],
"dst_buff": [
{
"type": "i",
"index": 2,
"size": 1
},
{
"buffer_id": 0,
"index": 2,
"size": 1
}
],
"channel_type": "memory",
"reduce_op": "sum"
},
{
"name": "upkt",
"src_buff": [
{
"type": "s",
"index": 2,
"size": 2
}
],
"dst_buff": [
{
"type": "i",
"index": 0,
"size": 2
}
]
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 1,
"ops": [
{
"name": "respkt",
"src_buff": [
{
"type": "i",
"index": 3,
"size": 1
},
{
"type": "s",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"type": "i",
"index": 3,
"size": 1
},
{
"buffer_id": 0,
"index": 3,
"size": 1
}
],
"channel_type": "memory",
"reduce_op": "sum"
}
],
"channels": [
{
"channel_type": "memory",
"channel_ids": [
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "memory",
"remote_buffer_ids": [
0
]
}
]
}
],
"channels": [
{
"channel_type": "memory",
"connected_to": [
0
]
}
],
"remote_buffers": [
{
"rank": 0,
"type": "s",
"access_channel_types": [
"memory"
]
}
],
"semaphores": []
}
],
"num_threads_per_block": 1024,
"use_double_scratch_buffer": true,
"buffer_alignment": 16,
"min_message_size": 0,
"max_message_size": 18446744073709551615
}