Files
mscclpp/python/mscclpp/__main__.py
Ekow Wellington fd76507e9a Install default plans under MSCCLPP_CACHE_DIR/default (#769)
### Summary
Update the installer to place bundled default execution plans under
`<MSCCLPP_CACHE_DIR>/default`, which is where the runtime already looks
for bundled plans.

### Background
The C++ runtime treats `MSCCLPP_CACHE_DIR` as the cache *root* and loads
bundled default plans from `<cache root>/default`.
When `MSCCLPP_CACHE_DIR` was set, the installer instead wrote bundled
plans
directly into the cache root, causing the runtime to miss them.

This surfaced while running benchmarking tests with a non-default
`MSCCLPP_CACHE_DIR`, where the bundled plans were not being discovered.

### Change
This PR updates the installer to always install bundled default plans
into
`<MSCCLPP_CACHE_DIR>/default`, preserving the existing runtime contract.

### Scope
- Installer-only change
- No runtime behavior changes

### Validation
Manually inspected the updated install path.
Confirmed that the build succeeds.

---------

Co-authored-by: Ekow Wellington <t-ekoww@microsoft.com>
2026-03-31 14:27:33 -05:00

99 lines
2.8 KiB
Python

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import os
import shutil
import argparse
from pathlib import Path
from mscclpp import default_algos as def_algo
from mscclpp.language.collectives import *
from mscclpp.language.utils import AlgoSpec
# Table of the bundled default execution plans. Each entry describes one plan:
#   "filename"          - name of the JSON file the plan is written to
#   "function"          - builder called with the spec (plus any extra kwargs)
#   "spec"              - AlgoSpec passed to the builder
#   "additional_kwargs" - optional extra keyword arguments for the builder
default_algo_configs = [
    # All-reduce plan for message sizes from 1 KiB to 64 KiB.
    {
        "filename": "allreduce_2nodes_1K_64K.json",
        "function": def_algo.allreduce_2nodes,
        "spec": AlgoSpec(
            name="allreduce_2nodes_1K_64K",
            collective=AllReduce(16, 1, True),
            nranks_per_node=8,
            world_size=16,
            in_place=True,
            instances=1,
            protocol="LL",
            auto_sync=False,
            num_threads_per_block=1024,
            reuse_resources=True,
            use_double_scratch_buffer=True,
            min_message_size=1 << 10,  # 1 KiB
            max_message_size=64 << 10,  # 64 KiB
            tags={"default": 1},
        ),
        "additional_kwargs": {"thread_block_group_size": 1},
    },
    # All-reduce plan for message sizes from 128 KiB to 2 MiB.
    {
        "filename": "allreduce_2nodes_128K_2M.json",
        "function": def_algo.allreduce_2nodes,
        "spec": AlgoSpec(
            name="allreduce_2nodes_128K_2M",
            collective=AllReduce(16, 1, True),
            nranks_per_node=8,
            world_size=16,
            in_place=True,
            instances=1,
            protocol="LL",
            auto_sync=False,
            num_threads_per_block=1024,
            reuse_resources=True,
            use_double_scratch_buffer=True,
            min_message_size=128 << 10,  # 128 KiB
            max_message_size=2 << 20,  # 2 MiB
            tags={"default": 1},
        ),
        "additional_kwargs": {"thread_block_group_size": 4},
    },
]
def create_default_plans():
    """Regenerate the bundled default execution plans on disk.

    Plans are written to ``<cache root>/default``, where the cache root is the
    ``MSCCLPP_CACHE_DIR`` environment variable when it is set and
    ``~/.cache/mscclpp`` otherwise. An existing ``default`` directory is
    removed first, so plans from a previous install never survive.

    Generation is best-effort: if building or serializing one plan fails, the
    error is printed and the remaining plans are still produced. Each plan is
    fully serialized before its file is opened, so a failure during generation
    does not leave an empty JSON file behind.
    """
    cache_root = os.environ.get("MSCCLPP_CACHE_DIR", Path.home() / ".cache/mscclpp")
    plan_dir = Path(cache_root) / "default"

    # Start from a clean directory so removed or renamed plans do not linger.
    if plan_dir.exists():
        shutil.rmtree(plan_dir)
    plan_dir.mkdir(parents=True)

    for config in default_algo_configs:
        filename = config["filename"]
        func = config["function"]
        spec = config["spec"]
        # A missing, empty or None "additional_kwargs" means "no extra arguments".
        additional_kwargs = config.get("additional_kwargs") or {}
        try:
            prog = func(spec, **additional_kwargs)
            # Serialize before touching the filesystem: opening the file first
            # would create it even when to_json() raises.
            plan_json = prog.to_json()
            (plan_dir / filename).write_text(plan_json, encoding="utf-8")
        except Exception as e:
            # Deliberately broad: one broken plan must not block the others.
            print(f"Error creating plan for {spec.name}: {e}")
def main():
    """Parse the command line and install the default plans when requested.

    Only ``--install`` is recognised; without it the command does nothing.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--install", action="store_true", help="flag to install default plans")
    options = cli.parse_args()

    if not options.install:
        return
    create_default_plans()
# Run the CLI only when executed as a script (e.g. `python -m mscclpp`),
# not when this module is imported.
if __name__ == "__main__":
    main()