mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-12 17:26:04 +00:00
### Summary Update the installer to place bundled default execution plans under `<MSCCLPP_CACHE_DIR>/default`, which is where the runtime already looks for bundled plans. ### Background The C++ runtime treats `MSCCLPP_CACHE_DIR` as the cache *root* and loads bundled default plans from `<cache root>/default`. When `MSCCLPP_CACHE_DIR` was set, the installer instead wrote bundled plans directly into the cache root, causing the runtime to miss them. This surfaced while running benchmarking tests with a non-default `MSCCLPP_CACHE_DIR`, where the bundled plans were not being discovered. ### Change This PR updates the installer to always install bundled default plans into `<MSCCLPP_CACHE_DIR>/default`, preserving the existing runtime contract. ### Scope - Installer-only change - No runtime behavior changes ### Validation Validated by manual inspection of the updated install path and a successful build. --------- Co-authored-by: Ekow Wellington <t-ekoww@microsoft.com>
99 lines
2.8 KiB
Python
99 lines
2.8 KiB
Python
# Copyright (c) Microsoft Corporation.
|
|
# Licensed under the MIT License.
|
|
|
|
import os
|
|
import shutil
|
|
import argparse
|
|
from pathlib import Path
|
|
|
|
from mscclpp import default_algos as def_algo
|
|
from mscclpp.language.collectives import *
|
|
from mscclpp.language.utils import AlgoSpec
|
|
|
|
def _allreduce_2nodes_config(filename, name, min_message_size, max_message_size, thread_block_group_size):
    """Build one bundled-plan config entry for the 2-node (16-rank) allreduce.

    Every entry shares the same 8-ranks-per-node, world-size-16, in-place LL
    configuration; only the output filename, plan name, message-size window,
    and thread-block-group size vary between bundled plans.
    """
    return {
        "filename": filename,
        "function": def_algo.allreduce_2nodes,
        "spec": AlgoSpec(
            name=name,
            # Fresh collective object per entry; 16 total ranks, in-place.
            collective=AllReduce(16, 1, True),
            nranks_per_node=8,
            world_size=16,
            in_place=True,
            instances=1,
            protocol="LL",
            auto_sync=False,
            num_threads_per_block=1024,
            reuse_resources=True,
            use_double_scratch_buffer=True,
            min_message_size=min_message_size,
            max_message_size=max_message_size,
            # Marks the plan as a bundled default for the runtime.
            tags={"default": 1},
        ),
        "additional_kwargs": {"thread_block_group_size": thread_block_group_size},
    }


# Bundled default plans, split by message-size range (1KB-64KB and 128KB-2MB).
default_algo_configs = [
    _allreduce_2nodes_config(
        "allreduce_2nodes_1K_64K.json",
        "allreduce_2nodes_1K_64K",
        1 << 10,
        64 << 10,
        1,
    ),
    _allreduce_2nodes_config(
        "allreduce_2nodes_128K_2M.json",
        "allreduce_2nodes_128K_2M",
        128 << 10,
        2 << 20,
        4,
    ),
]
|
|
|
|
|
|
def create_default_plans():
    """Generate the bundled default execution plans and install them as JSON.

    Plans are written under ``<MSCCLPP_CACHE_DIR>/default`` (falling back to
    ``~/.cache/mscclpp/default``), which is where the runtime looks for
    bundled default plans.  Any previously installed default plans are
    removed first so the directory exactly mirrors ``default_algo_configs``.

    Failures are best-effort: an error generating one plan is reported and
    the remaining plans are still installed.
    """
    # The runtime treats MSCCLPP_CACHE_DIR as the cache *root*; bundled
    # default plans must live in its "default" subdirectory.
    cache_root = os.environ.get("MSCCLPP_CACHE_DIR", Path.home() / ".cache/mscclpp")
    plan_dir = Path(cache_root) / "default"
    if plan_dir.exists():
        # Wipe stale plans so renamed/removed configs do not linger.
        shutil.rmtree(plan_dir)
    plan_dir.mkdir(parents=True)

    for config in default_algo_configs:
        spec = config["spec"]
        func = config["function"]
        # Empty kwargs is equivalent to calling func(spec) with no extras.
        additional_kwargs = config.get("additional_kwargs", {})
        try:
            prog = func(spec, **additional_kwargs)
            # write_text opens, writes, flushes, and closes in one step.
            (plan_dir / config["filename"]).write_text(prog.to_json(), encoding="utf-8")
        except Exception as e:
            # Best-effort: one bad plan should not block the others.
            print(f"Error creating plan for {spec.name}: {e}")
            continue
|
|
|
|
|
|
def main():
    """CLI entry point: install the bundled default plans when --install is given."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--install", action="store_true", help="flag to install default plans")
    opts = arg_parser.parse_args()
    # Without the --install flag this is a no-op.
    if not opts.install:
        return
    create_default_plans()
|
|
|
|
|
|
# Allow the module to be executed directly as the installer script.
if __name__ == "__main__":
    main()
|