mirror of
https://github.com/huchenlei/Depth-Anything.git
synced 2026-01-26 15:29:46 +00:00
124 lines
3.2 KiB
Python
124 lines
3.2 KiB
Python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
|
|
|
|
import argparse
|
|
import logging
|
|
import os
|
|
from pathlib import Path
|
|
from typing import List, Optional
|
|
|
|
import submitit
|
|
|
|
from dinov2.utils.cluster import (
|
|
get_slurm_executor_parameters,
|
|
get_slurm_partition,
|
|
get_user_checkpoint_path,
|
|
)
|
|
|
|
|
|
logger = logging.getLogger("dinov2")
|
|
|
|
|
|
def get_args_parser(
    description: Optional[str] = None,
    parents: Optional[List[argparse.ArgumentParser]] = None,
    add_help: bool = True,
) -> argparse.ArgumentParser:
    """Create the argument parser carrying the job-submission options.

    Args:
        description: Text forwarded to ``argparse.ArgumentParser``.
        parents: Parsers whose arguments are inherited; ``None`` (or an
            empty list) means no parent parsers.
        add_help: Whether the parser registers the ``-h/--help`` option.

    Returns:
        A parser exposing ``--ngpus``, ``--nodes``, ``--timeout``,
        ``--partition``, ``--use-volta32``, ``--comment`` and ``--exclude``.
        The default of ``--partition`` is whatever ``get_slurm_partition()``
        returns at the time this function is called.
    """
    parent_parsers = parents if parents else []
    default_partition = get_slurm_partition()
    parser = argparse.ArgumentParser(
        description=description,
        parents=parent_parsers,
        add_help=add_help,
    )

    # (option strings, add_argument keyword arguments), registered in order
    # so the generated help text keeps the same layout.
    option_table = (
        (
            ("--ngpus", "--gpus", "--gpus-per-node"),
            {"default": 8, "type": int, "help": "Number of GPUs to request on each node"},
        ),
        (
            ("--nodes", "--nnodes"),
            {"default": 2, "type": int, "help": "Number of nodes to request"},
        ),
        (
            ("--timeout",),
            {"default": 2800, "type": int, "help": "Duration of the job"},
        ),
        (
            ("--partition",),
            {"default": default_partition, "type": str, "help": "Partition where to submit"},
        ),
        (
            ("--use-volta32",),
            {"action": "store_true", "help": "Request V100-32GB GPUs"},
        ),
        (
            ("--comment",),
            {"default": "", "type": str, "help": "Comment to pass to scheduler, e.g. priority message"},
        ),
        (
            ("--exclude",),
            {"default": "", "type": str, "help": "Nodes to exclude"},
        ),
    )
    for option_strings, options in option_table:
        parser.add_argument(*option_strings, **options)

    return parser
|
|
|
|
|
|
def get_shared_folder() -> Path:
    """Return the ``experiments`` directory under the user checkpoint path.

    The directory itself is created when missing; its parents are not.

    Returns:
        ``get_user_checkpoint_path() / "experiments"``.

    Raises:
        RuntimeError: If ``get_user_checkpoint_path()`` returns ``None``.
    """
    checkpoint_root = get_user_checkpoint_path()
    if checkpoint_root is not None:
        experiments_dir = checkpoint_root / "experiments"
        experiments_dir.mkdir(exist_ok=True)
        return experiments_dir
    raise RuntimeError("Path to user checkpoint cannot be determined")
|
|
|
|
|
|
def submit_jobs(task_class, args, name: str):
    """Submit ``task_class(args)`` as a job through a submitit executor.

    Args:
        task_class: Callable invoked as ``task_class(args)`` to build the
            object handed to ``executor.submit``.
        args: Parsed options. Reads ``output_dir``, ``use_volta32``,
            ``comment``, ``exclude``, ``nodes``, ``ngpus``, ``timeout`` and
            ``partition``. ``output_dir`` is overwritten in place when it is
            empty.
        name: Job name forwarded to ``executor.update_parameters``.
    """
    # Fall back to "<shared folder>/%j" when no output directory was given;
    # the "%j" placeholder is substituted with the job id for logging below.
    if not args.output_dir:
        args.output_dir = str(get_shared_folder() / "%j")
    output_dir = args.output_dir

    Path(output_dir).mkdir(parents=True, exist_ok=True)
    executor = submitit.AutoExecutor(folder=output_dir, slurm_max_num_timeout=30)

    # Optional scheduler settings: only the ones with a truthy value are sent.
    optional_settings = {
        "slurm_constraint": "volta32gb" if args.use_volta32 else None,
        "slurm_comment": args.comment,
        "slurm_exclude": args.exclude,
    }
    kwargs = {key: value for key, value in optional_settings.items() if value}

    executor_params = get_slurm_executor_parameters(
        nodes=args.nodes,
        num_gpus_per_node=args.ngpus,
        timeout_min=args.timeout,  # max is 60 * 72
        slurm_signal_delay_s=120,
        slurm_partition=args.partition,
        **kwargs,
    )
    executor.update_parameters(name=name, **executor_params)

    job = executor.submit(task_class(args))

    logger.info(f"Submitted job_id: {job.job_id}")
    absolute_output_dir = os.path.abspath(output_dir)
    str_output_dir = absolute_output_dir.replace("%j", str(job.job_id))
    logger.info(f"Logs and checkpoints will be saved at: {str_output_dir}")
|