Files
mscclpp/test/deploy/run-remote.sh
Binyang Li be9126ca1b Fix run-remote.sh to support multi-command scripts (#770)
## Summary
- Fix `run-remote.sh` to correctly execute multi-command scripts (e.g.,
multiple `mpirun` calls)
- The old approach piped decoded script through `base64 -d | bash`,
which feeds the script via bash's **stdin**. When `mpirun` (or its child
processes) runs, it can consume the remaining stdin, causing bash to
never see subsequent commands — only the first command would execute.
- The fix decodes the script to a **temp file** and runs `bash -euxo
pipefail "$TMP"` instead, so bash reads commands from the file and
`mpirun` consuming stdin has no effect.
- Applied to both the docker path (pssh + docker exec) and the
non-docker path (pssh only).


🤖 Generated with [Claude Code](https://claude.com/claude-code)
2026-04-01 16:25:19 -07:00

111 lines
2.8 KiB
Bash
Executable File

#!/bin/bash
# Run a command on remote CI VMs via parallel-ssh.
# By default, runs inside the mscclpp-test docker container.
#
# Usage:
# run-remote.sh [OPTIONS] < <command_script>
#
# Options:
#   --no-docker        Run command directly on the host, not inside docker
#   --no-log           Don't tail the log file in the background
#   --hostfile <path>  Override hostfile path (default: test/deploy/hostfile_ci)
#   --host <name>      Run command on a single host (uses parallel-ssh -H)
#   --user <name>      SSH user when using --host or custom hostfile
# Exit as soon as a command fails (subject to the usual `set -e` exceptions).
set -e
# Absolute directory containing this script; used to locate the default hostfile.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Default parallel-ssh hostfile; replaced by --hostfile.
HOSTFILE="${SCRIPT_DIR}/hostfile_ci"
# SSH option handed to parallel-ssh via -O; turns off strict host-key checking.
SSH_OPTION="StrictHostKeyChecking=no"
# SSH private key path, taken from the SSHKEYFILE_SECUREFILEPATH environment
# variable (empty when that variable is unset).
KeyFilePath="${SSHKEYFILE_SECUREFILEPATH}"
# true: run inside the mscclpp-test container; set to false by --no-docker.
USE_DOCKER=true
# Set to false by --no-log.
# NOTE(review): nothing in the visible part of this script reads USE_LOG —
# confirm whether the flag is still meant to have an effect.
USE_LOG=true
# Single target host from --host; when empty, the hostfile is used instead.
TARGET_HOST=""
# SSH login name from --user; when empty, no -l option is passed.
REMOTE_USER=""
# Write the one-line usage synopsis to stderr.
usage() {
  printf '%s\n' "Usage: $0 [--no-docker] [--no-log] [--hostfile <path>] [--host <name>] [--user <name>] < <command_script>" >&2
}
# Ensure an option that takes a value actually received one.
# Arguments: $1 - option name (used in the error message)
#            $2 - the value supplied for that option (may be empty)
# On an empty value, prints "Missing value for <option>" to stderr and
# exits the script with status 1.
require_value() {
  local flag="$1" given="$2"
  [ -n "$given" ] && return 0
  echo "Missing value for ${flag}" >&2
  exit 1
}
# Consume leading "--" options. Parsing stops at the first argument that does
# not start with "--" (or when the arguments run out).
while :; do
  case "${1-}" in
    --no-docker)
      USE_DOCKER=false
      shift
      ;;
    --no-log)
      USE_LOG=false
      shift
      ;;
    --hostfile | --host | --user)
      # These three options take a mandatory value in the next argument.
      require_value "$1" "${2-}"
      case "$1" in
        --hostfile) HOSTFILE="$2" ;;
        --host) TARGET_HOST="$2" ;;
        --user) REMOTE_USER="$2" ;;
      esac
      shift 2
      ;;
    --*)
      echo "Unknown option: $1" >&2
      exit 1
      ;;
    *)
      break
      ;;
  esac
done
# Leftover positional arguments or a terminal on stdin are usage errors: the
# command script must arrive on redirected or piped stdin.
if (( $# != 0 )) || [[ -t 0 ]]; then
  usage
  exit 1
fi

# Read the whole command script; an empty script is also a usage error.
CMD="$(cat)"
[[ -n "$CMD" ]] || {
  usage
  exit 1
}
# Encode the command script as single-line base64 so it can be embedded in the
# remote command line inside single quotes (the base64 alphabet has no quotes).
CMD_B64=$(printf '%s' "$CMD" | base64 | tr -d '\n')

#######################################
# Print the command line that parallel-ssh should run on every host.
#
# The generated command decodes the script into a temp file and runs it with
# `bash -euxo pipefail <file>`, so the script is read from the file and not
# from bash's stdin. The temp file is removed by an EXIT trap, so it is
# cleaned up even when the script fails and `set -e` aborts the wrapper.
#
# Arguments: $1 - "true" to run inside the mscclpp-test docker container,
#                 anything else to run directly on the host
#            $2 - base64-encoded script (single line)
# Outputs:   the remote command line on stdout (no trailing newline)
#######################################
build_remote_command() {
  local use_docker="$1"
  local cmd_b64="$2"
  local inner

  if [ "$use_docker" = true ]; then
    # This text is parsed twice on the remote side: first by the login shell
    # (inside the double quotes of `bash -c "..."`), then by the bash inside
    # the container. Hence the \\\$ and \\\" escapes, which reach the
    # container's bash as plain $ and ".
    inner="set -euxo pipefail;"
    inner+=" cd /root/mscclpp;"
    inner+=" export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\\\$LD_LIBRARY_PATH;"
    inner+=" CMD_B64='${cmd_b64}';"
    inner+=" TMP=\\\$(mktemp);"
    inner+=" trap 'rm -f \\\"\\\$TMP\\\"' EXIT;"
    inner+=" printf '%s' \\\"\\\$CMD_B64\\\" | base64 -d > \\\"\\\$TMP\\\";"
    inner+=" bash -euxo pipefail \\\"\\\$TMP\\\""
    printf '%s' "sudo docker exec mscclpp-test bash -c \"${inner}\""
  else
    # Parsed once, by the remote login shell.
    printf '%s' "set -euxo pipefail; CMD_B64='${cmd_b64}'; TMP=\$(mktemp); trap 'rm -f \"\$TMP\"' EXIT; printf '%s' \"\$CMD_B64\" | base64 -d > \"\$TMP\"; bash -euxo pipefail \"\$TMP\""
  fi
}

# Target selection: a single host (-H) when --host was given, else the hostfile.
PSSH_TARGET_ARGS=()
if [ -n "$TARGET_HOST" ]; then
  PSSH_TARGET_ARGS=(-H "$TARGET_HOST")
else
  PSSH_TARGET_ARGS=(-h "$HOSTFILE")
fi

# Optional SSH login name.
PSSH_USER_ARGS=()
if [ -n "$REMOTE_USER" ]; then
  PSSH_USER_ARGS=(-l "$REMOTE_USER")
fi

# Only pass "-i <key>" when a key file is configured. A bare "-i" would make
# ssh consume the following argument as the identity file.
PSSH_KEY_ARGS=()
if [ -n "$KeyFilePath" ]; then
  PSSH_KEY_ARGS=(-x "-i ${KeyFilePath}")
fi

PSSH_COMMON=(
  -t 0
  "${PSSH_TARGET_ARGS[@]}"
  "${PSSH_USER_ARGS[@]}"
  "${PSSH_KEY_ARGS[@]}"
  -O "$SSH_OPTION"
)

REMOTE_CMD=$(build_remote_command "$USE_DOCKER" "$CMD_B64")
parallel-ssh -i "${PSSH_COMMON[@]}" "$REMOTE_CMD"