Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 12 additions & 7 deletions .github/scripts/run_case_optimization.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,6 @@ if [ "$job_device" = "gpu" ] && [ "$ngpus" -eq 0 ]; then
ngpus=1
fi

# Verify the venv Python interpreter exists (created by ./mfc.sh build)
if [ ! -x build/venv/bin/python3 ]; then
echo "ERROR: build/venv/bin/python3 not found."
echo "The MFC build venv may not have been created. Was the pre-build step successful?"
exit 1
fi

benchmarks=(
benchmarks/5eq_rk3_weno3_hllc/case.py
benchmarks/viscous_weno5_sgb_acoustic/case.py
Expand All @@ -28,6 +21,18 @@ benchmarks=(
benchmarks/igr/case.py
)

# For Frontier/Frontier AMD: deps were fetched on the login node via --deps-only;
# build case-optimized binaries here on the compute node before running.
# For Phoenix: prebuild-case-optimization.sh already built everything in a prior SLURM job.
if [ "$job_cluster" != "phoenix" ]; then
echo "=== Building case-optimized binaries on compute node ==="
for case in "${benchmarks[@]}"; do
echo "--- Building: $case ---"
./mfc.sh build -i "$case" --case-optimization $gpu_opts -j 8
done
echo "=== All case-optimized binaries built ==="
fi

passed=0
failed=0
failed_cases=""
Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/bench.yml
Original file line number Diff line number Diff line change
Expand Up @@ -68,23 +68,23 @@ jobs:
flag: f
device: gpu
interface: acc
build_script: "bash .github/workflows/frontier/build.sh gpu acc bench"
build_script: "bash .github/workflows/frontier/build.sh gpu acc"
- cluster: frontier
name: Oak Ridge | Frontier (CCE)
group: phoenix
labels: frontier
flag: f
device: gpu
interface: omp
build_script: "bash .github/workflows/frontier/build.sh gpu omp bench"
build_script: "bash .github/workflows/frontier/build.sh gpu omp"
- cluster: frontier_amd
name: Oak Ridge | Frontier (AMD)
group: phoenix
labels: frontier
flag: famd
device: gpu
interface: omp
build_script: "bash .github/workflows/frontier_amd/build.sh gpu omp bench"
build_script: "bash .github/workflows/frontier_amd/build.sh gpu omp"
continue-on-error: ${{ matrix.cluster == 'frontier' || matrix.cluster == 'frontier_amd' }}
runs-on:
group: ${{ matrix.group }}
Expand All @@ -103,7 +103,7 @@ jobs:
ref: master
path: master

- name: Setup & Build
- name: Fetch Dependencies
if: matrix.build_script != ''
timeout-minutes: 150
run: |
Expand Down
12 changes: 6 additions & 6 deletions .github/workflows/common/bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,18 @@ if [ "$job_cluster" = "phoenix" ]; then
trap 'rm -rf "$currentdir" || true' EXIT
fi

# --- Build (if not pre-built on login node) ---
# Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node.
# --- Build ---
# Phoenix builds everything inside SLURM (no login-node build step).
# Frontier/Frontier AMD: deps already fetched on login node via --deps-only;
# source code is built here on the compute node.
# Phoenix: always nuke stale builds (heterogeneous compute nodes → ISA mismatch risk).
if [ "$job_cluster" = "phoenix" ]; then
source .github/scripts/clean-build.sh
clean_build
fi

if [ ! -d "build" ]; then
source .github/scripts/retry-build.sh
retry_build ./mfc.sh build -j $n_jobs $build_opts || exit 1
fi
source .github/scripts/retry-build.sh
retry_build ./mfc.sh build -j $n_jobs $build_opts || exit 1

# --- Bench cluster flag ---
if [ "$job_cluster" = "phoenix" ]; then
Expand Down
30 changes: 15 additions & 15 deletions .github/workflows/common/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,29 +21,29 @@ if [ "$job_cluster" = "phoenix" ]; then
trap 'rm -rf "$currentdir" || true' EXIT
fi

# --- Build (if not pre-built on login node) ---
# Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node.
# Phoenix builds inside SLURM on heterogeneous compute nodes — always start fresh
# to avoid SIGILL from stale binaries compiled on a different microarchitecture.
# --- Build ---
# Phoenix builds everything inside SLURM (no login-node build step).
# Frontier/Frontier AMD: deps already fetched on login node via --deps-only;
# source code is built here on the compute node.
# Phoenix: always start fresh to avoid SIGILL from stale binaries compiled
# on a different microarchitecture.
if [ "$job_cluster" = "phoenix" ]; then
source .github/scripts/clean-build.sh
clean_build
fi

if [ ! -d "build" ]; then
source .github/scripts/retry-build.sh
source .github/scripts/retry-build.sh

# Phoenix: smoke-test the syscheck binary to catch architecture mismatches
# (SIGILL from binaries compiled on a different compute node).
validate_cmd=""
if [ "$job_cluster" = "phoenix" ]; then
validate_cmd='syscheck_bin=$(find build/install -name syscheck -type f 2>/dev/null | head -1); [ -z "$syscheck_bin" ] || "$syscheck_bin" > /dev/null 2>&1'
fi

RETRY_VALIDATE_CMD="$validate_cmd" \
retry_build ./mfc.sh test -v --dry-run -j 8 $build_opts || exit 1
# Phoenix: smoke-test the syscheck binary to catch architecture mismatches
# (SIGILL from binaries compiled on a different compute node).
validate_cmd=""
if [ "$job_cluster" = "phoenix" ]; then
validate_cmd='syscheck_bin=$(find build/install -name syscheck -type f 2>/dev/null | head -1); [ -z "$syscheck_bin" ] || "$syscheck_bin" > /dev/null 2>&1'
fi

RETRY_VALIDATE_CMD="$validate_cmd" \
retry_build ./mfc.sh test -v --dry-run -j 8 $build_opts || exit 1

# --- GPU detection and thread count ---
device_opts=""
rdma_opts=""
Expand Down
7 changes: 1 addition & 6 deletions .github/workflows/frontier/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ esac

job_device=$1
job_interface=$2
run_bench=$3
source .github/scripts/gpu-opts.sh
build_opts="$gpu_opts"

Expand All @@ -24,8 +23,4 @@ source .github/scripts/clean-build.sh
clean_build

source .github/scripts/retry-build.sh
if [ "$run_bench" == "bench" ]; then
retry_build ./mfc.sh build -j 8 $build_opts || exit 1
else
retry_build ./mfc.sh test -v -a --dry-run $([ "$cluster_name" = "frontier" ] && echo "--rdma-mpi") -j 8 $build_opts || exit 1
fi
retry_build ./mfc.sh build --deps-only -j 8 $build_opts || exit 1
13 changes: 9 additions & 4 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -400,12 +400,12 @@ jobs:
echo "Coverage cache: none available — full test suite will run"
fi

- name: Build (login node)
- name: Fetch Dependencies
if: matrix.cluster != 'phoenix'
timeout-minutes: 60
run: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}

- name: Test
- name: Build & Test
run: bash .github/scripts/submit-slurm-job.sh .github/workflows/common/test.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }} ${{ matrix.shard }}

- name: Cancel SLURM Jobs
Expand Down Expand Up @@ -486,15 +486,20 @@ jobs:
- name: Clean stale output files
run: rm -f *.out

- name: Fetch Dependencies
if: matrix.cluster != 'phoenix'
run: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}

- name: Pre-Build (SLURM)
if: matrix.cluster == 'phoenix'
run: bash .github/scripts/submit-slurm-job.sh .github/scripts/prebuild-case-optimization.sh cpu ${{ matrix.interface }} ${{ matrix.cluster }}

- name: Pre-Build (login node)
- name: Build & Run Case-Optimization Tests
if: matrix.cluster != 'phoenix'
run: bash .github/scripts/prebuild-case-optimization.sh ${{ matrix.cluster }} ${{ matrix.device }} ${{ matrix.interface }}
run: bash .github/scripts/submit-slurm-job.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}

- name: Run Case-Optimization Tests
if: matrix.cluster == 'phoenix'
run: bash .github/scripts/submit-slurm-job.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}

- name: Cancel SLURM Jobs
Expand Down
25 changes: 25 additions & 0 deletions toolchain/mfc/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,12 @@ def __build_target(target: typing.Union[MFCTarget, str], case: input.MFCInputFil

history.add(target.name)

# Dependencies are pinned to fixed versions. If already configured
# (built & installed by a prior --deps-only step), skip entirely
# to avoid re-entering the superbuild (which may access the network).
if target.isDependency and target.is_configured(case):
return

for dep in target.requires.compute():
# If we have already built and installed this target,
# do not do so again. This can be inferred by whether
Expand Down Expand Up @@ -594,6 +600,25 @@ def build(targets=None, case: input.MFCInputFile = None, history: typing.Set[str
case = case or input.load(ARG("input"), ARG("--"), {})
case.validate_params()

if ARG("deps_only", False) and len(history) == 0:
all_deps = set()
for t in targets:
resolved = get_target(t)
for dep in resolved.requires.compute():
all_deps.add(dep)

cons.print(f"[bold]Fetch Dependencies | {format_list_to_string([d.name for d in all_deps], 'magenta', 'None')}[/bold]")
cons.print(no_indent=True)

if not all_deps:
cons.print("[yellow]No dependencies to build for the requested targets.[/yellow]")
return

for dep in all_deps:
__build_target(dep, case, history)

return

if len(history) == 0:
cons.print(__generate_header(case, targets))
cons.print(no_indent=True)
Expand Down
7 changes: 7 additions & 0 deletions toolchain/mfc/cli/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,13 @@
default=False,
dest="case_optimization",
),
Argument(
name="deps-only",
help="Only fetch and build dependencies, do not build MFC targets.",
action=ArgAction.STORE_TRUE,
default=False,
dest="deps_only",
),
],
examples=[
Example("./mfc.sh build", "Build all default targets (CPU)"),
Expand Down
Loading