diff --git a/.github/scripts/run_case_optimization.sh b/.github/scripts/run_case_optimization.sh index 21b6ff0b6f..bd54aedc05 100755 --- a/.github/scripts/run_case_optimization.sh +++ b/.github/scripts/run_case_optimization.sh @@ -13,13 +13,6 @@ if [ "$job_device" = "gpu" ] && [ "$ngpus" -eq 0 ]; then ngpus=1 fi -# Verify the venv Python interpreter exists (created by ./mfc.sh build) -if [ ! -x build/venv/bin/python3 ]; then - echo "ERROR: build/venv/bin/python3 not found." - echo "The MFC build venv may not have been created. Was the pre-build step successful?" - exit 1 -fi - benchmarks=( benchmarks/5eq_rk3_weno3_hllc/case.py benchmarks/viscous_weno5_sgb_acoustic/case.py @@ -28,6 +21,24 @@ benchmarks=( benchmarks/igr/case.py ) +# For Frontier/Frontier AMD: deps were fetched on the login node via --deps-only; +# build case-optimized binaries here on the compute node before running. +# For Phoenix: prebuild-case-optimization.sh already built everything in a prior SLURM job. +# Frontier Cray: -j 1 to work around CCE 19.0.0 IPA SIGSEGV +build_jobs=8 +if [ "$job_cluster" = "frontier" ]; then + build_jobs=1 +fi + +if [ "$job_cluster" != "phoenix" ]; then + echo "=== Building case-optimized binaries on compute node ===" + for case in "${benchmarks[@]}"; do + echo "--- Building: $case ---" + ./mfc.sh build -i "$case" --case-optimization $gpu_opts -j $build_jobs + done + echo "=== All case-optimized binaries built ===" +fi + passed=0 failed=0 failed_cases="" diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 7ce02c1e3f..d39831730d 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -68,7 +68,7 @@ jobs: flag: f device: gpu interface: acc - build_script: "bash .github/workflows/frontier/build.sh gpu acc bench" + build_script: "bash .github/workflows/frontier/build.sh gpu acc" - cluster: frontier name: Oak Ridge | Frontier (CCE) group: phoenix @@ -76,7 +76,7 @@ jobs: flag: f device: gpu interface: omp - build_script: "bash .github/workflows/frontier/build.sh gpu omp bench" + build_script: "bash .github/workflows/frontier/build.sh gpu omp" - cluster: frontier_amd name: Oak Ridge | Frontier (AMD) group: phoenix @@ -84,7 +84,7 @@ jobs: flag: famd device: gpu interface: omp - build_script: "bash .github/workflows/frontier_amd/build.sh gpu omp bench" + build_script: "bash .github/workflows/frontier_amd/build.sh gpu omp" continue-on-error: ${{ matrix.cluster == 'frontier' || matrix.cluster == 'frontier_amd' }} runs-on: group: ${{ matrix.group }} @@ -103,7 +103,7 @@ jobs: ref: master path: master - - name: Setup & Build + - name: Fetch Dependencies if: matrix.build_script != '' timeout-minutes: 150 run: | diff --git a/.github/workflows/common/bench.sh b/.github/workflows/common/bench.sh index 66d77cfd99..9522e3a043 100644 --- a/.github/workflows/common/bench.sh +++ b/.github/workflows/common/bench.sh @@ -21,19 +21,24 @@ if [ "$job_cluster" = "phoenix" ]; then trap 'rm -rf "$currentdir" || true' EXIT fi -# --- Build (if not pre-built on login node) --- -# Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node. +# --- Build --- +# Phoenix builds everything inside SLURM (no login-node build step). +# Frontier/Frontier AMD: deps already fetched on login node via --deps-only; +# source code is built here on the compute node. # Phoenix: always nuke stale builds (heterogeneous compute nodes → ISA mismatch risk). if [ "$job_cluster" = "phoenix" ]; then source .github/scripts/clean-build.sh clean_build fi -if [ ! -d "build" ]; then - source .github/scripts/retry-build.sh - retry_build ./mfc.sh build -j $n_jobs $build_opts || exit 1 +# Frontier Cray: -j 1 to work around CCE 19.0.0 IPA SIGSEGV +if [ "$job_cluster" = "frontier" ]; then + n_jobs=1 fi +source .github/scripts/retry-build.sh +retry_build ./mfc.sh build -j $n_jobs $build_opts || exit 1 + # --- Bench cluster flag --- if [ "$job_cluster" = "phoenix" ]; then bench_cluster="phoenix-bench" diff --git a/.github/workflows/common/test.sh b/.github/workflows/common/test.sh index e155fd48f8..2733235549 100644 --- a/.github/workflows/common/test.sh +++ b/.github/workflows/common/test.sh @@ -21,29 +21,35 @@ if [ "$job_cluster" = "phoenix" ]; then trap 'rm -rf "$currentdir" || true' EXIT fi -# --- Build (if not pre-built on login node) --- -# Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node. -# Phoenix builds inside SLURM on heterogeneous compute nodes — always start fresh -# to avoid SIGILL from stale binaries compiled on a different microarchitecture. +# --- Build --- +# Phoenix builds everything inside SLURM (no login-node build step). +# Frontier/Frontier AMD: deps already fetched on login node via --deps-only; +# source code is built here on the compute node. +# Phoenix: always start fresh to avoid SIGILL from stale binaries compiled +# on a different microarchitecture. if [ "$job_cluster" = "phoenix" ]; then source .github/scripts/clean-build.sh clean_build fi -if [ ! -d "build" ]; then - source .github/scripts/retry-build.sh +source .github/scripts/retry-build.sh - # Phoenix: smoke-test the syscheck binary to catch architecture mismatches - # (SIGILL from binaries compiled on a different compute node). - validate_cmd="" - if [ "$job_cluster" = "phoenix" ]; then - validate_cmd='syscheck_bin=$(find build/install -name syscheck -type f 2>/dev/null | head -1); [ -z "$syscheck_bin" ] || "$syscheck_bin" > /dev/null 2>&1' - fi +# Phoenix: smoke-test the syscheck binary to catch architecture mismatches +# (SIGILL from binaries compiled on a different compute node). +validate_cmd="" +if [ "$job_cluster" = "phoenix" ]; then + validate_cmd='syscheck_bin=$(find build/install -name syscheck -type f 2>/dev/null | head -1); [ -z "$syscheck_bin" ] || "$syscheck_bin" > /dev/null 2>&1' +fi - RETRY_VALIDATE_CMD="$validate_cmd" \ - retry_build ./mfc.sh test -v --dry-run -j 8 $build_opts || exit 1 +# Frontier Cray: -j 1 to work around CCE 19.0.0 IPA SIGSEGV +build_jobs=8 +if [ "$job_cluster" = "frontier" ]; then + build_jobs=1 fi +RETRY_VALIDATE_CMD="$validate_cmd" \ + retry_build ./mfc.sh test -v --dry-run -j $build_jobs $build_opts || exit 1 + # --- GPU detection and thread count --- device_opts="" rdma_opts="" diff --git a/.github/workflows/frontier/build.sh b/.github/workflows/frontier/build.sh index 5bd40999d7..cd289ef074 100644 --- a/.github/workflows/frontier/build.sh +++ b/.github/workflows/frontier/build.sh @@ -14,7 +14,6 @@ esac job_device=$1 job_interface=$2 -run_bench=$3 source .github/scripts/gpu-opts.sh build_opts="$gpu_opts" @@ -24,8 +23,4 @@ source .github/scripts/clean-build.sh clean_build source .github/scripts/retry-build.sh -if [ "$run_bench" == "bench" ]; then - retry_build ./mfc.sh build -j 8 $build_opts || exit 1 -else - retry_build ./mfc.sh test -v -a --dry-run $([ "$cluster_name" = "frontier" ] && echo "--rdma-mpi") -j 8 $build_opts || exit 1 -fi +retry_build ./mfc.sh build --deps-only -j 8 $build_opts || exit 1 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 90ad965c52..d40a44f04f 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -400,12 +400,12 @@ jobs: echo "Coverage cache: none available — full test suite will run" fi - - name: Build (login node) + - name: Fetch Dependencies if: matrix.cluster != 'phoenix' timeout-minutes: 60 run: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }} - - name: Test + - name: Build & Test run: bash .github/scripts/submit-slurm-job.sh .github/workflows/common/test.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }} ${{ matrix.shard }} - name: Cancel SLURM Jobs @@ -486,15 +486,20 @@ jobs: - name: Clean stale output files run: rm -f *.out + - name: Fetch Dependencies + if: matrix.cluster != 'phoenix' + run: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }} + - name: Pre-Build (SLURM) if: matrix.cluster == 'phoenix' run: bash .github/scripts/submit-slurm-job.sh .github/scripts/prebuild-case-optimization.sh cpu ${{ matrix.interface }} ${{ matrix.cluster }} - - name: Pre-Build (login node) + - name: Build & Run Case-Optimization Tests if: matrix.cluster != 'phoenix' - run: bash .github/scripts/prebuild-case-optimization.sh ${{ matrix.cluster }} ${{ matrix.device }} ${{ matrix.interface }} + run: bash .github/scripts/submit-slurm-job.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }} - name: Run Case-Optimization Tests + if: matrix.cluster == 'phoenix' run: bash .github/scripts/submit-slurm-job.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }} - name: Cancel SLURM Jobs diff --git a/toolchain/mfc/build.py b/toolchain/mfc/build.py index d6daf97bb6..9fed43c271 100644 --- a/toolchain/mfc/build.py +++ b/toolchain/mfc/build.py @@ -552,6 +552,12 @@ def __build_target(target: typing.Union[MFCTarget, str], case: input.MFCInputFil history.add(target.name) + # Dependencies are pinned to fixed versions. If already configured + # (built & installed by a prior --deps-only step), skip entirely + # to avoid re-entering the superbuild (which may access the network). + if target.isDependency and target.is_configured(case): + return + for dep in target.requires.compute(): # If we have already built and installed this target, # do not do so again. This can be inferred by whether @@ -594,6 +600,25 @@ def build(targets=None, case: input.MFCInputFile = None, history: typing.Set[str case = case or input.load(ARG("input"), ARG("--"), {}) case.validate_params() + if ARG("deps_only", False) and len(history) == 0: + all_deps = set() + for t in targets: + resolved = get_target(t) + for dep in resolved.requires.compute(): + all_deps.add(dep) + + cons.print(f"[bold]Fetch Dependencies | {format_list_to_string([d.name for d in all_deps], 'magenta', 'None')}[/bold]") + cons.print(no_indent=True) + + if not all_deps: + cons.print("[yellow]No dependencies to build for the requested targets.[/yellow]") + return + + for dep in all_deps: + __build_target(dep, case, history) + + return + if len(history) == 0: cons.print(__generate_header(case, targets)) cons.print(no_indent=True) diff --git a/toolchain/mfc/cli/commands.py b/toolchain/mfc/cli/commands.py index 85aab95031..e98003aa74 100644 --- a/toolchain/mfc/cli/commands.py +++ b/toolchain/mfc/cli/commands.py @@ -134,6 +134,13 @@ default=False, dest="case_optimization", ), + Argument( + name="deps-only", + help="Only fetch and build dependencies, do not build MFC targets.", + action=ArgAction.STORE_TRUE, + default=False, + dest="deps_only", + ), ], examples=[ Example("./mfc.sh build", "Build all default targets (CPU)"),