diff --git a/.github/workflows/image.yaml b/.github/workflows/image.yaml index 1aa10dc18..d6826102c 100644 --- a/.github/workflows/image.yaml +++ b/.github/workflows/image.yaml @@ -198,5 +198,11 @@ jobs: PRECOMPILED: "true" DIST: signed_${{ matrix.dist }} run: | - source kernel_version.txt && \ + source kernel_version.txt + # arm64 does not support azure-fde (package linux-objects-nvidia-*-azure-fde not available for arm64) + if [[ "${{ matrix.dist }}" == "ubuntu24.04" ]] && [[ "${{ matrix.flavor }}" != "azure-fde" ]]; then + export DOCKER_BUILD_PLATFORM_OPTIONS="--platform=linux/amd64,linux/arm64" + else + export DOCKER_BUILD_PLATFORM_OPTIONS="--platform=linux/amd64" + fi make DRIVER_VERSIONS=${DRIVER_VERSIONS} DRIVER_BRANCH=${{ matrix.driver }} build-${DIST}-${DRIVER_VERSION} diff --git a/.github/workflows/precompiled.yaml b/.github/workflows/precompiled.yaml index ddf17633c..5c529fd37 100644 --- a/.github/workflows/precompiled.yaml +++ b/.github/workflows/precompiled.yaml @@ -25,9 +25,11 @@ jobs: runs-on: linux-amd64-cpu4 outputs: driver_branch: ${{ steps.extract_driver_branch.outputs.driver_branch }} + exclude_build_matrix_pairs: ${{ steps.extract_driver_branch.outputs.exclude_build_matrix_pairs }} kernel_flavors: ${{ steps.extract_driver_branch.outputs.kernel_flavors }} dist: ${{ steps.extract_driver_branch.outputs.dist }} lts_kernel: ${{ steps.extract_driver_branch.outputs.lts_kernel }} + platforms: ${{ steps.extract_driver_branch.outputs.platforms }} steps: - name: Checkout code uses: actions/checkout@v6 @@ -54,6 +56,14 @@ jobs: lts_kernel_json=$(printf '%s\n' "${LTS_KERNEL[@]}" | jq -R . | jq -cs .) echo "lts_kernel=$lts_kernel_json" >> $GITHUB_OUTPUT + # platforms for precompiled build (amd64 always; arm64 for ubuntu24.04) + PLATFORMS=("amd64" "arm64") + platforms_json=$(printf '%s\n' "${PLATFORMS[@]}" | jq -R . | jq -cs .) + echo "platforms=$platforms_json" >> $GITHUB_OUTPUT + + EXCLUDE_BUILD_MATRIX_PAIRS=("ubuntu24.04 535" "ubuntu24.04 570") + echo "exclude_build_matrix_pairs=$(printf '%s\n' "${EXCLUDE_BUILD_MATRIX_PAIRS[@]}" | jq -R . | jq -cs .)" >> $GITHUB_OUTPUT + precompiled-build-image: needs: set-driver-version-matrix runs-on: linux-amd64-cpu4 @@ -66,6 +76,8 @@ jobs: exclude: - dist: ubuntu24.04 driver_branch: 535 + - dist: ubuntu24.04 + driver_branch: 570 - lts_kernel: 5.15 dist: ubuntu24.04 - flavor: azure-fde @@ -82,7 +94,7 @@ jobs: GENERATE_ARTIFACTS="false" echo "PUSH_ON_BUILD=${GENERATE_ARTIFACTS}" >> $GITHUB_ENV - echo "BUILD_MULTI_ARCH_IMAGES=${GENERATE_ARTIFACTS}" >> $GITHUB_ENV + echo "BUILD_MULTI_ARCH_IMAGES=true" >> $GITHUB_ENV - name: Set up QEMU uses: docker/setup-qemu-action@v3 @@ -106,6 +118,7 @@ jobs: elif [[ "${{ matrix.dist }}" == "ubuntu24.04" ]]; then BASE_TARGET="noble" fi + export DOCKER_BUILD_PLATFORM_OPTIONS="--platform=linux/amd64" make DRIVER_BRANCH=${{ matrix.driver_branch }} KERNEL_FLAVOR=${{ matrix.flavor }} LTS_KERNEL=${LTS_KERNEL} build-base-${BASE_TARGET} trap "docker rm -f base-${BASE_TARGET}-${{ matrix.flavor }}" EXIT @@ -121,10 +134,17 @@ jobs: PRECOMPILED: "true" DIST: signed_${{ matrix.dist }} run: | - source kernel_version.txt && \ - make DRIVER_VERSIONS=${DRIVER_VERSIONS} DRIVER_BRANCH=${{ matrix.driver_branch }} build-${DIST}-${DRIVER_VERSION} - - - name: Save base image, build image and kernel version file + source kernel_version.txt + export DRIVER_MULTI_ARCH_TAR="driver-images-${{ matrix.driver_branch }}-${KERNEL_VERSION}-${{ matrix.dist }}.tar" + export DOCKER_BUILD_OPTIONS="--output=type=oci,dest=./${DRIVER_MULTI_ARCH_TAR}" + # build multi-arch images for ubuntu24.04 except azure-fde , arm64 does not support azure-fde + if [[ "${{ matrix.dist }}" == "ubuntu24.04" ]] && [[ "${{ matrix.flavor }}" != "azure-fde" ]]; then + export DOCKER_BUILD_PLATFORM_OPTIONS="--platform=linux/amd64,linux/arm64" + else + export DOCKER_BUILD_PLATFORM_OPTIONS="--platform=linux/amd64" + fi + make DRIVER_VERSIONS=${DRIVER_VERSIONS} DRIVER_BRANCH=${{ matrix.driver_branch }} KERNEL_VERSION=${KERNEL_VERSION} build-${DIST}-${DRIVER_VERSION} + - name: Save base image and kernel version file env: DIST: ${{ matrix.dist }} PRIVATE_REGISTRY: "ghcr.io" @@ -139,8 +159,6 @@ jobs: tar -cvf kernel-version-${{ matrix.driver_branch }}-${KERNEL_VERSION}-${DIST}.tar kernel_version.txt docker save "${PRIVATE_REGISTRY}/nvidia/driver:base-${BASE_TARGET}-${LTS_KERNEL}-${{ matrix.flavor }}-${{ matrix.driver_branch }}" \ -o ./base-images-${{ matrix.driver_branch }}-${KERNEL_VERSION}-${DIST}.tar - docker save "${PRIVATE_REGISTRY}/nvidia/driver:${{ matrix.driver_branch }}-${KERNEL_VERSION}-${DIST}" \ - -o ./driver-images-${{ matrix.driver_branch }}-${KERNEL_VERSION}-${DIST}.tar # set env for artifacts upload echo "KERNEL_VERSION=$KERNEL_VERSION" >> $GITHUB_ENV echo "DIST=$DIST" >> $GITHUB_ENV @@ -169,6 +187,8 @@ jobs: determine-e2e-test-matrix: runs-on: linux-amd64-cpu4 strategy: + fail-fast: false + max-parallel: 2 matrix: dist: ${{ fromJson(needs.set-driver-version-matrix.outputs.dist) }} lts_kernel: ${{ fromJson(needs.set-driver-version-matrix.outputs.lts_kernel) }} @@ -191,9 +211,9 @@ jobs: - name: Download all kernel-version artifacts uses: actions/download-artifact@v8 with: - pattern: kernel-version-* + pattern: kernel-version*${{ matrix.lts_kernel }}*${{ matrix.dist }} path: ./kernel-version-artifacts - merge-multiple: false + merge-multiple: true - name: Set kernel version env: @@ -204,30 +224,44 @@ jobs: kernel_flavors_json='${{ needs.set-driver-version-matrix.outputs.kernel_flavors }}' KERNEL_FLAVORS=($(echo "$kernel_flavors_json" | jq -r '.[]')) driver_branch_json='${{ needs.set-driver-version-matrix.outputs.driver_branch }}' - DRIVER_BRANCHES=($(echo "$driver_branch_json" | jq -r '.[]')) - - # remove 535 driver branch for ubuntu24.04 - if [ "$DIST" == "ubuntu24.04" ]; then - DRIVER_BRANCHES=($(for branch in "${DRIVER_BRANCHES[@]}"; do - [[ $branch != "535" ]] && echo "$branch" - done)) - fi + exclude_pairs_json='${{ needs.set-driver-version-matrix.outputs.exclude_build_matrix_pairs }}' + DRIVER_BRANCHES=() + for b in $(echo "$driver_branch_json" | jq -r '.[]'); do + pair="$DIST $b" + if ! echo "$exclude_pairs_json" | jq -r '.[]' | grep -qx "$pair"; then + DRIVER_BRANCHES+=("$b") + fi + done source ./tests/scripts/ci-precompiled-helpers.sh - KERNEL_VERSIONS=($(get_kernel_versions_to_test KERNEL_FLAVORS[@] DRIVER_BRANCHES[@] $DIST $LTS_KERNEL)) - if [ -z "$KERNEL_VERSIONS" ]; then - # no new kernel release - echo "Skipping e2e tests" - exit 0 - fi - # Convert array to JSON format and assign - echo "[]" > ./matrix_values_${{ matrix.dist }}_${{ matrix.lts_kernel }}.json - printf '%s\n' "${KERNEL_VERSIONS[@]}" | jq -R . | jq -s . > ./matrix_values_${{ matrix.dist }}_${{ matrix.lts_kernel }}.json + platforms_json='${{ needs.set-driver-version-matrix.outputs.platforms }}' + platform=$(echo "$platforms_json" | jq -r '.[]') + for PLATFORM in $platform; do + if [[ "$PLATFORM" == "arm64" ]] && [[ "$DIST" == "ubuntu22.04" ]]; then + continue + fi + if [[ "$PLATFORM" == "arm64" ]]; then + PLATFORM_SUFFIX="-arm64" + FLAVORS_FOR_PLATFORM=() + for f in "${KERNEL_FLAVORS[@]}"; do + if [[ "$f" != "azure-fde" ]]; then + FLAVORS_FOR_PLATFORM+=("$f") + fi + done + else + PLATFORM_SUFFIX="" + FLAVORS_FOR_PLATFORM=("${KERNEL_FLAVORS[@]}") + fi + KERNEL_VERSIONS=($(get_kernel_versions_to_test FLAVORS_FOR_PLATFORM[@] DRIVER_BRANCHES[@] $DIST $LTS_KERNEL $PLATFORM_SUFFIX)) + if [ -n "${KERNEL_VERSIONS[*]}" ]; then + printf '%s\n' "${KERNEL_VERSIONS[@]}" | jq -R . | jq -s . > ./matrix_values_${{ matrix.dist }}_${{ matrix.lts_kernel }}$PLATFORM_SUFFIX.json + fi + done - name: Upload kernel matrix values as artifacts uses: actions/upload-artifact@v7 with: name: matrix-values-${{ matrix.dist }}-${{ matrix.lts_kernel }} - path: ./matrix_values_${{ matrix.dist }}_${{ matrix.lts_kernel }}.json + path: ./matrix_values_${{ matrix.dist }}_${{ matrix.lts_kernel }}*.json retention-days: 1 collect-e2e-test-matrix: @@ -256,25 +290,13 @@ jobs: echo "matrix_values_not_empty=0" >> $GITHUB_OUTPUT kernel_versions=() - # Read and merge kernel_version values from dist files - dist_json='${{ needs.set-driver-version-matrix.outputs.dist }}' - DIST=($(echo "$dist_json" | jq -r '.[]')) - lts_kernel_json='${{ needs.set-driver-version-matrix.outputs.lts_kernel }}' - LTS_KERNEL=($(echo "$lts_kernel_json" | jq -r '.[]')) - for dist in "${DIST[@]}"; do - for kernel in "${LTS_KERNEL[@]}"; do - artifact_name="matrix-values-${dist}-${kernel}" - file_path="./matrix-values-artifacts/${artifact_name}/matrix_values_${dist}_${kernel}.json" - if [ -f "$file_path" ]; then - echo "Successfully found artifact: $artifact_name at $file_path" - value=$(jq -r '.[]' "$file_path") - kernel_versions+=($value) - echo "matrix_values_not_empty=1" >> $GITHUB_OUTPUT - else - echo "Artifact not found: $artifact_name" - fi - done + # Read and merge kernel_version values from all platform artifacts (amd64 and arm64) + for f in $(find ./matrix-values-artifacts -name "matrix_values_*.json" -type f 2>/dev/null); do + value=$(jq -r '.[]' "$f") + kernel_versions+=($value) + echo "matrix_values_not_empty=1" >> $GITHUB_OUTPUT done + kernel_versions=($(printf '%s\n' "${kernel_versions[@]}" | sort -u)) echo "Collected Kernel Versions: ${kernel_versions[@]}" combined_values=$(printf '%s\n' "${kernel_versions[@]}" | jq -R . | jq -s -c . | tr -d ' \n') echo "Combined Kernel Versions JSON: $combined_values" @@ -322,42 +344,57 @@ jobs: registry: ghcr.io username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - - name: Download all driver image artifacts - uses: actions/download-artifact@v8 - with: - pattern: driver-images-*-${{ matrix.kernel_version }} - path: ./tests/ - merge-multiple: true - name: Set and Calculate test vars run: | echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV echo "${{ secrets.AWS_SSH_KEY }}" > ${{ github.workspace }}/key.pem && chmod 400 ${{ github.workspace }}/key.pem echo "PRIVATE_REGISTRY=ghcr.io" >> $GITHUB_ENV KERNEL_VERSION="${{ matrix.kernel_version }}" + if [[ "$KERNEL_VERSION" == *-arm64 ]]; then + echo "PLATFORM=arm64" >> $GITHUB_ENV + KERNEL_VERSION="${KERNEL_VERSION%-arm64}" + else + echo "PLATFORM=amd64" >> $GITHUB_ENV + fi # Extract the last segment after the last dash DIST=${KERNEL_VERSION##*-} echo "DIST=$DIST" >> $GITHUB_ENV KERNEL_VERSION=${KERNEL_VERSION%-*} echo "KERNEL_VERSION=$KERNEL_VERSION" >> $GITHUB_ENV - driver_branch_json="${{ needs.set-driver-version-matrix.outputs.driver_branch }}" - DRIVER_BRANCHES=($(echo "$driver_branch_json" | jq -r '.[]')) - echo "DRIVER_BRANCHES=${DRIVER_BRANCHES[*]}" >> $GITHUB_ENV - - name: Set kernel version in holodeck_${{ env.DIST }}.yaml + echo "DRIVER_BRANCHES=${{ needs.set-driver-version-matrix.outputs.driver_branch }}" >> $GITHUB_ENV + echo "exclude_pairs=${{ needs.set-driver-version-matrix.outputs.exclude_build_matrix_pairs }}" >> $GITHUB_ENV + - name: Download driver image artifacts + uses: actions/download-artifact@v7 + with: + pattern: driver-images-*-${{ env.KERNEL_VERSION }}-${{ env.DIST }}* + path: ./tests/ + merge-multiple: true + - name: Install skopeo run: | - yq eval '.spec += {"kernel": {"version": strenv(KERNEL_VERSION)}}' -i tests/holodeck_${{ env.DIST }}.yaml + sudo apt-get update && sudo apt-get install -y skopeo + - name: Configure Holodeck e2e test config (kernel, OS, instance) + run: | + yq eval '.spec += {"kernel": {"version": strenv(KERNEL_VERSION)}}' -i tests/holodeck_ubuntu.yaml + if [[ "$DIST" == "ubuntu24.04" ]]; then + yq eval '.spec.instance.os = "ubuntu-24.04"' -i tests/holodeck_ubuntu.yaml + fi + if [[ "$PLATFORM" == "arm64" ]]; then + yq eval '.spec.instance.image.architecture = strenv(PLATFORM)' -i tests/holodeck_ubuntu.yaml + yq eval '.spec.instance.type = "g5g.xlarge"' -i tests/holodeck_ubuntu.yaml + yq eval '.spec.instance.region = "us-west-2"' -i tests/holodeck_ubuntu.yaml + fi - name: Set up Holodeck - uses: NVIDIA/holodeck@v0.2.18 + uses: NVIDIA/holodeck@main env: AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SSH_KEY: ${{ secrets.AWS_SSH_KEY }} - DIST: ${{ env.DIST }} with: aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }} aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} aws_ssh_key: ${{ secrets.AWS_SSH_KEY }} - holodeck_config: "tests/holodeck_${{ env.DIST }}.yaml" + holodeck_config: "tests/holodeck_ubuntu.yaml" - name: Get public dns name id: get_public_dns_name uses: mikefarah/yq@master @@ -380,21 +417,21 @@ jobs: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | rc=0 - # for precompiled driver we are setting driver branch as driver version - DRIVER_BRANCHES=(${{ env.DRIVER_BRANCHES }}) - # remove 535 driver branch for ubuntu24.04 - if [ "$DIST" == "ubuntu24.04" ]; then - DRIVER_BRANCHES=($(for branch in "${DRIVER_BRANCHES[@]}"; do - [[ $branch != "535" ]] && echo "$branch" - done)) - fi + DRIVER_BRANCHES=($(echo '${{ env.DRIVER_BRANCHES }}' | jq -r '.[]')) + exclude_pairs='${{ env.exclude_pairs }}' for DRIVER_VERSION in "${DRIVER_BRANCHES[@]}"; do + pair="$DIST $DRIVER_VERSION" + if echo "$exclude_pairs" | jq -r '.[]' | grep -qx "$pair"; then + continue + fi echo "Running e2e for DRIVER_VERSION=$DRIVER_VERSION" status=0 TEST_CASE_ARGS="${GPU_OPERATOR_OPTIONS} --set driver.version=${DRIVER_VERSION}" # add escape character for space TEST_CASE_ARGS=$(printf '%q ' "$TEST_CASE_ARGS") IMAGE_PATH="./tests/driver-images-${DRIVER_VERSION}-${KERNEL_VERSION}-${DIST}.tar" + skopeo copy --override-os linux --override-arch "${PLATFORM}" "oci-archive:${IMAGE_PATH}" "docker-archive:./tests/tmp.tar:${PRIVATE_REGISTRY}/nvidia/driver:${DRIVER_VERSION}-${KERNEL_VERSION}-${DIST}" + mv "./tests/tmp.tar" "${IMAGE_PATH}" ./tests/ci-run-e2e.sh "${TEST_CASE}" "${TEST_CASE_ARGS}" ${IMAGE_PATH} || status=$? if [ $status -eq 1 ]; then echo "e2e validation failed for driver version $DRIVER_VERSION with status $status" @@ -419,64 +456,77 @@ jobs: - set-driver-version-matrix - collect-e2e-test-matrix - e2e-tests-nvidiadriver + env: + REGISTRY_AUTH_FILE: ${{ github.workspace }}/config.json strategy: + max-parallel: 5 matrix: driver_branch: ${{ fromJson(needs.set-driver-version-matrix.outputs.driver_branch) }} kernel_version: ${{ fromJson(needs.collect-e2e-test-matrix.outputs.matrix_values) }} steps: - name: Check out code uses: actions/checkout@v6 - - name: Login to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - name: Set image vars + id: set_image_vars run: | echo "PRIVATE_REGISTRY=ghcr.io" >> $GITHUB_ENV + KERNEL_VERSION="${{ matrix.kernel_version }}" + if [[ "$KERNEL_VERSION" == *-arm64 ]]; then + KERNEL_VERSION="${KERNEL_VERSION%-arm64}" + fi + echo "KERNEL_VERSION=$KERNEL_VERSION" >> $GITHUB_ENV + DIST="${KERNEL_VERSION##*-}" + pair="$DIST ${{ matrix.driver_branch }}" + echo "run_publish=true" >> $GITHUB_OUTPUT + if echo '${{ needs.set-driver-version-matrix.outputs.exclude_build_matrix_pairs }}' | jq -r '.[]' | grep -qx "$pair"; then + echo "run_publish=false" >> $GITHUB_OUTPUT + fi + + - name: Install skopeo and login to GitHub Container Registry + run: | + sudo apt-get update && sudo apt-get install -y skopeo + mkdir -p "$(dirname "${REGISTRY_AUTH_FILE}")" + echo "${{ secrets.GITHUB_TOKEN }}" | skopeo login ${PRIVATE_REGISTRY} -u ${{ github.actor }} --password-stdin --authfile "${REGISTRY_AUTH_FILE}" - name: Download base image artifact - if: ${{ ! (matrix.driver_branch == 535 && contains(matrix.kernel_version, 'ubuntu24.04')) }} + if: steps.set_image_vars.outputs.run_publish == 'true' uses: actions/download-artifact@v8 with: - name: base-images-${{ matrix.driver_branch }}-${{ matrix.kernel_version }} + name: base-images-${{ matrix.driver_branch }}-${{ env.KERNEL_VERSION }} path: ./ - name: Publish base image - if: ${{ ! (matrix.driver_branch == 535 && contains(matrix.kernel_version, 'ubuntu24.04')) }} + if: steps.set_image_vars.outputs.run_publish == 'true' run: | - LTS_KERNEL=$(echo "${{ matrix.kernel_version }}" | sed -E 's/^([0-9]+\.[0-9]+)\..*/\1/') - KERNEL_FLAVOR=$(echo "${{ matrix.kernel_version }}" | sed -E 's/^[0-9]+\.[0-9]+\.[0-9]+-[0-9]+-(.*)-ubuntu[0-9]+\.[0-9]+$/\1/') - DIST=$(echo "${{ matrix.kernel_version }}" | sed -E 's/^.*-(ubuntu[0-9]+\.[0-9]+)$/\1/') + LTS_KERNEL=$(echo "${{ env.KERNEL_VERSION }}" | sed -E 's/^([0-9]+\.[0-9]+)\..*/\1/') + KERNEL_FLAVOR=$(echo "${{ env.KERNEL_VERSION }}" | sed -E 's/^[0-9]+\.[0-9]+\.[0-9]+-[0-9]+-(.*)-ubuntu[0-9]+\.[0-9]+$/\1/') + DIST=$(echo "${{ env.KERNEL_VERSION }}" | sed -E 's/^.*-(ubuntu[0-9]+\.[0-9]+)$/\1/') if [[ "${DIST}" == "ubuntu22.04" ]]; then BASE_TARGET="jammy" elif [[ "${DIST}" == "ubuntu24.04" ]]; then BASE_TARGET="noble" fi - image_path="./base-images-${{ matrix.driver_branch }}-${{ matrix.kernel_version }}.tar" + image_path="./base-images-${{ matrix.driver_branch }}-${{ env.KERNEL_VERSION }}.tar" echo "uploading $image_path" - docker load -i $image_path - docker push ${PRIVATE_REGISTRY}/nvidia/driver:base-${BASE_TARGET}-${LTS_KERNEL}-${KERNEL_FLAVOR}-${{ matrix.driver_branch }} + skopeo copy --authfile "${REGISTRY_AUTH_FILE}" "docker-archive:${image_path}" docker://${PRIVATE_REGISTRY}/nvidia/driver:base-${BASE_TARGET}-${LTS_KERNEL}-${KERNEL_FLAVOR}-${{ matrix.driver_branch }} - name: Download built image artifact - if: ${{ ! (matrix.driver_branch == 535 && contains(matrix.kernel_version, 'ubuntu24.04')) }} + if: steps.set_image_vars.outputs.run_publish == 'true' uses: actions/download-artifact@v8 with: - name: driver-images-${{ matrix.driver_branch }}-${{ matrix.kernel_version }} + name: driver-images-${{ matrix.driver_branch }}-${{ env.KERNEL_VERSION }} path: ./ - name: Publish image - if: ${{ ! (matrix.driver_branch == 535 && contains(matrix.kernel_version, 'ubuntu24.04')) }} + if: steps.set_image_vars.outputs.run_publish == 'true' run: | - image_path="./driver-images-${{ matrix.driver_branch }}-${{ matrix.kernel_version }}.tar" + image_path="./driver-images-${{ matrix.driver_branch }}-${{ env.KERNEL_VERSION }}.tar" echo "uploading $image_path" - docker load -i $image_path - docker push ${PRIVATE_REGISTRY}/nvidia/driver:${{ matrix.driver_branch }}-${{ matrix.kernel_version }} + skopeo copy --authfile "${REGISTRY_AUTH_FILE}" "oci-archive:${image_path}" docker://${PRIVATE_REGISTRY}/nvidia/driver:${{ matrix.driver_branch }}-${{ env.KERNEL_VERSION }} - name: Slack notification - if: ${{ ! (matrix.driver_branch == 535 && contains(matrix.kernel_version, 'ubuntu24.04')) }} + if: steps.set_image_vars.outputs.run_publish == 'true' uses: slackapi/slack-github-action@v2.1.1 with: token: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/Makefile b/Makefile index 5572cf13b..e5ef77674 100644 --- a/Makefile +++ b/Makefile @@ -160,12 +160,13 @@ build-%: DOCKERFILE = $(CURDIR)/$(SUBDIR)/Dockerfile # build-ubuntu20.04-$(DRIVER_VERSION) triggers a build for a specific $(DRIVER_VERSION) $(DISTRIBUTIONS): %: build-% $(BUILD_TARGETS): %: $(foreach driver_version, $(DRIVER_VERSIONS), $(addprefix %-, $(driver_version))) +DRIVER_BUILD_TAG = $(if $(findstring type=oci,$(DOCKER_BUILD_OPTIONS)),,--tag $(IMAGE)) $(DRIVER_BUILD_TARGETS): DOCKER_BUILDKIT=1 \ $(DOCKER) $(BUILDX) build --pull \ $(DOCKER_BUILD_OPTIONS) \ $(DOCKER_BUILD_PLATFORM_OPTIONS) \ - --tag $(IMAGE) \ + $(DRIVER_BUILD_TAG) \ --build-arg DRIVER_VERSION="$(DRIVER_VERSION)" \ --build-arg GOLANG_VERSION="$(GOLANG_VERSION)" \ --build-arg DRIVER_BRANCH="$(DRIVER_BRANCH)" \ @@ -215,6 +216,7 @@ $(BASE_BUILD_TARGETS): DOCKER_BUILDKIT=1 \ $(DOCKER) $(BUILDX) build --pull --no-cache \ $(DOCKER_BUILD_OPTIONS) \ + $(DOCKER_BUILD_PLATFORM_OPTIONS) \ --tag $(IMAGE) \ --target $(TARGET) \ --build-arg CUDA_VERSION="$(CUDA_VERSION)" \ diff --git a/base/generate-ci-config b/base/generate-ci-config index 94ba87943..490130235 100755 --- a/base/generate-ci-config +++ b/base/generate-ci-config @@ -16,7 +16,12 @@ export DEBIAN_FRONTEND=noninteractive apt-get update -y -qq > /dev/null # Generate a list of all kernel versions which have NVIDIA precompiled driver packages available. -SUPPORTED_KERNELS_LINUX_SIGNATURES_LIST=$(apt-cache search "linux-signatures-nvidia.*${LTS_KERNEL}.*${KERNEL_FLAVOR}" | awk '{print $1}' | sed -e "s/^.*${LTS_KERNEL}/${LTS_KERNEL}/" | sort -n -t'-' -k2| grep "${KERNEL_FLAVOR}$") +# linux-signatures-nvidia (secure boot signatures) is not available for arm64 +if [ "$(uname -m)" = "aarch64" ]; then + SUPPORTED_KERNELS_LINUX_LIST=$(apt-cache search "linux-objects-nvidia-${DRIVER_BRANCH}-server.*${LTS_KERNEL}.*${KERNEL_FLAVOR}" | awk '{print $1}' | grep -v "open" | sed -e "s/^.*${LTS_KERNEL}/${LTS_KERNEL}/" | sort -n -t'-' -k2) +else + SUPPORTED_KERNELS_LINUX_LIST=$(apt-cache search "linux-signatures-nvidia.*${LTS_KERNEL}.*${KERNEL_FLAVOR}" | awk '{print $1}' | sed -e "s/^.*${LTS_KERNEL}/${LTS_KERNEL}/" | sort -n -t'-' -k2| grep "${KERNEL_FLAVOR}$") +fi # Generate a list of all kernel versions which have linux-image packages available. SUPPORTED_KERNELS_LINUX_IMAGE_LIST=$(apt-cache search linux-image-${LTS_KERNEL}.*-${KERNEL_FLAVOR} | awk '{print $1}' | sed -e "s/^.*${LTS_KERNEL}/${LTS_KERNEL}/" | sort -n -t'-' -k2 | grep "${KERNEL_FLAVOR}$") @@ -26,7 +31,7 @@ DRIVER_VERSION=$(apt-cache show nvidia-utils-${DRIVER_BRANCH}-server |grep Versi # Get the latest kernel from linux-signatures-list and linux-images-list # As list is already sorted , compare the kernel version and find exact match # get the latest kernel version with tail -SK=$(grep -Fxf <(echo "$SUPPORTED_KERNELS_LINUX_SIGNATURES_LIST") <(echo "$SUPPORTED_KERNELS_LINUX_IMAGE_LIST") | tail -n1) +SK=$(grep -Fxf <(echo "$SUPPORTED_KERNELS_LINUX_LIST") <(echo "$SUPPORTED_KERNELS_LINUX_IMAGE_LIST") | tail -n1) # Write to file echo "export KERNEL_VERSION=$SK DRIVER_VERSION=$DRIVER_VERSION DRIVER_VERSIONS=$DRIVER_VERSION" > /var/kernel_version.txt diff --git a/multi-arch.mk b/multi-arch.mk index 263fa88fb..a9cf25035 100644 --- a/multi-arch.mk +++ b/multi-arch.mk @@ -27,5 +27,4 @@ $(DRIVER_PUSH_TARGETS): push-%: build-ubuntu18.04%: DOCKER_BUILD_PLATFORM_OPTIONS = --platform=linux/amd64 build-signed_ubuntu20.04%: DOCKER_BUILD_PLATFORM_OPTIONS = --platform=linux/amd64 build-signed_ubuntu22.04%: DOCKER_BUILD_PLATFORM_OPTIONS = --platform=linux/amd64 -build-signed_ubuntu24.04%: DOCKER_BUILD_PLATFORM_OPTIONS = --platform=linux/amd64 build-sles%: DOCKER_BUILD_PLATFORM_OPTIONS = --platform=linux/amd64 diff --git a/tests/holodeck_ubuntu22.04.yaml b/tests/holodeck_ubuntu.yaml similarity index 64% rename from tests/holodeck_ubuntu22.04.yaml rename to tests/holodeck_ubuntu.yaml index 0e870a7da..d4244e718 100644 --- a/tests/holodeck_ubuntu22.04.yaml +++ b/tests/holodeck_ubuntu.yaml @@ -11,19 +11,9 @@ spec: instance: type: g4dn.xlarge region: us-west-1 - ingressIpRanges: - - 18.190.12.32/32 - - 3.143.46.93/32 - - 52.15.119.136/32 - - 35.155.108.162/32 - - 35.162.190.51/32 - - 54.201.61.24/32 - - 52.24.205.48/32 - - 44.235.4.62/32 - - 44.230.241.223/32 + os: ubuntu-22.04 image: architecture: amd64 - imageId: ami-0007a86be89339c9f containerRuntime: install: true name: containerd diff --git a/tests/holodeck_ubuntu24.04.yaml b/tests/holodeck_ubuntu24.04.yaml deleted file mode 100644 index 7e22a2361..000000000 --- a/tests/holodeck_ubuntu24.04.yaml +++ /dev/null @@ -1,34 +0,0 @@ -apiVersion: holodeck.nvidia.com/v1alpha1 -kind: Environment -metadata: - name: HOLODECK_NAME - description: "end-to-end test infrastructure" -spec: - provider: aws - auth: - keyName: cnt-ci - privateKey: HOLODECK_PRIVATE_KEY - instance: - type: g4dn.xlarge - region: us-west-1 - ingressIpRanges: - - 18.190.12.32/32 - - 3.143.46.93/32 - - 52.15.119.136/32 - - 35.155.108.162/32 - - 35.162.190.51/32 - - 54.201.61.24/32 - - 52.24.205.48/32 - - 44.235.4.62/32 - - 44.230.241.223/32 - image: - architecture: amd64 - imageId: ami-00271c85bf8a52b84 - containerRuntime: - install: true - name: containerd - kubernetes: - install: true - installer: kubeadm - version: v1.33.0 - crictlVersion: v1.33.0 diff --git a/tests/scripts/ci-precompiled-helpers.sh b/tests/scripts/ci-precompiled-helpers.sh index 3050049db..bce9b55f9 100644 --- a/tests/scripts/ci-precompiled-helpers.sh +++ b/tests/scripts/ci-precompiled-helpers.sh @@ -1,6 +1,6 @@ get_kernel_versions_to_test() { - if [[ "$#" -ne 4 ]]; then - echo " Error:$0 must be called with KERNEL_FLAVORS DRIVER_BRANCHES DIST LTS_KERNEL" >&2 + if [[ "$#" -lt 4 || "$#" -gt 5 ]]; then + echo " Error:$0 must be called with KERNEL_FLAVORS DRIVER_BRANCHES DIST LTS_KERNEL or KERNEL_FLAVORS DRIVER_BRANCHES DIST LTS_KERNEL PLATFORM_SUFFIX" >&2 exit 1 fi @@ -8,11 +8,11 @@ get_kernel_versions_to_test() { local -a DRIVER_BRANCHES=("${!2}") local DIST="$3" local LTS_KERNEL="$4" - + local PLATFORM_SUFFIX="$5" kernel_versions=() for kernel_flavor in "${KERNEL_FLAVORS[@]}"; do for DRIVER_BRANCH in "${DRIVER_BRANCHES[@]}"; do - source ./tests/scripts/findkernelversion.sh "${kernel_flavor}" "$DRIVER_BRANCH" "$DIST" "$LTS_KERNEL" >&2 + source ./tests/scripts/findkernelversion.sh "${kernel_flavor}" "$DRIVER_BRANCH" "$DIST" "$LTS_KERNEL" "$PLATFORM_SUFFIX" >&2 if [[ "$should_continue" == true ]]; then break fi @@ -25,7 +25,7 @@ get_kernel_versions_to_test() { # Remove duplicates kernel_versions=($(printf "%s\n" "${kernel_versions[@]}" | sort -u)) for i in "${!kernel_versions[@]}"; do - kernel_versions[$i]="${kernel_versions[$i]}-$DIST" + kernel_versions[$i]="${kernel_versions[$i]}-$DIST$PLATFORM_SUFFIX" done echo "${kernel_versions[@]}" } diff --git a/tests/scripts/findkernelversion.sh b/tests/scripts/findkernelversion.sh index 9731a39c7..d2f322c1c 100755 --- a/tests/scripts/findkernelversion.sh +++ b/tests/scripts/findkernelversion.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [[ $# -ne 4 ]]; then - echo " KERNEL_FLAVOR DRIVER_BRANCH DIST LTS_KERNEL are required" +if [[ $# -lt 4 || $# -gt 5 ]]; then + echo " KERNEL_FLAVOR DRIVER_BRANCH DIST LTS_KERNEL or KERNEL_FLAVOR DRIVER_BRANCH DIST LTS_KERNEL PLATFORM_SUFFIX are required" exit 1 fi @@ -9,6 +9,7 @@ export KERNEL_FLAVOR="${1}" export DRIVER_BRANCH="${2}" export DIST="${3}" export LTS_KERNEL="${4}" +export PLATFORM_SUFFIX="${5}" export REGCTL_VERSION=v0.7.1 mkdir -p bin @@ -18,24 +19,24 @@ export PATH=$(pwd)/bin:${PATH} # calculate kernel version of latest image prefix="kernel-version-${DRIVER_BRANCH}-${LTS_KERNEL}" -suffix="${kernel_flavor}-${DIST}" +suffix="${KERNEL_FLAVOR}-${DIST}" artifact_dir="./kernel-version-artifacts" -artifact=$(find "$artifact_dir" -maxdepth 1 -type d -name "${prefix}*-${suffix}" | head -1) -if [ -n "$artifact" ]; then - artifact_name=$(basename "$artifact") - if [ -f "$artifact/${artifact_name}.tar" ]; then - tar -xf "$artifact/${artifact_name}.tar" -C ./ - export $(grep -oP 'KERNEL_VERSION=[^ ]+' ./kernel_version.txt) - rm -f kernel_version.txt - fi +artifact_file=$(find "$artifact_dir" -maxdepth 1 -type f -name "${prefix}*-${suffix}.tar" | head -1) +if [ -n "$artifact_file" ]; then + tar -xf "$artifact_file" -C ./ + export $(grep -oE 'KERNEL_VERSION=[^ ]+' ./kernel_version.txt) + rm -f kernel_version.txt fi # calculate driver tag status_nvcr=0 status_ghcr=0 -regctl tag ls nvcr.io/nvidia/driver | grep "^${DRIVER_BRANCH}-${KERNEL_VERSION}-${DIST}$" || status_nvcr=$? -regctl tag ls ghcr.io/nvidia/driver | grep "^${DRIVER_BRANCH}-${KERNEL_VERSION}-${DIST}$" || status_ghcr=$? +PLATFORM=$(echo "${PLATFORM_SUFFIX}" | sed 's/-//') +[ -z "$PLATFORM" ] && PLATFORM=amd64 +regctl manifest inspect nvcr.io/nvidia/driver:${DRIVER_BRANCH}-${KERNEL_VERSION}-${DIST} --platform=linux/${PLATFORM} > /dev/null 2>&1; status_nvcr=$? +regctl manifest inspect ghcr.io/nvidia/driver:${DRIVER_BRANCH}-${KERNEL_VERSION}-${DIST} --platform=linux/${PLATFORM} > /dev/null 2>&1; status_ghcr=$? + if [[ $status_nvcr -eq 0 || $status_ghcr -eq 0 ]]; then export should_continue=false else diff --git a/ubuntu24.04/precompiled/Dockerfile b/ubuntu24.04/precompiled/Dockerfile index c36043c7d..7c5d48016 100644 --- a/ubuntu24.04/precompiled/Dockerfile +++ b/ubuntu24.04/precompiled/Dockerfile @@ -17,7 +17,8 @@ ENV NVIDIA_VISIBLE_DEVICES=void RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections -RUN dpkg --add-architecture i386 && \ +# Add i386 architecture only for amd64 builds (not available on ARM) +RUN if [ "$TARGETARCH" = "amd64" ]; then dpkg --add-architecture i386; fi && \ apt-get update && apt-get install -y --no-install-recommends \ apt-utils \ build-essential \ @@ -31,11 +32,12 @@ RUN dpkg --add-architecture i386 && \ pkg-config && \ rm -rf /var/lib/apt/lists/* -# Fetch GPG keys for CUDA repo -RUN apt-key del 3bf863cc && \ +# Fetch GPG keys for CUDA repo (architecture-specific) +RUN CUDA_ARCH=$([ "$TARGETARCH" = "arm64" ] && echo "sbsa" || echo "x86_64") && \ + apt-key del 3bf863cc && \ rm /etc/apt/sources.list.d/cuda.list && \ - curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/3bf863cc.pub | gpg --dearmor -o /etc/apt/keyrings/cuda.pub && \ - echo "deb [signed-by=/etc/apt/keyrings/cuda.pub] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64 /" > /etc/apt/sources.list.d/cuda.list + curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/${CUDA_ARCH}/3bf863cc.pub | gpg --dearmor -o /etc/apt/keyrings/cuda.pub && \ + echo "deb [signed-by=/etc/apt/keyrings/cuda.pub] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/${CUDA_ARCH} /" > /etc/apt/sources.list.d/cuda.list RUN usermod -o -u 0 -g 0 _apt diff --git a/ubuntu24.04/precompiled/local-repo.sh b/ubuntu24.04/precompiled/local-repo.sh index 48e18453d..804d5be28 100755 --- a/ubuntu24.04/precompiled/local-repo.sh +++ b/ubuntu24.04/precompiled/local-repo.sh @@ -68,7 +68,12 @@ download_driver_package_deps () { pushd ${LOCAL_REPO_DIR} download_apt_with_dep linux-objects-nvidia-${DRIVER_BRANCH}-server-${KERNEL_VERSION} - download_apt_with_dep linux-signatures-nvidia-${KERNEL_VERSION} + + # linux-signatures-nvidia (secure boot signatures) is not available for arm64 + if [ "$TARGETARCH" = "amd64" ]; then + download_apt_with_dep linux-signatures-nvidia-${KERNEL_VERSION} + fi + download_apt_with_dep linux-modules-nvidia-${DRIVER_BRANCH}-server-${KERNEL_VERSION} download_apt_with_dep linux-modules-nvidia-${DRIVER_BRANCH}-server-open-${KERNEL_VERSION} download_apt_with_dep nvidia-utils-${DRIVER_BRANCH}-server @@ -76,7 +81,10 @@ download_driver_package_deps () { download_apt_with_dep libnvidia-decode-${DRIVER_BRANCH}-server download_apt_with_dep libnvidia-extra-${DRIVER_BRANCH}-server download_apt_with_dep libnvidia-encode-${DRIVER_BRANCH}-server - download_apt_with_dep libnvidia-fbc1-${DRIVER_BRANCH}-server + # libnvidia-fbc1 (FrameBuffer Capture) is not available for arm64 + if [ "$TARGETARCH" = "amd64" ]; then + download_apt_with_dep libnvidia-fbc1-${DRIVER_BRANCH}-server + fi download_apt_with_dep libnvidia-gl-${DRIVER_BRANCH}-server fabricmanager_download diff --git a/ubuntu24.04/precompiled/nvidia-driver b/ubuntu24.04/precompiled/nvidia-driver index eb887b2d8..4c84ea61a 100755 --- a/ubuntu24.04/precompiled/nvidia-driver +++ b/ubuntu24.04/precompiled/nvidia-driver @@ -416,22 +416,37 @@ _install_driver() { nvidia-headless-no-dkms-${DRIVER_BRANCH}-server \ libnvidia-decode-${DRIVER_BRANCH}-server \ libnvidia-extra-${DRIVER_BRANCH}-server \ - libnvidia-encode-${DRIVER_BRANCH}-server \ - libnvidia-fbc1-${DRIVER_BRANCH}-server \ - libnvidia-gl-${DRIVER_BRANCH}-server + libnvidia-encode-${DRIVER_BRANCH}-server + # libnvidia-fbc1 (FrameBuffer Capture) is not available for arm64 + if [ "$TARGETARCH" = "amd64" ]; then + apt-get install -y --no-install-recommends libnvidia-fbc1-${DRIVER_BRANCH}-server + fi + apt-get install -y --no-install-recommends libnvidia-gl-${DRIVER_BRANCH}-server # Now install the precompiled kernel module packages signed by Canonical + # linux-signatures-nvidia (secure boot signatures) is not available for arm64 if [ "$KERNEL_TYPE" = "kernel-open" ]; then echo "Installing Open NVIDIA driver kernel modules..." - apt-get install --no-install-recommends -y \ - linux-signatures-nvidia-${KERNEL_VERSION} \ - linux-modules-nvidia-${DRIVER_BRANCH}-server-open-${KERNEL_VERSION} + if [ "$TARGETARCH" = "arm64" ]; then + apt-get install --no-install-recommends -y \ + linux-modules-nvidia-${DRIVER_BRANCH}-server-open-${KERNEL_VERSION} + else + apt-get install --no-install-recommends -y \ + linux-signatures-nvidia-${KERNEL_VERSION} \ + linux-modules-nvidia-${DRIVER_BRANCH}-server-open-${KERNEL_VERSION} + fi else echo "Installing Closed NVIDIA driver kernel modules..." - apt-get install --no-install-recommends -y \ - linux-objects-nvidia-${DRIVER_BRANCH}-server-${KERNEL_VERSION} \ - linux-signatures-nvidia-${KERNEL_VERSION} \ - linux-modules-nvidia-${DRIVER_BRANCH}-server-${KERNEL_VERSION} + if [ "$TARGETARCH" = "arm64" ]; then + apt-get install --no-install-recommends -y \ + linux-objects-nvidia-${DRIVER_BRANCH}-server-${KERNEL_VERSION} \ + linux-modules-nvidia-${DRIVER_BRANCH}-server-${KERNEL_VERSION} + else + apt-get install --no-install-recommends -y \ + linux-objects-nvidia-${DRIVER_BRANCH}-server-${KERNEL_VERSION} \ + linux-signatures-nvidia-${KERNEL_VERSION} \ + linux-modules-nvidia-${DRIVER_BRANCH}-server-${KERNEL_VERSION} + fi fi }