Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 28 additions & 1 deletion .ci/scripts/export_model_artifact.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ Arguments:
- nvidia/diar_streaming_sortformer_4spk-v2
- nvidia/parakeet-tdt
- facebook/dinov2-small-imagenet1k-1-layer
- facebook/dinov3-vits16-pretrain-lvd1689m

quant_name Quantization type (optional, default: non-quantized)
Options:
Expand Down Expand Up @@ -176,6 +177,14 @@ case "$HF_MODEL" in
PREPROCESSOR_FEATURE_SIZE=""
PREPROCESSOR_OUTPUT=""
;;
facebook/dinov3-vits16-pretrain-lvd1689m)
MODEL_NAME="dinov3"
TASK=""
MAX_SEQ_LEN=""
EXTRA_PIP=""
PREPROCESSOR_FEATURE_SIZE=""
PREPROCESSOR_OUTPUT=""
;;
mistralai/Voxtral-Mini-4B-Realtime-2602)
MODEL_NAME="voxtral_realtime"
TASK=""
Expand All @@ -186,7 +195,7 @@ case "$HF_MODEL" in
;;
*)
echo "Error: Unsupported model '$HF_MODEL'"
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer"
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, facebook/dinov3-vits16-pretrain-lvd1689m"
exit 1
;;
esac
Expand Down Expand Up @@ -319,6 +328,24 @@ if [ "$MODEL_NAME" = "dinov2" ]; then
exit 0
fi

# DINOv3 uses a custom export script (random weights since classifier head is untrained)
if [ "$MODEL_NAME" = "dinov3" ]; then
  # Model-specific Python deps for the export step.
  pip install -r examples/models/dinov3/install_requirements.txt

  python -m executorch.examples.models.dinov3.export_dinov3 \
    --backend "$DEVICE" \
    --output-dir "${OUTPUT_DIR}" \
    --random-weights

  # Sanity-check the export produced the program file. NOTE(review): these
  # bare `test -f` commands only fail the job if the script runs under
  # `set -e` (set outside this view) — confirm.
  test -f "${OUTPUT_DIR}/model.pte"
  # CUDA exports (Linux and Windows) additionally emit an AOTI data blob.
  if [ "$DEVICE" = "cuda" ] || [ "$DEVICE" = "cuda-windows" ]; then
    test -f "${OUTPUT_DIR}/aoti_cuda_blob.ptd"
  fi
  ls -al "${OUTPUT_DIR}"
  echo "::endgroup::"
  exit 0
fi

# Voxtral Realtime uses a custom export script
if [ "$MODEL_NAME" = "voxtral_realtime" ]; then
pip install safetensors huggingface_hub
Expand Down
24 changes: 22 additions & 2 deletions .ci/scripts/test_model_e2e.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ Arguments:
- Qwen/Qwen3-0.6B
- nvidia/parakeet-tdt
- facebook/dinov2-small-imagenet1k-1-layer
- facebook/dinov3-vits16-pretrain-lvd1689m
- mistralai/Voxtral-Mini-4B-Realtime-2602

quant_name Quantization type (required)
Expand Down Expand Up @@ -204,6 +205,19 @@ case "$HF_MODEL" in
IMAGE_URL="https://github.com/pytorch/hub/raw/master/images/dog.jpg"
IMAGE_PATH=""
;;
facebook/dinov3-vits16-pretrain-lvd1689m)
MODEL_NAME="dinov3"
RUNNER_TARGET="dinov3_runner"
RUNNER_PATH="dinov3"
EXPECTED_OUTPUT="predictions"
PREPROCESSOR=""
TOKENIZER_URL=""
TOKENIZER_FILE=""
AUDIO_URL=""
AUDIO_FILE=""
IMAGE_URL="https://github.com/pytorch/hub/raw/master/images/dog.jpg"
IMAGE_PATH=""
;;
mistralai/Voxtral-Mini-4B-Realtime-2602)
MODEL_NAME="voxtral_realtime"
RUNNER_TARGET="voxtral_realtime_runner"
Expand All @@ -218,7 +232,7 @@ case "$HF_MODEL" in
;;
*)
echo "Error: Unsupported model '$HF_MODEL'"
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, nvidia/diar_streaming_sortformer_4spk-v2, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer"
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, nvidia/diar_streaming_sortformer_4spk-v2, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, facebook/dinov3-vits16-pretrain-lvd1689m"
exit 1
;;
esac
Expand All @@ -232,7 +246,7 @@ echo "::group::Prepare $MODEL_NAME Artifacts"


# Download tokenizer files (skip for models that bundle tokenizer in export or do not use one)
if [ "$MODEL_NAME" != "parakeet" ] && [ "$MODEL_NAME" != "voxtral_realtime" ] && [ "$MODEL_NAME" != "sortformer" ] && [ "$MODEL_NAME" != "dinov2" ]; then
if [ "$MODEL_NAME" != "parakeet" ] && [ "$MODEL_NAME" != "voxtral_realtime" ] && [ "$MODEL_NAME" != "sortformer" ] && [ "$MODEL_NAME" != "dinov2" ] && [ "$MODEL_NAME" != "dinov3" ]; then
if [ "$TOKENIZER_FILE" != "" ]; then
curl -L $TOKENIZER_URL/$TOKENIZER_FILE -o $MODEL_DIR/$TOKENIZER_FILE
else
Expand Down Expand Up @@ -341,6 +355,12 @@ EOF
RUNNER_ARGS="$RUNNER_ARGS --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd"
fi
;;
dinov3)
RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --image_path ${MODEL_DIR}/test_image.jpg"
if [ "$DEVICE" = "cuda" ]; then
RUNNER_ARGS="$RUNNER_ARGS --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd"
fi
;;
voxtral_realtime)
RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE --preprocessor_path ${MODEL_DIR}/$PREPROCESSOR --audio_path ${MODEL_DIR}/$AUDIO_FILE --temperature 0"
# Add CUDA data path if present
Expand Down
22 changes: 21 additions & 1 deletion .ci/scripts/test_model_e2e_windows.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -109,8 +109,21 @@ switch ($HfModel) {
$imageUrl = "https://github.com/pytorch/hub/raw/master/images/dog.jpg"
$imageFile = "test_image.jpg"
}
"facebook/dinov3-vits16-pretrain-lvd1689m" {
$runnerTarget = "dinov3_runner"
$runnerPath = "dinov3"
$runnerPreset = "dinov3-cuda"
$expectedOutput = "predictions"
$preprocessor = ""
$tokenizerUrl = ""
$tokenizerFile = ""
$audioUrl = ""
$audioFile = ""
$imageUrl = "https://github.com/pytorch/hub/raw/master/images/dog.jpg"
$imageFile = "test_image.jpg"
}
default {
throw "Unsupported model '$HfModel'. Supported: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer"
throw "Unsupported model '$HfModel'. Supported: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, facebook/dinov3-vits16-pretrain-lvd1689m"
}
}

Expand Down Expand Up @@ -244,6 +257,13 @@ try {
"--image_path", (Join-Path -Path $resolvedModelDir -ChildPath $imageFile)
)
}
"facebook/dinov3-vits16-pretrain-lvd1689m" {
$runnerArgs = @(
"--model_path", $modelPte,
"--data_path", $cudaBlob,
"--image_path", (Join-Path -Path $resolvedModelDir -ChildPath $imageFile)
)
}
}

$stdoutFile = Join-Path -Path $env:TEMP -ChildPath ("et_runner_stdout_{0}.log" -f ([Guid]::NewGuid().ToString("N")))
Expand Down
10 changes: 8 additions & 2 deletions .github/workflows/cuda-windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,9 @@ jobs:
- model_repo: "facebook"
model_name: "dinov2-small-imagenet1k-1-layer"
quant: "non-quantized"
- model_repo: "facebook"
model_name: "dinov3-vits16-pretrain-lvd1689m"
quant: "non-quantized"
with:
timeout: 90
secrets-env: EXECUTORCH_HF_TOKEN
Expand Down Expand Up @@ -86,8 +89,8 @@ jobs:
PYTHON_EXECUTABLE=python ./install_executorch.sh
echo "::endgroup::"

# Setup Huggingface only for models that need it (not dinov2)
if [ "${{ matrix.model_name }}" != "dinov2-small-imagenet1k-1-layer" ]; then
# Setup Huggingface only for models that need it (not dinov2 or dinov3)
if [ "${{ matrix.model_name }}" != "dinov2-small-imagenet1k-1-layer" ] && [ "${{ matrix.model_name }}" != "dinov3-vits16-pretrain-lvd1689m" ]; then
echo "::group::Setup Huggingface"
pip install -U "huggingface_hub[cli]<1.0" accelerate
huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
Expand Down Expand Up @@ -131,6 +134,9 @@ jobs:
- model_repo: "facebook"
model_name: "dinov2-small-imagenet1k-1-layer"
quant: "non-quantized"
- model_repo: "facebook"
model_name: "dinov3-vits16-pretrain-lvd1689m"
quant: "non-quantized"
with:
timeout: 240
runner: windows.g5.4xlarge.nvidia.gpu
Expand Down
26 changes: 24 additions & 2 deletions .github/workflows/cuda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,8 @@ jobs:
name: "parakeet-tdt"
- repo: "facebook"
name: "dinov2-small-imagenet1k-1-layer"
- repo: "facebook"
name: "dinov3-vits16-pretrain-lvd1689m"
quant:
- "non-quantized"
- "quantized-int4-tile-packed"
Expand Down Expand Up @@ -190,6 +192,15 @@ jobs:
repo: "facebook"
name: "dinov2-small-imagenet1k-1-layer"
quant: "quantized-int4-weight-only"
# DINOv3 currently supports only non-quantized export
- model:
repo: "facebook"
name: "dinov3-vits16-pretrain-lvd1689m"
quant: "quantized-int4-tile-packed"
- model:
repo: "facebook"
name: "dinov3-vits16-pretrain-lvd1689m"
quant: "quantized-int4-weight-only"
with:
timeout: 90
secrets-env: EXECUTORCH_HF_TOKEN
Expand All @@ -209,8 +220,8 @@ jobs:
./install_executorch.sh
echo "::endgroup::"

# Setup Huggingface only for models that need it (not parakeet or dinov2)
if [ "${{ matrix.model.name }}" != "parakeet-tdt" ] && [ "${{ matrix.model.name }}" != "dinov2-small-imagenet1k-1-layer" ]; then
# Setup Huggingface only for models that need it (not parakeet, dinov2, or dinov3)
if [ "${{ matrix.model.name }}" != "parakeet-tdt" ] && [ "${{ matrix.model.name }}" != "dinov2-small-imagenet1k-1-layer" ] && [ "${{ matrix.model.name }}" != "dinov3-vits16-pretrain-lvd1689m" ]; then
echo "::group::Setup Huggingface"
pip install -U "huggingface_hub[cli]<1.0" accelerate
huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
Expand Down Expand Up @@ -248,6 +259,8 @@ jobs:
name: "parakeet-tdt"
- repo: "facebook"
name: "dinov2-small-imagenet1k-1-layer"
- repo: "facebook"
name: "dinov3-vits16-pretrain-lvd1689m"
quant:
- "non-quantized"
- "quantized-int4-tile-packed"
Expand Down Expand Up @@ -285,6 +298,15 @@ jobs:
repo: "facebook"
name: "dinov2-small-imagenet1k-1-layer"
quant: "quantized-int4-weight-only"
# DINOv3 currently supports only non-quantized export
- model:
repo: "facebook"
name: "dinov3-vits16-pretrain-lvd1689m"
quant: "quantized-int4-tile-packed"
- model:
repo: "facebook"
name: "dinov3-vits16-pretrain-lvd1689m"
quant: "quantized-int4-weight-only"
with:
timeout: 90
runner: linux.g5.4xlarge.nvidia.gpu
Expand Down
22 changes: 21 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@
#
# ==============================================================================

.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu clean help
.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal dinov2-cuda dinov2-cuda-debug dinov3-cuda dinov3-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu clean help

help:
@echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make <target>\`. Available targets:"
Expand All @@ -111,6 +111,8 @@ help:
@echo " parakeet-metal - Build Parakeet runner with Metal backend (macOS only)"
@echo " dinov2-cuda - Build DINOv2 runner with CUDA backend"
@echo " dinov2-cuda-debug - Build DINOv2 runner with CUDA backend (debug mode)"
@echo " dinov3-cuda - Build DINOv3 runner with CUDA backend"
@echo " dinov3-cuda-debug - Build DINOv3 runner with CUDA backend (debug mode)"
@echo " sortformer-cuda - Build Sortformer runner with CUDA backend"
@echo " sortformer-cpu - Build Sortformer runner with CPU backend"
@echo " silero-vad-cpu - Build Silero VAD runner with CPU backend"
Expand Down Expand Up @@ -239,6 +241,24 @@ dinov2-cuda-debug:
@echo "✓ Build complete!"
@echo " Binary: cmake-out/examples/models/dinov2/dinov2_runner"

# Release build of the DINOv3 runner: first builds/installs ExecuTorch with the
# CUDA preset, then builds the example runner via its own CMake workflow preset.
dinov3-cuda:
	@echo "==> Building and installing ExecuTorch with CUDA..."
	cmake --workflow --preset llm-release-cuda
	@echo "==> Building DINOv3 runner with CUDA..."
	cd examples/models/dinov3 && cmake --workflow --preset dinov3-cuda
	@echo ""
	@echo "✓ Build complete!"
	@echo " Binary: cmake-out/examples/models/dinov3/dinov3_runner"

# Debug build of the DINOv3 runner: same two-phase build as dinov3-cuda but
# using the debug CMake workflow presets.
dinov3-cuda-debug:
	@echo "==> Building and installing ExecuTorch with CUDA (debug mode)..."
	cmake --workflow --preset llm-debug-cuda
	@echo "==> Building DINOv3 runner with CUDA (debug mode)..."
	cd examples/models/dinov3 && cmake --workflow --preset dinov3-cuda-debug
	@echo ""
	@echo "✓ Build complete!"
	@echo " Binary: cmake-out/examples/models/dinov3/dinov3_runner"

sortformer-cuda:
@echo "==> Building and installing ExecuTorch with CUDA..."
cmake --workflow --preset llm-release-cuda
Expand Down
88 changes: 88 additions & 0 deletions examples/models/dinov3/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Builds the DINOv3 example runner (dinov3_runner): a standalone executable
# (main.cpp) that links the ExecuTorch runtime plus the CUDA (AOTI) backend.

cmake_minimum_required(VERSION 3.24)
project(dinov3_runner)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Repository root: three levels up from examples/models/dinov3.
set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)

include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)

# Let files say "include <executorch/path/to/header.h>"
set(_common_include_directories ${EXECUTORCH_ROOT}/..)

# Need this for gflags (command-line flag parsing); point find_package at the
# gflags config produced in the main ExecuTorch build tree.
set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags)
find_package(gflags REQUIRED)

# Find executorch libraries
list(APPEND CMAKE_FIND_ROOT_PATH ${CMAKE_CURRENT_BINARY_DIR}/../../..)
find_package(executorch CONFIG REQUIRED FIND_ROOT_PATH_BOTH)
get_target_property(_executorch_imported executorch IMPORTED)
if(NOT _executorch_imported)
  # Only for an in-tree (non-imported) executorch target, apply the helper from
  # Utils.cmake — presumably whole-archive-style link options so registrations
  # are not dropped by the linker; TODO confirm against Utils.cmake.
  executorch_target_link_options_shared_lib(executorch)
endif()

set(link_libraries executorch gflags)

# Common ops (optional: only linked when the CPU ops library was built).
if(TARGET optimized_native_cpu_ops_lib)
  list(APPEND link_libraries optimized_native_cpu_ops_lib cpublas eigen_blas)
  get_target_property(_is_imported optimized_native_cpu_ops_lib IMPORTED)
  if(NOT _is_imported)
    executorch_target_link_options_shared_lib(optimized_native_cpu_ops_lib)
  endif()
endif()

# Add the required ExecuTorch extensions
list(APPEND link_libraries extension_module extension_data_loader
  extension_tensor extension_flat_tensor
)

# stb_image: lightweight library to load and resize images
include(FetchContent)
FetchContent_Declare(
  stb
  GIT_REPOSITORY https://github.com/nothings/stb.git
  GIT_TAG f0569113c93ad095470c54bf34a17b36646bbbb5
)
FetchContent_MakeAvailable(stb)
list(APPEND _common_include_directories ${stb_SOURCE_DIR}
  ${stb_SOURCE_DIR}/deprecated
)

# Link CUDA backend (hard requirement — this runner has no CPU-only build path).
find_package(CUDAToolkit REQUIRED)
list(APPEND link_libraries aoti_cuda_backend)
if(NOT MSVC)
  executorch_target_link_options_shared_lib(aoti_cuda_backend)
endif()

add_executable(dinov3_runner main.cpp)
# In non-debug builds, garbage-collect unused sections and strip symbols;
# "LINKER:-s" is skipped on MSVC, whose linker does not accept that flag.
if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
  target_link_options_gc_sections(dinov3_runner)
  if(NOT MSVC)
    target_link_options(dinov3_runner PRIVATE "LINKER:-s")
  endif()
endif()

target_include_directories(dinov3_runner PUBLIC ${_common_include_directories})
target_link_libraries(dinov3_runner PUBLIC ${link_libraries})
# NOTE(review): _common_compile_options is not set anywhere in this file —
# presumably populated by the Utils.cmake include above; verify.
target_compile_options(dinov3_runner PUBLIC ${_common_compile_options})

# On Windows, copy required DLLs to the executable directory
if(MSVC)
  add_custom_command(
    TARGET dinov3_runner
    POST_BUILD
    COMMAND ${CMAKE_COMMAND} -E copy_if_different $<TARGET_FILE:aoti_cuda_shims>
      $<TARGET_FILE_DIR:dinov3_runner>
    COMMENT "Copying aoti_cuda_shims.dll to dinov3_runner directory"
  )
endif()
Loading
Loading