diff --git a/.ci/scripts/export_model_artifact.sh b/.ci/scripts/export_model_artifact.sh
index 9d73c394f2a..d30ecde166d 100755
--- a/.ci/scripts/export_model_artifact.sh
+++ b/.ci/scripts/export_model_artifact.sh
@@ -25,6 +25,7 @@ Arguments:
                  - nvidia/diar_streaming_sortformer_4spk-v2
                  - nvidia/parakeet-tdt
                  - facebook/dinov2-small-imagenet1k-1-layer
+                 - facebook/dinov3-vits16-pretrain-lvd1689m
 
   quant_name   Quantization type (optional, default: non-quantized)
                Options:
@@ -176,6 +177,14 @@ case "$HF_MODEL" in
     PREPROCESSOR_FEATURE_SIZE=""
     PREPROCESSOR_OUTPUT=""
     ;;
+  facebook/dinov3-vits16-pretrain-lvd1689m)
+    MODEL_NAME="dinov3"
+    TASK=""
+    MAX_SEQ_LEN=""
+    EXTRA_PIP=""
+    PREPROCESSOR_FEATURE_SIZE=""
+    PREPROCESSOR_OUTPUT=""
+    ;;
   mistralai/Voxtral-Mini-4B-Realtime-2602)
     MODEL_NAME="voxtral_realtime"
     TASK=""
@@ -186,7 +195,7 @@ case "$HF_MODEL" in
     ;;
   *)
     echo "Error: Unsupported model '$HF_MODEL'"
-    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer"
+    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, facebook/dinov3-vits16-pretrain-lvd1689m"
     exit 1
     ;;
 esac
@@ -319,6 +328,24 @@ if [ "$MODEL_NAME" = "dinov2" ]; then
   exit 0
 fi
 
+# DINOv3 uses a custom export script; --random-weights avoids any HuggingFace download (the classifier head is untrained anyway)
+if [ "$MODEL_NAME" = "dinov3" ]; then
+  pip install -r examples/models/dinov3/install_requirements.txt
+
+  python -m executorch.examples.models.dinov3.export_dinov3 \
+      --backend "$DEVICE" \
+      --output-dir "${OUTPUT_DIR}" \
+      --random-weights
+
+  test -f "${OUTPUT_DIR}/model.pte"  # exported program must exist (presumably aborts via errexit set earlier; confirm)
+  if [ "$DEVICE" = "cuda" ] || [ "$DEVICE" = "cuda-windows" ]; then
+    test -f "${OUTPUT_DIR}/aoti_cuda_blob.ptd"  # CUDA delegate data the runner takes via --data_path
+  fi
+  ls -al "${OUTPUT_DIR}"
+  echo "::endgroup::"
+  exit 0  # done: nothing after this block runs for dinov3
+fi
+
 # Voxtral Realtime uses a custom export script
 if [ "$MODEL_NAME" = "voxtral_realtime" ]; then
   pip install safetensors huggingface_hub
diff --git a/.ci/scripts/test_model_e2e.sh b/.ci/scripts/test_model_e2e.sh
index cb7785036d3..fffdc8f0564 100755
--- a/.ci/scripts/test_model_e2e.sh
+++ b/.ci/scripts/test_model_e2e.sh
@@ -25,6 +25,7 @@ Arguments:
                 - Qwen/Qwen3-0.6B
                 - nvidia/parakeet-tdt
                 - facebook/dinov2-small-imagenet1k-1-layer
+                - facebook/dinov3-vits16-pretrain-lvd1689m
                 - mistralai/Voxtral-Mini-4B-Realtime-2602
 
   quant_name  Quantization type (required)
@@ -204,6 +205,19 @@ case "$HF_MODEL" in
     IMAGE_URL="https://github.com/pytorch/hub/raw/master/images/dog.jpg"
     IMAGE_PATH=""
     ;;
+  facebook/dinov3-vits16-pretrain-lvd1689m)
+    MODEL_NAME="dinov3"
+    RUNNER_TARGET="dinov3_runner"
+    RUNNER_PATH="dinov3"
+    EXPECTED_OUTPUT="predictions"
+    PREPROCESSOR=""
+    TOKENIZER_URL=""
+    TOKENIZER_FILE=""
+    AUDIO_URL=""
+    AUDIO_FILE=""
+    IMAGE_URL="https://github.com/pytorch/hub/raw/master/images/dog.jpg"
+    IMAGE_PATH=""
+    ;;
   mistralai/Voxtral-Mini-4B-Realtime-2602)
     MODEL_NAME="voxtral_realtime"
     RUNNER_TARGET="voxtral_realtime_runner"
@@ -218,7 +232,7 @@ case "$HF_MODEL" in
     ;;
   *)
     echo "Error: Unsupported model '$HF_MODEL'"
-    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, nvidia/diar_streaming_sortformer_4spk-v2, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer"
+    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, nvidia/diar_streaming_sortformer_4spk-v2, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, facebook/dinov3-vits16-pretrain-lvd1689m"
     exit 1
     ;;
 esac
@@ -232,7 +246,7 @@ echo "::group::Prepare $MODEL_NAME Artifacts"
 
 
 # Download tokenizer files (skip for models that bundle tokenizer in export or do not use one)
-if [ "$MODEL_NAME" != "parakeet" ] && [ "$MODEL_NAME" != "voxtral_realtime" ] && [ "$MODEL_NAME" != "sortformer" ] && [ "$MODEL_NAME" != "dinov2" ]; then
+if [ "$MODEL_NAME" != "parakeet" ] && [ "$MODEL_NAME" != "voxtral_realtime" ] && [ "$MODEL_NAME" != "sortformer" ] && [ "$MODEL_NAME" != "dinov2" ] && [ "$MODEL_NAME" != "dinov3" ]; then
   if [ "$TOKENIZER_FILE" != "" ]; then
     curl -L $TOKENIZER_URL/$TOKENIZER_FILE -o $MODEL_DIR/$TOKENIZER_FILE
   else
@@ -341,6 +355,12 @@ EOF
       RUNNER_ARGS="$RUNNER_ARGS --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd"
     fi
     ;;
+  dinov3)
+    RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --image_path ${MODEL_DIR}/test_image.jpg"
+    if [ "$DEVICE" = "cuda" ]; then
+      RUNNER_ARGS="$RUNNER_ARGS --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd"
+    fi
+    ;;
   voxtral_realtime)
     RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE --preprocessor_path ${MODEL_DIR}/$PREPROCESSOR --audio_path ${MODEL_DIR}/$AUDIO_FILE --temperature 0"
     # Add CUDA data path if present
diff --git a/.ci/scripts/test_model_e2e_windows.ps1 b/.ci/scripts/test_model_e2e_windows.ps1
index 491971f339f..de5ce6cb33d 100644
--- a/.ci/scripts/test_model_e2e_windows.ps1
+++ b/.ci/scripts/test_model_e2e_windows.ps1
@@ -109,8 +109,21 @@ switch ($HfModel) {
         $imageUrl = "https://github.com/pytorch/hub/raw/master/images/dog.jpg"
         $imageFile = "test_image.jpg"
     }
+    "facebook/dinov3-vits16-pretrain-lvd1689m" {
+        $runnerTarget = "dinov3_runner"
+        $runnerPath = "dinov3"
+        $runnerPreset = "dinov3-cuda"
+        $expectedOutput = "predictions"
+        $preprocessor = ""
+        $tokenizerUrl = ""
+        $tokenizerFile = ""
+        $audioUrl = ""
+        $audioFile = ""
+        $imageUrl = "https://github.com/pytorch/hub/raw/master/images/dog.jpg"
+        $imageFile = "test_image.jpg"
+    }
     default {
-        throw "Unsupported model '$HfModel'. Supported: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer"
+        throw "Unsupported model '$HfModel'. Supported: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, facebook/dinov3-vits16-pretrain-lvd1689m"
     }
 }
 
@@ -244,6 +257,13 @@ try {
                 "--image_path", (Join-Path -Path $resolvedModelDir -ChildPath $imageFile)
             )
         }
+        "facebook/dinov3-vits16-pretrain-lvd1689m" {
+            $runnerArgs = @(
+                "--model_path", $modelPte,
+                "--data_path", $cudaBlob,
+                "--image_path", (Join-Path -Path $resolvedModelDir -ChildPath $imageFile)
+            )
+        }
     }
 
     $stdoutFile = Join-Path -Path $env:TEMP -ChildPath ("et_runner_stdout_{0}.log" -f ([Guid]::NewGuid().ToString("N")))
diff --git a/.github/workflows/cuda-windows.yml b/.github/workflows/cuda-windows.yml
index 1b33fa32f61..1cdc75f1fbb 100644
--- a/.github/workflows/cuda-windows.yml
+++ b/.github/workflows/cuda-windows.yml
@@ -50,6 +50,9 @@ jobs:
           - model_repo: "facebook"
             model_name: "dinov2-small-imagenet1k-1-layer"
             quant: "non-quantized"
+          - model_repo: "facebook"
+            model_name: "dinov3-vits16-pretrain-lvd1689m"
+            quant: "non-quantized"
     with:
       timeout: 90
       secrets-env: EXECUTORCH_HF_TOKEN
@@ -86,8 +89,8 @@ jobs:
         PYTHON_EXECUTABLE=python ./install_executorch.sh
         echo "::endgroup::"
 
-        # Setup Huggingface only for models that need it (not dinov2)
-        if [ "${{ matrix.model_name }}" != "dinov2-small-imagenet1k-1-layer" ]; then
+        # Setup Huggingface only for models that need it (not dinov2 or dinov3)
+        if [ "${{ matrix.model_name }}" != "dinov2-small-imagenet1k-1-layer" ] && [ "${{ matrix.model_name }}" != "dinov3-vits16-pretrain-lvd1689m" ]; then
           echo "::group::Setup Huggingface"
           pip install -U "huggingface_hub[cli]<1.0" accelerate
           huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
@@ -131,6 +134,9 @@ jobs:
           - model_repo: "facebook"
             model_name: "dinov2-small-imagenet1k-1-layer"
             quant: "non-quantized"
+          - model_repo: "facebook"
+            model_name: "dinov3-vits16-pretrain-lvd1689m"
+            quant: "non-quantized"
     with:
       timeout: 240
       runner: windows.g5.4xlarge.nvidia.gpu
diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
index de2b2e78c02..0351718d2a9 100644
--- a/.github/workflows/cuda.yml
+++ b/.github/workflows/cuda.yml
@@ -153,6 +153,8 @@ jobs:
             name: "parakeet-tdt"
           - repo: "facebook"
             name: "dinov2-small-imagenet1k-1-layer"
+          - repo: "facebook"
+            name: "dinov3-vits16-pretrain-lvd1689m"
         quant:
           - "non-quantized"
           - "quantized-int4-tile-packed"
@@ -190,6 +192,15 @@ jobs:
               repo: "facebook"
               name: "dinov2-small-imagenet1k-1-layer"
             quant: "quantized-int4-weight-only"
+          # DINOv3 currently supports only non-quantized export
+          - model:
+              repo: "facebook"
+              name: "dinov3-vits16-pretrain-lvd1689m"
+            quant: "quantized-int4-tile-packed"
+          - model:
+              repo: "facebook"
+              name: "dinov3-vits16-pretrain-lvd1689m"
+            quant: "quantized-int4-weight-only"
     with:
       timeout: 90
       secrets-env: EXECUTORCH_HF_TOKEN
@@ -209,8 +220,8 @@ jobs:
         ./install_executorch.sh
         echo "::endgroup::"
 
-        # Setup Huggingface only for models that need it (not parakeet or dinov2)
-        if [ "${{ matrix.model.name }}" != "parakeet-tdt" ] && [ "${{ matrix.model.name }}" != "dinov2-small-imagenet1k-1-layer" ]; then
+        # Setup Huggingface only for models that need it (not parakeet, dinov2, or dinov3)
+        if [ "${{ matrix.model.name }}" != "parakeet-tdt" ] && [ "${{ matrix.model.name }}" != "dinov2-small-imagenet1k-1-layer" ] && [ "${{ matrix.model.name }}" != "dinov3-vits16-pretrain-lvd1689m" ]; then
           echo "::group::Setup Huggingface"
           pip install -U "huggingface_hub[cli]<1.0" accelerate
           huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
@@ -248,6 +259,8 @@ jobs:
             name: "parakeet-tdt"
           - repo: "facebook"
             name: "dinov2-small-imagenet1k-1-layer"
+          - repo: "facebook"
+            name: "dinov3-vits16-pretrain-lvd1689m"
         quant:
           - "non-quantized"
           - "quantized-int4-tile-packed"
@@ -285,6 +298,15 @@ jobs:
               repo: "facebook"
               name: "dinov2-small-imagenet1k-1-layer"
             quant: "quantized-int4-weight-only"
+          # DINOv3 currently supports only non-quantized export
+          - model:
+              repo: "facebook"
+              name: "dinov3-vits16-pretrain-lvd1689m"
+            quant: "quantized-int4-tile-packed"
+          - model:
+              repo: "facebook"
+              name: "dinov3-vits16-pretrain-lvd1689m"
+            quant: "quantized-int4-weight-only"
     with:
       timeout: 90
       runner: linux.g5.4xlarge.nvidia.gpu
diff --git a/Makefile b/Makefile
index 6b8ea37e7b2..c00c2c0fea6 100644
--- a/Makefile
+++ b/Makefile
@@ -91,7 +91,7 @@
 #
 # ==============================================================================
 
-.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu clean help
+.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal dinov2-cuda dinov2-cuda-debug dinov3-cuda dinov3-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu clean help
 
 help:
 	@echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make <target>\`. Available targets:"
@@ -111,6 +111,8 @@ help:
 	@echo "  parakeet-metal      - Build Parakeet runner with Metal backend (macOS only)"
 	@echo "  dinov2-cuda         - Build DINOv2 runner with CUDA backend"
 	@echo "  dinov2-cuda-debug   - Build DINOv2 runner with CUDA backend (debug mode)"
+	@echo "  dinov3-cuda         - Build DINOv3 runner with CUDA backend"
+	@echo "  dinov3-cuda-debug   - Build DINOv3 runner with CUDA backend (debug mode)"
 	@echo "  sortformer-cuda     - Build Sortformer runner with CUDA backend"
 	@echo "  sortformer-cpu      - Build Sortformer runner with CPU backend"
 	@echo "  silero-vad-cpu      - Build Silero VAD runner with CPU backend"
@@ -239,6 +241,24 @@ dinov2-cuda-debug:
 	@echo "✓ Build complete!"
 	@echo "  Binary: cmake-out/examples/models/dinov2/dinov2_runner"
 
+dinov3-cuda: # two stages: core ExecuTorch (llm-release-cuda preset), then the runner's own preset
+	@echo "==> Building and installing ExecuTorch with CUDA..."
+	cmake --workflow --preset llm-release-cuda
+	@echo "==> Building DINOv3 runner with CUDA..."
+	cd examples/models/dinov3 && cmake --workflow --preset dinov3-cuda
+	@echo ""
+	@echo "✓ Build complete!"
+	@echo "  Binary: cmake-out/examples/models/dinov3/dinov3_runner"
+
+dinov3-cuda-debug: # same two stages as dinov3-cuda, using the llm-debug-cuda and dinov3-cuda-debug presets
+	@echo "==> Building and installing ExecuTorch with CUDA (debug mode)..."
+	cmake --workflow --preset llm-debug-cuda
+	@echo "==> Building DINOv3 runner with CUDA (debug mode)..."
+	cd examples/models/dinov3 && cmake --workflow --preset dinov3-cuda-debug
+	@echo ""
+	@echo "✓ Build complete!"
+	@echo "  Binary: cmake-out/examples/models/dinov3/dinov3_runner"
+
 sortformer-cuda:
 	@echo "==> Building and installing ExecuTorch with CUDA..."
 	cmake --workflow --preset llm-release-cuda
diff --git a/examples/models/dinov3/CMakeLists.txt b/examples/models/dinov3/CMakeLists.txt
new file mode 100644
index 00000000000..53558e8fa23
--- /dev/null
+++ b/examples/models/dinov3/CMakeLists.txt
@@ -0,0 +1,88 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+cmake_minimum_required(VERSION 3.24)
+project(dinov3_runner)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
+
+include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
+
+# Let files say "include <executorch/path/to/header.h>"
+set(_common_include_directories ${EXECUTORCH_ROOT}/..)
+
+# gflags config is looked up three levels above this build dir, in third-party/
+set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags)
+find_package(gflags REQUIRED)
+
+# Find executorch libraries
+list(APPEND CMAKE_FIND_ROOT_PATH ${CMAKE_CURRENT_BINARY_DIR}/../../..)
+find_package(executorch CONFIG REQUIRED FIND_ROOT_PATH_BOTH)
+get_target_property(_executorch_imported executorch IMPORTED)
+if(NOT _executorch_imported)
+  executorch_target_link_options_shared_lib(executorch)
+endif()
+
+set(link_libraries executorch gflags)
+
+# Common ops
+if(TARGET optimized_native_cpu_ops_lib)
+  list(APPEND link_libraries optimized_native_cpu_ops_lib cpublas eigen_blas)
+  get_target_property(_is_imported optimized_native_cpu_ops_lib IMPORTED)
+  if(NOT _is_imported)
+    executorch_target_link_options_shared_lib(optimized_native_cpu_ops_lib)
+  endif()
+endif()
+
+# Add the required ExecuTorch extensions
+list(APPEND link_libraries extension_module extension_data_loader
+     extension_tensor extension_flat_tensor
+)
+
+# stb: header-only image load/resize; deprecated/ provides stb_image_resize.h
+include(FetchContent)
+FetchContent_Declare(
+  stb
+  GIT_REPOSITORY https://github.com/nothings/stb.git
+  GIT_TAG f0569113c93ad095470c54bf34a17b36646bbbb5
+)
+FetchContent_MakeAvailable(stb)
+list(APPEND _common_include_directories ${stb_SOURCE_DIR}
+     ${stb_SOURCE_DIR}/deprecated
+)
+
+# Link CUDA backend
+find_package(CUDAToolkit REQUIRED)
+list(APPEND link_libraries aoti_cuda_backend)
+if(NOT MSVC)
+  executorch_target_link_options_shared_lib(aoti_cuda_backend)
+endif()
+
+add_executable(dinov3_runner main.cpp)
+if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
+  target_link_options_gc_sections(dinov3_runner)
+  if(NOT MSVC)
+    target_link_options(dinov3_runner PRIVATE "LINKER:-s")
+  endif()
+endif()
+
+target_include_directories(dinov3_runner PUBLIC ${_common_include_directories})
+target_link_libraries(dinov3_runner PUBLIC ${link_libraries})
+target_compile_options(dinov3_runner PUBLIC ${_common_compile_options})
+
+# On Windows, copy required DLLs to the executable directory
+if(MSVC)
+  add_custom_command(
+    TARGET dinov3_runner
+    POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E copy_if_different $<TARGET_FILE:aoti_cuda_shims>
+            $<TARGET_FILE_DIR:dinov3_runner>
+    COMMENT "Copying aoti_cuda_shims.dll to dinov3_runner directory"
+  )
+endif()
diff --git a/examples/models/dinov3/CMakePresets.json b/examples/models/dinov3/CMakePresets.json
new file mode 100644
index 00000000000..7e36c759fc2
--- /dev/null
+++ b/examples/models/dinov3/CMakePresets.json
@@ -0,0 +1,88 @@
+{
+    "version": 6,
+    "configurePresets": [
+        {
+            "name": "dinov3-base",
+            "hidden": true,
+            "binaryDir": "${sourceDir}/../../../cmake-out/examples/models/dinov3",
+            "cacheVariables": {
+                "CMAKE_BUILD_TYPE": "Release",
+                "CMAKE_FIND_ROOT_PATH": "${sourceDir}/../../../cmake-out",
+                "CMAKE_PREFIX_PATH": "${sourceDir}/../../../cmake-out"
+            }
+        },
+        {
+            "name": "dinov3-cuda",
+            "displayName": "DINOv3 runner (CUDA)",
+            "inherits": ["dinov3-base"],
+            "cacheVariables": {
+                "EXECUTORCH_BUILD_CUDA": "ON"
+            },
+            "condition": {
+                "type": "inList",
+                "string": "${hostSystemName}",
+                "list": ["Linux", "Windows"]
+            }
+        },
+        {
+            "name": "dinov3-cuda-debug",
+            "displayName": "DINOv3 runner (CUDA, Debug)",
+            "inherits": ["dinov3-base"],
+            "cacheVariables": {
+                "CMAKE_BUILD_TYPE": "Debug",
+                "EXECUTORCH_BUILD_CUDA": "ON"
+            },
+            "condition": {
+                "type": "inList",
+                "string": "${hostSystemName}",
+                "list": ["Linux", "Windows"]
+            }
+        }
+    ],
+    "buildPresets": [
+        {
+            "name": "dinov3-cuda",
+            "displayName": "Build DINOv3 runner (CUDA)",
+            "configurePreset": "dinov3-cuda",
+            "configuration": "Release",
+            "targets": ["dinov3_runner"]
+        },
+        {
+            "name": "dinov3-cuda-debug",
+            "displayName": "Build DINOv3 runner (CUDA, Debug)",
+            "configurePreset": "dinov3-cuda-debug",
+            "configuration": "Debug",
+            "targets": ["dinov3_runner"]
+        }
+    ],
+    "workflowPresets": [
+        {
+            "name": "dinov3-cuda",
+            "displayName": "Configure and build DINOv3 runner (CUDA)",
+            "steps": [
+                {
+                    "type": "configure",
+                    "name": "dinov3-cuda"
+                },
+                {
+                    "type": "build",
+                    "name": "dinov3-cuda"
+                }
+            ]
+        },
+        {
+            "name": "dinov3-cuda-debug",
+            "displayName": "Configure and build DINOv3 runner (CUDA, Debug)",
+            "steps": [
+                {
+                    "type": "configure",
+                    "name": "dinov3-cuda-debug"
+                },
+                {
+                    "type": "build",
+                    "name": "dinov3-cuda-debug"
+                }
+            ]
+        }
+    ]
+}
diff --git a/examples/models/dinov3/__init__.py b/examples/models/dinov3/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/examples/models/dinov3/dog.jpg b/examples/models/dinov3/dog.jpg
new file mode 100644
index 00000000000..12f0e0dd116
Binary files /dev/null and b/examples/models/dinov3/dog.jpg differ
diff --git a/examples/models/dinov3/export_dinov3.py b/examples/models/dinov3/export_dinov3.py
new file mode 100644
index 00000000000..8a7590c425d
--- /dev/null
+++ b/examples/models/dinov3/export_dinov3.py
@@ -0,0 +1,207 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Export DINOv3 ViT image classification model for ExecuTorch with CUDA backend.
+
+DINOv3 ViT is a backbone model without a built-in classifier head, so this
+script wraps it with a linear classification layer for ImageNet-1k inference.
+
+Usage:
+    python examples/models/dinov3/export_dinov3.py \
+        --backend cuda --output-dir ./dinov3_exports
+
+    # With fp32 precision:
+    python examples/models/dinov3/export_dinov3.py \
+        --backend cuda --dtype fp32 --output-dir ./dinov3_exports
+
+    # For Windows CUDA:
+    python examples/models/dinov3/export_dinov3.py \
+        --backend cuda-windows --output-dir ./dinov3_exports
+"""
+
+import argparse
+import os
+
+import torch
+import torch.nn as nn
+from executorch.backends.cuda.cuda_backend import CudaBackend
+from executorch.backends.cuda.cuda_partitioner import CudaPartitioner
+from executorch.exir import (
+    EdgeCompileConfig,
+    ExecutorchBackendConfig,
+    to_edge_transform_and_lower,
+)
+from executorch.exir.backend.compile_spec_schema import CompileSpec
+from executorch.exir.passes import MemoryPlanningPass
+from transformers import DINOv3ViTConfig, DINOv3ViTModel
+
+
+class DINOv3ViTForImageClassification(nn.Module):
+    """DINOv3 ViT backbone plus a linear head over its pooled output."""
+
+    def __init__(self, backbone, num_classes=1000):
+        super().__init__()
+        self.backbone = backbone
+        self.classifier = nn.Linear(backbone.config.hidden_size, num_classes)
+
+    def forward(self, pixel_values):
+        # Only the backbone's pooler_output feeds the classifier.
+        pooled = self.backbone(pixel_values).pooler_output
+        return self.classifier(pooled)
+
+
+def get_model(
+    model_name: str = "facebook/dinov3-vits16-pretrain-lvd1689m",
+    random_weights: bool = False,
+):
+    """Build the DINOv3 backbone plus a 1000-way linear head, returned in eval mode."""
+    # random_weights=True builds from DINOv3ViTConfig() defaults without
+    # downloading anything; otherwise `model_name` goes through from_pretrained.
+    backbone = (
+        DINOv3ViTModel(DINOv3ViTConfig())
+        if random_weights
+        else DINOv3ViTModel.from_pretrained(model_name)
+    )
+    return DINOv3ViTForImageClassification(backbone, num_classes=1000).eval()
+
+
+def export_model(model, sample_input, dtype=None):
+    """Run non-strict torch.export, casting model and input to bf16 first if asked."""
+    if dtype == torch.bfloat16:
+        # Cast the module and its example input together so their dtypes match.
+        model = model.to(dtype=torch.bfloat16)
+        pixel_values = sample_input[0].to(dtype=torch.bfloat16)
+        sample_input = (pixel_values,)
+    return torch.export.export(model, sample_input, strict=False)
+
+
+def lower_to_executorch(exported_program, backend="cuda", metadata=None):
+    """Lower an ExportedProgram to an ExecuTorch program delegated to CUDA.
+
+    `backend` "cuda-windows" adds a platform=windows compile spec; `metadata`
+    entries are embedded as constant methods alongside "forward".
+    """
+    from torch._inductor.decomposition import conv1d_to_conv2d
+
+    # Rewrite aten.conv1d as conv2d before handing the graph to the partitioner.
+    decomposed = exported_program.run_decompositions(
+        {torch.ops.aten.conv1d.default: conv1d_to_conv2d}
+    )
+
+    specs = [CudaBackend.generate_method_name_compile_spec("forward")]
+    if backend == "cuda-windows":
+        specs.append(CompileSpec("platform", "windows".encode("utf-8")))
+
+    # Shallow copy of the caller's mapping; an empty or missing mapping is
+    # forwarded as None.
+    constant_methods = dict(metadata) if metadata else None
+
+    edge_config = EdgeCompileConfig(
+        _check_ir_validity=False,
+        _skip_dim_order=True,
+    )
+    et_prog = to_edge_transform_and_lower(
+        {"forward": decomposed},
+        partitioner={"forward": [CudaPartitioner(specs)]},
+        compile_config=edge_config,
+        constant_methods=constant_methods,
+    )
+
+    backend_config = ExecutorchBackendConfig(
+        extract_delegate_segments=True,
+        memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False),
+        do_quant_fusion_and_const_prop=True,
+    )
+    return et_prog.to_executorch(config=backend_config)
+
+
+def main():
+    """CLI entry point: export DINOv3 plus classifier head and write the artifacts."""
+    parser = argparse.ArgumentParser(
+        description="Export DINOv3 ViT model for ExecuTorch CUDA backend"
+    )
+    parser.add_argument(
+        "--model-name",
+        type=str,
+        default="facebook/dinov3-vits16-pretrain-lvd1689m",
+        help="HuggingFace model name for DINOv3 ViT",
+    )
+    parser.add_argument(
+        "--dtype",
+        type=str,
+        default="bf16",
+        choices=["bf16", "fp32"],
+        help="Data type for export (default: bf16, required for CUDA Triton SDPA)",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default="./dinov3_exports",
+        help="Output directory for exported artifacts",
+    )
+    parser.add_argument(
+        "--img-size",
+        type=int,
+        default=224,
+        help="Input image size (default: 224)",
+    )
+    parser.add_argument(
+        "--backend",
+        type=str,
+        default="cuda",
+        choices=["cuda", "cuda-windows"],
+        help="Backend to export for (default: cuda)",
+    )
+    parser.add_argument(
+        "--random-weights",
+        action="store_true",
+        help="Use random weights instead of pretrained (for pipeline testing)",
+    )
+    args = parser.parse_args()
+
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    # None means "leave the model in fp32"; export_model only casts for bf16.
+    dtype = torch.bfloat16 if args.dtype == "bf16" else None
+
+    if args.random_weights:
+        print("Building DINOv3 ViT model with random weights (default config)")
+    else:
+        print(f"Loading DINOv3 ViT model: {args.model_name}")
+    model = get_model(args.model_name, random_weights=args.random_weights)
+
+    # fp32 example input; export_model casts it together with the model for bf16.
+    sample_input = (torch.randn(1, 3, args.img_size, args.img_size),)
+
+    print(f"Exporting model with torch.export (dtype={args.dtype})...")
+    exported = export_model(model, sample_input, dtype=dtype)
+
+    # Metadata to embed in the .pte file
+    metadata = {
+        "get_img_size": args.img_size,
+        "get_num_classes": 1000,
+    }
+
+    print(f"Lowering to ExecuTorch with {args.backend} backend...")
+    et = lower_to_executorch(exported, backend=args.backend, metadata=metadata)
+
+    # Save the .pte file
+    pte_path = os.path.join(args.output_dir, "model.pte")
+    with open(pte_path, "wb") as f:
+        et.write_to_file(f)
+    print(f"Saved model to {pte_path}")
+
+    # Save tensor data (.ptd)
+    if et._tensor_data:
+        et.write_tensor_data_to_file(args.output_dir)
+        print(f"Saved tensor data to {args.output_dir}/")
+
+    print("Export complete!")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/models/dinov3/install_requirements.txt b/examples/models/dinov3/install_requirements.txt
new file mode 100644
index 00000000000..8c167f1872e
--- /dev/null
+++ b/examples/models/dinov3/install_requirements.txt
@@ -0,0 +1,2 @@
+transformers>=4.56.0  # first release that ships DINOv3ViTConfig/DINOv3ViTModel
+torch
diff --git a/examples/models/dinov3/main.cpp b/examples/models/dinov3/main.cpp
new file mode 100644
index 00000000000..d2ea1519328
--- /dev/null
+++ b/examples/models/dinov3/main.cpp
@@ -0,0 +1,290 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * DINOv3 ViT image classification runner for ExecuTorch.
+ *
+ * Usage:
+ *   ./dinov3_runner --model_path model.pte --data_path aoti_cuda_blob.ptd \
+ *                   --image_path image.jpg
+ */
+
+#include <algorithm>
+#include <cstdint>
+#include <fstream>
+#include <iostream>
+#include <numeric>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#define STB_IMAGE_IMPLEMENTATION
+#include <stb_image.h>
+#define STB_IMAGE_RESIZE_IMPLEMENTATION
+#include <stb_image_resize.h>
+
+#include <gflags/gflags.h>
+
+#include <executorch/extension/module/module.h>
+#include <executorch/extension/tensor/tensor_ptr.h>
+#include <executorch/extension/tensor/tensor_ptr_maker.h>
+#include <executorch/runtime/core/evalue.h>
+#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
+#include <executorch/runtime/platform/log.h>
+
+// Command-line flags (gflags). Each DEFINE_* below creates a FLAGS_<name>
+// variable that main() reads after gflags::ParseCommandLineFlags().
+
+// Program file handed to the Module constructor in main().
+DEFINE_string(model_path, "model.pte", "Path to DINOv3 model (.pte).");
+// Optional external data file; when non-empty, main() passes it to the Module
+// constructor together with model_path.
+DEFINE_string(
+    data_path,
+    "",
+    "Path to data file (.ptd) for CUDA delegate data.");
+// Image to classify; when empty, main() feeds generated random input instead.
+DEFINE_string(
+    image_path,
+    "",
+    "Path to input image file (.jpg, .png, .bmp). "
+    "If empty, uses random input for testing.");
+// Side length of the square input; main() builds a (1, 3, img_size, img_size)
+// input tensor from it.
+DEFINE_int32(img_size, 224, "Input image size (default: 224).");
+// Number of predictions main() asks print_top_k() to display.
+DEFINE_int32(top_k, 5, "Number of top predictions to display (default: 5).");
+// Selects the input tensor dtype in main(): BFloat16 when true, Float when
+// false.
+DEFINE_bool(
+    bf16,
+    true,
+    "Use bfloat16 input (default: true, matching export dtype).");
+
+using ::executorch::extension::from_blob;
+using ::executorch::extension::Module;
+using ::executorch::runtime::Error;
+using ::executorch::runtime::EValue;
+
+namespace {
+
+// ImageNet normalization constants: per-channel mean and standard deviation
+// applied to pixel values after they are scaled to [0, 1].
+constexpr float kImageNetMean[] = {0.485f, 0.456f, 0.406f};
+constexpr float kImageNetStd[] = {0.229f, 0.224f, 0.225f};
+
+/**
+ * Load an image file, resize it to target_size x target_size, and apply
+ * ImageNet normalization.
+ *
+ * @param path Image file to decode; it is always decoded to 3 channels.
+ * @param target_size Side length of the square output; must be positive.
+ * @return Normalized float data in CHW order holding
+ *     3 * target_size * target_size elements, or an empty vector when
+ *     target_size is not positive, the file cannot be decoded, or the resize
+ *     step fails. Each failure is also reported through ET_LOG.
+ */
+std::vector<float> load_image(const std::string& path, int target_size) {
+  // A negative size would otherwise square to a positive element count and
+  // silently produce a buffer that is never filled.
+  if (target_size <= 0) {
+    ET_LOG(Error, "Invalid target image size: %d", target_size);
+    return {};
+  }
+
+  // Do the size arithmetic in size_t so large sizes cannot overflow int.
+  const size_t side = static_cast<size_t>(target_size);
+  const size_t spatial = side * side;
+
+  // Allocate both buffers before decoding so that an allocation failure
+  // cannot leak the stb-owned pixel buffer.
+  std::vector<unsigned char> resized(spatial * 3);
+  std::vector<float> chw_data(3 * spatial);
+
+  int width = 0;
+  int height = 0;
+  int channels = 0;
+  unsigned char* raw = stbi_load(path.c_str(), &width, &height, &channels, 3);
+  if (!raw) {
+    ET_LOG(Error, "Failed to load image: %s", path.c_str());
+    return {};
+  }
+
+  // Resize to target_size x target_size; stbir_resize_uint8 returns 0 on
+  // failure, in which case `resized` does not hold a valid image.
+  const int resize_ok = stbir_resize_uint8(
+      raw, width, height, 0, resized.data(), target_size, target_size, 0, 3);
+  stbi_image_free(raw);
+  if (!resize_ok) {
+    ET_LOG(Error, "Failed to resize image: %s", path.c_str());
+    return {};
+  }
+
+  // Convert interleaved HWC bytes to planar CHW floats with ImageNet
+  // normalization.
+  for (size_t h = 0; h < side; ++h) {
+    for (size_t w = 0; w < side; ++w) {
+      const size_t hwc_idx = (h * side + w) * 3;
+      for (size_t c = 0; c < 3; ++c) {
+        const float pixel = static_cast<float>(resized[hwc_idx + c]) / 255.0f;
+        chw_data[c * spatial + h * side + w] =
+            (pixel - kImageNetMean[c]) / kImageNetStd[c];
+      }
+    }
+  }
+  return chw_data;
+}
+
+/**
+ * Build a buffer of `size` pseudo-random floats to use as stand-in model
+ * input.
+ *
+ * Each value comes from rand(), mapped linearly from [0, RAND_MAX] onto
+ * [-1, 1]. This function does not seed the generator.
+ */
+std::vector<float> generate_random_input(size_t size) {
+  std::vector<float> samples(size);
+  std::generate(samples.begin(), samples.end(), [] {
+    return static_cast<float>(rand()) / RAND_MAX * 2.0f - 1.0f;
+  });
+  return samples;
+}
+
+/**
+ * Look up a human-readable name for an ImageNet-1k class index.
+ *
+ * Only a small subset of the class indices is present in the table.
+ *
+ * @param class_id Class index to look up.
+ * @return The label string for class_id, or nullptr when the index is not in
+ *     the table.
+ */
+const char* get_imagenet_label(int class_id) {
+  using LabelTable = std::unordered_map<int, const char*>;
+  // Built once on first call and shared by all later calls.
+  static const LabelTable known_labels = {
+      {0, "tench"},
+      {1, "goldfish"},
+      {2, "great white shark"},
+      {6, "stingray"},
+      {15, "robin"},
+      {65, "sea snake"},
+      {99, "goose"},
+      {207, "golden retriever"},
+      {208, "Labrador retriever"},
+      {229, "Old English sheepdog"},
+      {232, "Border collie"},
+      {243, "bull mastiff"},
+      {258, "Samoyed"},
+      {281, "tabby cat"},
+      {282, "tiger cat"},
+      {283, "Persian cat"},
+      {285, "Egyptian cat"},
+      {291, "lion"},
+      {292, "tiger"},
+      {340, "zebra"},
+      {355, "llama"},
+      {360, "otter"},
+      {386, "African elephant"},
+      {388, "giant panda"},
+      {463, "bucket"},
+      {508, "computer keyboard"},
+      {530, "digital clock"},
+      {543, "drum"},
+      {620, "laptop"},
+      {717, "pickup truck"},
+      {751, "racket"},
+      {779, "school bus"},
+      {817, "sports car"},
+      {849, "teapot"},
+      {852, "tennis ball"},
+      {864, "tow truck"},
+      {895, "warplane"},
+      {920, "traffic light"},
+      {948, "Granny Smith"},
+      {950, "orange"},
+      {954, "banana"},
+      {963, "pizza"},
+  };
+
+  if (const auto entry = known_labels.find(class_id);
+      entry != known_labels.end()) {
+    return entry->second;
+  }
+  return nullptr;
+}
+
+/**
+ * Print the highest-scoring classes from a logits array to stdout.
+ *
+ * @param logits Array of at least num_classes scores, indexed by class id.
+ * @param num_classes Number of entries in logits; values <= 0 (or a null
+ *     logits pointer) are treated as "no classes".
+ * @param k Number of predictions requested. It is clamped to
+ *     [0, num_classes], and the clamped count is what the header reports.
+ */
+void print_top_k(const float* logits, int num_classes, int k) {
+  // Clamp both counts: partial_sort requires its middle iterator to lie
+  // within [begin, end], so an unclamped k > num_classes (or k < 0) would be
+  // undefined behaviour.
+  const int total = (logits != nullptr) ? std::max(num_classes, 0) : 0;
+  const int shown = std::clamp(k, 0, total);
+
+  std::vector<int> indices(total);
+  std::iota(indices.begin(), indices.end(), 0);
+
+  // Only the first `shown` positions need to be ordered, highest logit first.
+  std::partial_sort(
+      indices.begin(),
+      indices.begin() + shown,
+      indices.end(),
+      [logits](int a, int b) { return logits[a] > logits[b]; });
+
+  std::cout << "\nTop-" << shown << " predictions:" << std::endl;
+  for (int i = 0; i < shown; ++i) {
+    const int idx = indices[i];
+    const char* label = get_imagenet_label(idx);
+    if (label) {
+      std::cout << "  Class " << idx << " (" << label << "): " << logits[idx]
+                << std::endl;
+    } else {
+      std::cout << "  Class " << idx << ": " << logits[idx] << std::endl;
+    }
+  }
+}
+
+} // namespace
+
+/**
+ * Entry point of the DINOv3 runner.
+ *
+ * Parses the command-line flags, builds a (1, 3, img_size, img_size) input
+ * tensor from --image_path (or from random data when no image is given),
+ * executes the model's "forward" method, and prints the top-k scores of the
+ * first output tensor.
+ *
+ * @return 0 on success; 1 on an invalid flag value, image loading failure,
+ *     inference failure, or an output that is not a non-empty BFloat16/Float
+ *     tensor.
+ */
+int main(int argc, char** argv) {
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
+
+  // Validate the size before it is used for buffer and shape arithmetic.
+  const int img_size = FLAGS_img_size;
+  if (img_size <= 0) {
+    ET_LOG(Error, "--img_size must be positive, got %d", img_size);
+    return 1;
+  }
+  // size_t arithmetic so large sizes cannot overflow int.
+  const size_t side = static_cast<size_t>(img_size);
+  const size_t input_size = 3 * side * side;
+
+  // Load model
+  std::unique_ptr<Module> model;
+  if (!FLAGS_data_path.empty()) {
+    model = std::make_unique<Module>(
+        FLAGS_model_path, FLAGS_data_path, Module::LoadMode::Mmap);
+  } else {
+    model = std::make_unique<Module>(FLAGS_model_path, Module::LoadMode::Mmap);
+  }
+
+  // Prepare input data as CHW floats.
+  std::vector<float> input_data;
+  if (!FLAGS_image_path.empty()) {
+    input_data = load_image(FLAGS_image_path, img_size);
+    if (input_data.empty()) {
+      ET_LOG(Error, "Failed to load image");
+      return 1;
+    }
+  } else {
+    input_data = generate_random_input(input_size);
+  }
+  // The tensor below reads exactly input_size elements from this buffer.
+  if (input_data.size() != input_size) {
+    ET_LOG(
+        Error,
+        "Unexpected input element count: %zu (expected %zu)",
+        input_data.size(),
+        input_size);
+    return 1;
+  }
+
+  // Create input tensor: shape (1, 3, img_size, img_size)
+  std::vector<int32_t> input_shape = {1, 3, img_size, img_size};
+
+  // Convert to bf16 if needed (model is exported with bf16 by default).
+  // bf16_data is declared at function scope because from_blob does not copy:
+  // the buffer must stay alive until inference has finished.
+  std::vector<executorch::aten::BFloat16> bf16_data;
+  executorch::extension::TensorPtr input_tensor;
+  if (FLAGS_bf16) {
+    bf16_data.resize(input_size);
+    for (size_t i = 0; i < input_size; ++i) {
+      bf16_data[i] = executorch::aten::BFloat16(input_data[i]);
+    }
+    input_tensor = from_blob(
+        bf16_data.data(),
+        {input_shape.begin(), input_shape.end()},
+        executorch::aten::ScalarType::BFloat16);
+  } else {
+    input_tensor = from_blob(
+        input_data.data(),
+        {input_shape.begin(), input_shape.end()},
+        executorch::aten::ScalarType::Float);
+  }
+
+  // Run inference
+  std::vector<executorch::runtime::EValue> inputs;
+  inputs.push_back(*input_tensor);
+  auto result = model->execute("forward", inputs);
+
+  if (!result.ok()) {
+    ET_LOG(
+        Error,
+        "Inference failed with error: %d",
+        static_cast<int>(result.error()));
+    return 1;
+  }
+
+  // Process output
+  auto& outputs = result.get();
+  if (outputs.empty()) {
+    ET_LOG(Error, "No outputs from model");
+    return 1;
+  }
+
+  auto& output_evalue = outputs[0];
+  if (!output_evalue.isTensor()) {
+    ET_LOG(Error, "Output is not a tensor");
+    return 1;
+  }
+
+  auto output_tensor = output_evalue.toTensor();
+  // A rank-0 or empty tensor has no last dimension to read logits from.
+  const int rank = static_cast<int>(output_tensor.dim());
+  if (rank < 1 || output_tensor.numel() == 0) {
+    ET_LOG(Error, "Output tensor is empty or has no dimensions");
+    return 1;
+  }
+  const int num_classes = static_cast<int>(output_tensor.size(rank - 1));
+
+  std::cout << "Output shape: (";
+  for (int d = 0; d < rank; ++d) {
+    if (d > 0) {
+      std::cout << ", ";
+    }
+    std::cout << output_tensor.size(d);
+  }
+  std::cout << ")" << std::endl;
+
+  // Convert the first num_classes output elements to float for top-k
+  // processing. Only BFloat16 and Float are understood; any other dtype is
+  // rejected rather than reinterpreted as float.
+  std::vector<float> logits_float(num_classes);
+  const auto output_dtype = output_tensor.scalar_type();
+  if (output_dtype == executorch::aten::ScalarType::BFloat16) {
+    const auto* bf16_ptr =
+        output_tensor.const_data_ptr<executorch::aten::BFloat16>();
+    for (int i = 0; i < num_classes; ++i) {
+      logits_float[i] = static_cast<float>(bf16_ptr[i]);
+    }
+  } else if (output_dtype == executorch::aten::ScalarType::Float) {
+    const float* float_ptr = output_tensor.const_data_ptr<float>();
+    for (int i = 0; i < num_classes; ++i) {
+      logits_float[i] = float_ptr[i];
+    }
+  } else {
+    ET_LOG(
+        Error,
+        "Unsupported output dtype: %d",
+        static_cast<int>(output_dtype));
+    return 1;
+  }
+
+  // Keep the requested count inside [0, num_classes] so the partial sort in
+  // print_top_k never receives an out-of-range k.
+  const int requested_k = static_cast<int>(FLAGS_top_k);
+  const int top_k = std::clamp(requested_k, 0, num_classes);
+  if (top_k != requested_k) {
+    ET_LOG(Info, "--top_k=%d clamped to %d", requested_k, top_k);
+  }
+
+  // Print top-k predictions
+  print_top_k(logits_float.data(), num_classes, top_k);
+
+  return 0;
+}