diff --git a/.ci/scripts/export_model_artifact.sh b/.ci/scripts/export_model_artifact.sh index 9d73c394f2a..d30ecde166d 100755 --- a/.ci/scripts/export_model_artifact.sh +++ b/.ci/scripts/export_model_artifact.sh @@ -25,6 +25,7 @@ Arguments: - nvidia/diar_streaming_sortformer_4spk-v2 - nvidia/parakeet-tdt - facebook/dinov2-small-imagenet1k-1-layer + - facebook/dinov3-vits16-pretrain-lvd1689m quant_name Quantization type (optional, default: non-quantized) Options: @@ -176,6 +177,14 @@ case "$HF_MODEL" in PREPROCESSOR_FEATURE_SIZE="" PREPROCESSOR_OUTPUT="" ;; + facebook/dinov3-vits16-pretrain-lvd1689m) + MODEL_NAME="dinov3" + TASK="" + MAX_SEQ_LEN="" + EXTRA_PIP="" + PREPROCESSOR_FEATURE_SIZE="" + PREPROCESSOR_OUTPUT="" + ;; mistralai/Voxtral-Mini-4B-Realtime-2602) MODEL_NAME="voxtral_realtime" TASK="" @@ -186,7 +195,7 @@ case "$HF_MODEL" in ;; *) echo "Error: Unsupported model '$HF_MODEL'" - echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer" + echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, facebook/dinov3-vits16-pretrain-lvd1689m" exit 1 ;; esac @@ -319,6 +328,24 @@ if [ "$MODEL_NAME" = "dinov2" ]; then exit 0 fi +# DINOv3 uses a custom export script (random weights since classifier head is untrained) +if [ "$MODEL_NAME" = "dinov3" ]; then + pip install -r examples/models/dinov3/install_requirements.txt + + python -m executorch.examples.models.dinov3.export_dinov3 \ + --backend "$DEVICE" \ + --output-dir "${OUTPUT_DIR}" \ + --random-weights + 
+ test -f "${OUTPUT_DIR}/model.pte" + if [ "$DEVICE" = "cuda" ] || [ "$DEVICE" = "cuda-windows" ]; then + test -f "${OUTPUT_DIR}/aoti_cuda_blob.ptd" + fi + ls -al "${OUTPUT_DIR}" + echo "::endgroup::" + exit 0 +fi + # Voxtral Realtime uses a custom export script if [ "$MODEL_NAME" = "voxtral_realtime" ]; then pip install safetensors huggingface_hub diff --git a/.ci/scripts/test_model_e2e.sh b/.ci/scripts/test_model_e2e.sh index cb7785036d3..fffdc8f0564 100755 --- a/.ci/scripts/test_model_e2e.sh +++ b/.ci/scripts/test_model_e2e.sh @@ -25,6 +25,7 @@ Arguments: - Qwen/Qwen3-0.6B - nvidia/parakeet-tdt - facebook/dinov2-small-imagenet1k-1-layer + - facebook/dinov3-vits16-pretrain-lvd1689m - mistralai/Voxtral-Mini-4B-Realtime-2602 quant_name Quantization type (required) @@ -204,6 +205,19 @@ case "$HF_MODEL" in IMAGE_URL="https://github.com/pytorch/hub/raw/master/images/dog.jpg" IMAGE_PATH="" ;; + facebook/dinov3-vits16-pretrain-lvd1689m) + MODEL_NAME="dinov3" + RUNNER_TARGET="dinov3_runner" + RUNNER_PATH="dinov3" + EXPECTED_OUTPUT="predictions" + PREPROCESSOR="" + TOKENIZER_URL="" + TOKENIZER_FILE="" + AUDIO_URL="" + AUDIO_FILE="" + IMAGE_URL="https://github.com/pytorch/hub/raw/master/images/dog.jpg" + IMAGE_PATH="" + ;; mistralai/Voxtral-Mini-4B-Realtime-2602) MODEL_NAME="voxtral_realtime" RUNNER_TARGET="voxtral_realtime_runner" @@ -218,7 +232,7 @@ case "$HF_MODEL" in ;; *) echo "Error: Unsupported model '$HF_MODEL'" - echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, nvidia/diar_streaming_sortformer_4spk-v2, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer" + echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, nvidia/diar_streaming_sortformer_4spk-v2, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, 
large-v3-turbo}), google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, facebook/dinov3-vits16-pretrain-lvd1689m" exit 1 ;; esac @@ -232,7 +246,7 @@ echo "::group::Prepare $MODEL_NAME Artifacts" # Download tokenizer files (skip for models that bundle tokenizer in export or do not use one) -if [ "$MODEL_NAME" != "parakeet" ] && [ "$MODEL_NAME" != "voxtral_realtime" ] && [ "$MODEL_NAME" != "sortformer" ] && [ "$MODEL_NAME" != "dinov2" ]; then +if [ "$MODEL_NAME" != "parakeet" ] && [ "$MODEL_NAME" != "voxtral_realtime" ] && [ "$MODEL_NAME" != "sortformer" ] && [ "$MODEL_NAME" != "dinov2" ] && [ "$MODEL_NAME" != "dinov3" ]; then if [ "$TOKENIZER_FILE" != "" ]; then curl -L $TOKENIZER_URL/$TOKENIZER_FILE -o $MODEL_DIR/$TOKENIZER_FILE else @@ -341,6 +355,12 @@ EOF RUNNER_ARGS="$RUNNER_ARGS --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd" fi ;; + dinov3) + RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --image_path ${MODEL_DIR}/test_image.jpg" + if [ "$DEVICE" = "cuda" ]; then + RUNNER_ARGS="$RUNNER_ARGS --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd" + fi + ;; voxtral_realtime) RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE --preprocessor_path ${MODEL_DIR}/$PREPROCESSOR --audio_path ${MODEL_DIR}/$AUDIO_FILE --temperature 0" # Add CUDA data path if present diff --git a/.ci/scripts/test_model_e2e_windows.ps1 b/.ci/scripts/test_model_e2e_windows.ps1 index 491971f339f..de5ce6cb33d 100644 --- a/.ci/scripts/test_model_e2e_windows.ps1 +++ b/.ci/scripts/test_model_e2e_windows.ps1 @@ -109,8 +109,21 @@ switch ($HfModel) { $imageUrl = "https://github.com/pytorch/hub/raw/master/images/dog.jpg" $imageFile = "test_image.jpg" } + "facebook/dinov3-vits16-pretrain-lvd1689m" { + $runnerTarget = "dinov3_runner" + $runnerPath = "dinov3" + $runnerPreset = "dinov3-cuda" + $expectedOutput = "predictions" + $preprocessor = "" + $tokenizerUrl = "" + $tokenizerFile = "" + $audioUrl = "" + $audioFile = "" + 
$imageUrl = "https://github.com/pytorch/hub/raw/master/images/dog.jpg" + $imageFile = "test_image.jpg" + } default { - throw "Unsupported model '$HfModel'. Supported: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer" + throw "Unsupported model '$HfModel'. Supported: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, facebook/dinov3-vits16-pretrain-lvd1689m" } } @@ -244,6 +257,13 @@ try { "--image_path", (Join-Path -Path $resolvedModelDir -ChildPath $imageFile) ) } + "facebook/dinov3-vits16-pretrain-lvd1689m" { + $runnerArgs = @( + "--model_path", $modelPte, + "--data_path", $cudaBlob, + "--image_path", (Join-Path -Path $resolvedModelDir -ChildPath $imageFile) + ) + } } $stdoutFile = Join-Path -Path $env:TEMP -ChildPath ("et_runner_stdout_{0}.log" -f ([Guid]::NewGuid().ToString("N"))) diff --git a/.github/workflows/cuda-windows.yml b/.github/workflows/cuda-windows.yml index 1b33fa32f61..1cdc75f1fbb 100644 --- a/.github/workflows/cuda-windows.yml +++ b/.github/workflows/cuda-windows.yml @@ -50,6 +50,9 @@ jobs: - model_repo: "facebook" model_name: "dinov2-small-imagenet1k-1-layer" quant: "non-quantized" + - model_repo: "facebook" + model_name: "dinov3-vits16-pretrain-lvd1689m" + quant: "non-quantized" with: timeout: 90 secrets-env: EXECUTORCH_HF_TOKEN @@ -86,8 +89,8 @@ jobs: PYTHON_EXECUTABLE=python ./install_executorch.sh echo "::endgroup::" - # Setup Huggingface only for models that need it (not dinov2) - if [ "${{ matrix.model_name }}" != "dinov2-small-imagenet1k-1-layer" ]; then + # Setup Huggingface only for models that need it (not dinov2 or dinov3) + if [ "${{ matrix.model_name }}" != "dinov2-small-imagenet1k-1-layer" ] && [ "${{ matrix.model_name }}" != "dinov3-vits16-pretrain-lvd1689m" ]; then echo 
"::group::Setup Huggingface" pip install -U "huggingface_hub[cli]<1.0" accelerate huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN @@ -131,6 +134,9 @@ jobs: - model_repo: "facebook" model_name: "dinov2-small-imagenet1k-1-layer" quant: "non-quantized" + - model_repo: "facebook" + model_name: "dinov3-vits16-pretrain-lvd1689m" + quant: "non-quantized" with: timeout: 240 runner: windows.g5.4xlarge.nvidia.gpu diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index de2b2e78c02..0351718d2a9 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -153,6 +153,8 @@ jobs: name: "parakeet-tdt" - repo: "facebook" name: "dinov2-small-imagenet1k-1-layer" + - repo: "facebook" + name: "dinov3-vits16-pretrain-lvd1689m" quant: - "non-quantized" - "quantized-int4-tile-packed" @@ -190,6 +192,15 @@ jobs: repo: "facebook" name: "dinov2-small-imagenet1k-1-layer" quant: "quantized-int4-weight-only" + # DINOv3 currently supports only non-quantized export + - model: + repo: "facebook" + name: "dinov3-vits16-pretrain-lvd1689m" + quant: "quantized-int4-tile-packed" + - model: + repo: "facebook" + name: "dinov3-vits16-pretrain-lvd1689m" + quant: "quantized-int4-weight-only" with: timeout: 90 secrets-env: EXECUTORCH_HF_TOKEN @@ -209,8 +220,8 @@ jobs: ./install_executorch.sh echo "::endgroup::" - # Setup Huggingface only for models that need it (not parakeet or dinov2) - if [ "${{ matrix.model.name }}" != "parakeet-tdt" ] && [ "${{ matrix.model.name }}" != "dinov2-small-imagenet1k-1-layer" ]; then + # Setup Huggingface only for models that need it (not parakeet, dinov2, or dinov3) + if [ "${{ matrix.model.name }}" != "parakeet-tdt" ] && [ "${{ matrix.model.name }}" != "dinov2-small-imagenet1k-1-layer" ] && [ "${{ matrix.model.name }}" != "dinov3-vits16-pretrain-lvd1689m" ]; then echo "::group::Setup Huggingface" pip install -U "huggingface_hub[cli]<1.0" accelerate huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN @@ -248,6 +259,8 @@ jobs: 
name: "parakeet-tdt" - repo: "facebook" name: "dinov2-small-imagenet1k-1-layer" + - repo: "facebook" + name: "dinov3-vits16-pretrain-lvd1689m" quant: - "non-quantized" - "quantized-int4-tile-packed" @@ -285,6 +298,15 @@ jobs: repo: "facebook" name: "dinov2-small-imagenet1k-1-layer" quant: "quantized-int4-weight-only" + # DINOv3 currently supports only non-quantized export + - model: + repo: "facebook" + name: "dinov3-vits16-pretrain-lvd1689m" + quant: "quantized-int4-tile-packed" + - model: + repo: "facebook" + name: "dinov3-vits16-pretrain-lvd1689m" + quant: "quantized-int4-weight-only" with: timeout: 90 runner: linux.g5.4xlarge.nvidia.gpu diff --git a/Makefile b/Makefile index 6b8ea37e7b2..c00c2c0fea6 100644 --- a/Makefile +++ b/Makefile @@ -91,7 +91,7 @@ # # ============================================================================== -.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu clean help +.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal dinov2-cuda dinov2-cuda-debug dinov3-cuda dinov3-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu clean help help: @echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make \`. 
Available targets:" @@ -111,6 +111,8 @@ help: @echo " parakeet-metal - Build Parakeet runner with Metal backend (macOS only)" @echo " dinov2-cuda - Build DINOv2 runner with CUDA backend" @echo " dinov2-cuda-debug - Build DINOv2 runner with CUDA backend (debug mode)" + @echo " dinov3-cuda - Build DINOv3 runner with CUDA backend" + @echo " dinov3-cuda-debug - Build DINOv3 runner with CUDA backend (debug mode)" @echo " sortformer-cuda - Build Sortformer runner with CUDA backend" @echo " sortformer-cpu - Build Sortformer runner with CPU backend" @echo " silero-vad-cpu - Build Silero VAD runner with CPU backend" @@ -239,6 +241,24 @@ dinov2-cuda-debug: @echo "✓ Build complete!" @echo " Binary: cmake-out/examples/models/dinov2/dinov2_runner" +dinov3-cuda: + @echo "==> Building and installing ExecuTorch with CUDA..." + cmake --workflow --preset llm-release-cuda + @echo "==> Building DINOv3 runner with CUDA..." + cd examples/models/dinov3 && cmake --workflow --preset dinov3-cuda + @echo "" + @echo "✓ Build complete!" + @echo " Binary: cmake-out/examples/models/dinov3/dinov3_runner" + +dinov3-cuda-debug: + @echo "==> Building and installing ExecuTorch with CUDA (debug mode)..." + cmake --workflow --preset llm-debug-cuda + @echo "==> Building DINOv3 runner with CUDA (debug mode)..." + cd examples/models/dinov3 && cmake --workflow --preset dinov3-cuda-debug + @echo "" + @echo "✓ Build complete!" + @echo " Binary: cmake-out/examples/models/dinov3/dinov3_runner" + sortformer-cuda: @echo "==> Building and installing ExecuTorch with CUDA..." cmake --workflow --preset llm-release-cuda diff --git a/examples/models/dinov3/CMakeLists.txt b/examples/models/dinov3/CMakeLists.txt new file mode 100644 index 00000000000..53558e8fa23 --- /dev/null +++ b/examples/models/dinov3/CMakeLists.txt @@ -0,0 +1,88 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +cmake_minimum_required(VERSION 3.24) +project(dinov3_runner) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) + +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) + +# Let files say "include " +set(_common_include_directories ${EXECUTORCH_ROOT}/..) + +# Need this for gflags +set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags) +find_package(gflags REQUIRED) + +# Find executorch libraries +list(APPEND CMAKE_FIND_ROOT_PATH ${CMAKE_CURRENT_BINARY_DIR}/../../..) +find_package(executorch CONFIG REQUIRED FIND_ROOT_PATH_BOTH) +get_target_property(_executorch_imported executorch IMPORTED) +if(NOT _executorch_imported) + executorch_target_link_options_shared_lib(executorch) +endif() + +set(link_libraries executorch gflags) + +# Common ops +if(TARGET optimized_native_cpu_ops_lib) + list(APPEND link_libraries optimized_native_cpu_ops_lib cpublas eigen_blas) + get_target_property(_is_imported optimized_native_cpu_ops_lib IMPORTED) + if(NOT _is_imported) + executorch_target_link_options_shared_lib(optimized_native_cpu_ops_lib) + endif() +endif() + +# Add the required ExecuTorch extensions +list(APPEND link_libraries extension_module extension_data_loader + extension_tensor extension_flat_tensor +) + +# stb_image: lightweight library to load and resize images +include(FetchContent) +FetchContent_Declare( + stb + GIT_REPOSITORY https://github.com/nothings/stb.git + GIT_TAG f0569113c93ad095470c54bf34a17b36646bbbb5 +) +FetchContent_MakeAvailable(stb) +list(APPEND _common_include_directories ${stb_SOURCE_DIR} + ${stb_SOURCE_DIR}/deprecated +) + +# Link CUDA backend +find_package(CUDAToolkit REQUIRED) +list(APPEND link_libraries aoti_cuda_backend) +if(NOT MSVC) + executorch_target_link_options_shared_lib(aoti_cuda_backend) +endif() + 
+add_executable(dinov3_runner main.cpp) +if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug") + target_link_options_gc_sections(dinov3_runner) + if(NOT MSVC) + target_link_options(dinov3_runner PRIVATE "LINKER:-s") + endif() +endif() + +target_include_directories(dinov3_runner PUBLIC ${_common_include_directories}) +target_link_libraries(dinov3_runner PUBLIC ${link_libraries}) +target_compile_options(dinov3_runner PUBLIC ${_common_compile_options}) + +# On Windows, copy required DLLs to the executable directory +if(MSVC) + add_custom_command( + TARGET dinov3_runner + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different $ + $ + COMMENT "Copying aoti_cuda_shims.dll to dinov3_runner directory" + ) +endif() diff --git a/examples/models/dinov3/CMakePresets.json b/examples/models/dinov3/CMakePresets.json new file mode 100644 index 00000000000..7e36c759fc2 --- /dev/null +++ b/examples/models/dinov3/CMakePresets.json @@ -0,0 +1,88 @@ +{ + "version": 6, + "configurePresets": [ + { + "name": "dinov3-base", + "hidden": true, + "binaryDir": "${sourceDir}/../../../cmake-out/examples/models/dinov3", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "Release", + "CMAKE_FIND_ROOT_PATH": "${sourceDir}/../../../cmake-out", + "CMAKE_PREFIX_PATH": "${sourceDir}/../../../cmake-out" + } + }, + { + "name": "dinov3-cuda", + "displayName": "DINOv3 runner (CUDA)", + "inherits": ["dinov3-base"], + "cacheVariables": { + "EXECUTORCH_BUILD_CUDA": "ON" + }, + "condition": { + "type": "inList", + "string": "${hostSystemName}", + "list": ["Linux", "Windows"] + } + }, + { + "name": "dinov3-cuda-debug", + "displayName": "DINOv3 runner (CUDA, Debug)", + "inherits": ["dinov3-base"], + "cacheVariables": { + "CMAKE_BUILD_TYPE": "Debug", + "EXECUTORCH_BUILD_CUDA": "ON" + }, + "condition": { + "type": "inList", + "string": "${hostSystemName}", + "list": ["Linux", "Windows"] + } + } + ], + "buildPresets": [ + { + "name": "dinov3-cuda", + "displayName": "Build DINOv3 runner (CUDA)", + "configurePreset": "dinov3-cuda", 
+ "configuration": "Release", + "targets": ["dinov3_runner"] + }, + { + "name": "dinov3-cuda-debug", + "displayName": "Build DINOv3 runner (CUDA, Debug)", + "configurePreset": "dinov3-cuda-debug", + "configuration": "Debug", + "targets": ["dinov3_runner"] + } + ], + "workflowPresets": [ + { + "name": "dinov3-cuda", + "displayName": "Configure and build DINOv3 runner (CUDA)", + "steps": [ + { + "type": "configure", + "name": "dinov3-cuda" + }, + { + "type": "build", + "name": "dinov3-cuda" + } + ] + }, + { + "name": "dinov3-cuda-debug", + "displayName": "Configure and build DINOv3 runner (CUDA, Debug)", + "steps": [ + { + "type": "configure", + "name": "dinov3-cuda-debug" + }, + { + "type": "build", + "name": "dinov3-cuda-debug" + } + ] + } + ] +} diff --git a/examples/models/dinov3/__init__.py b/examples/models/dinov3/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/examples/models/dinov3/dog.jpg b/examples/models/dinov3/dog.jpg new file mode 100644 index 00000000000..12f0e0dd116 Binary files /dev/null and b/examples/models/dinov3/dog.jpg differ diff --git a/examples/models/dinov3/export_dinov3.py b/examples/models/dinov3/export_dinov3.py new file mode 100644 index 00000000000..8a7590c425d --- /dev/null +++ b/examples/models/dinov3/export_dinov3.py @@ -0,0 +1,207 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Export DINOv3 ViT image classification model for ExecuTorch with CUDA backend. + +DINOv3 ViT is a backbone model without a built-in classifier head, so this +script wraps it with a linear classification layer for ImageNet-1k inference. 
+ +Usage: + python examples/models/dinov3/export_dinov3.py \ + --backend cuda --output-dir ./dinov3_exports + + # With fp32 precision: + python examples/models/dinov3/export_dinov3.py \ + --backend cuda --dtype fp32 --output-dir ./dinov3_exports + + # For Windows CUDA: + python examples/models/dinov3/export_dinov3.py \ + --backend cuda-windows --output-dir ./dinov3_exports +""" + +import argparse +import os + +import torch +import torch.nn as nn +from executorch.backends.cuda.cuda_backend import CudaBackend +from executorch.backends.cuda.cuda_partitioner import CudaPartitioner +from executorch.exir import ( + EdgeCompileConfig, + ExecutorchBackendConfig, + to_edge_transform_and_lower, +) +from executorch.exir.backend.compile_spec_schema import CompileSpec +from executorch.exir.passes import MemoryPlanningPass +from transformers import DINOv3ViTConfig, DINOv3ViTModel + + +class DINOv3ViTForImageClassification(nn.Module): + """Wrapper that adds a linear classifier head on top of DINOv3 ViT backbone.""" + + def __init__(self, backbone, num_classes=1000): + super().__init__() + self.backbone = backbone + self.classifier = nn.Linear(backbone.config.hidden_size, num_classes) + + def forward(self, pixel_values): + outputs = self.backbone(pixel_values) + logits = self.classifier(outputs.pooler_output) + return logits + + +def get_model( + model_name: str = "facebook/dinov3-vits16-pretrain-lvd1689m", + random_weights: bool = False, +): + """Load and return the DINOv3 ViT model with a classification head in eval mode.""" + if random_weights: + # Use default ViT-S/16 config without downloading from HuggingFace + config = DINOv3ViTConfig() + backbone = DINOv3ViTModel(config) + else: + backbone = DINOv3ViTModel.from_pretrained(model_name) + model = DINOv3ViTForImageClassification(backbone, num_classes=1000) + return model.eval() + + +def export_model(model, sample_input, dtype=None): + """Export the model using torch.export.""" + if dtype == torch.bfloat16: + model = 
model.to(dtype=torch.bfloat16) + sample_input = (sample_input[0].to(dtype=torch.bfloat16),) + + exported = torch.export.export(model, sample_input, strict=False) + return exported + + +def lower_to_executorch(exported_program, backend="cuda", metadata=None): + """Lower the exported program to ExecuTorch format with CUDA backend.""" + from torch._inductor.decomposition import conv1d_to_conv2d + + exported_program = exported_program.run_decompositions( + {torch.ops.aten.conv1d.default: conv1d_to_conv2d} + ) + + compile_specs = [CudaBackend.generate_method_name_compile_spec("forward")] + if backend == "cuda-windows": + compile_specs.append(CompileSpec("platform", "windows".encode("utf-8"))) + partitioner = [CudaPartitioner(compile_specs)] + + constant_methods = {} + if metadata: + for key, value in metadata.items(): + constant_methods[key] = value + + programs = {"forward": exported_program} + partitioner_dict = {"forward": partitioner} + + et_prog = to_edge_transform_and_lower( + programs, + partitioner=partitioner_dict, + compile_config=EdgeCompileConfig( + _check_ir_validity=False, + _skip_dim_order=True, + ), + constant_methods=constant_methods if constant_methods else None, + ) + + return et_prog.to_executorch( + config=ExecutorchBackendConfig( + extract_delegate_segments=True, + memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False), + do_quant_fusion_and_const_prop=True, + ), + ) + + +def main(): + parser = argparse.ArgumentParser( + description="Export DINOv3 ViT model for ExecuTorch CUDA backend" + ) + parser.add_argument( + "--model-name", + type=str, + default="facebook/dinov3-vits16-pretrain-lvd1689m", + help="HuggingFace model name for DINOv3 ViT", + ) + parser.add_argument( + "--dtype", + type=str, + default="bf16", + choices=["bf16", "fp32"], + help="Data type for export (default: bf16, required for CUDA Triton SDPA)", + ) + parser.add_argument( + "--output-dir", + type=str, + default="./dinov3_exports", + help="Output directory for exported 
artifacts", + ) + parser.add_argument( + "--img-size", + type=int, + default=224, + help="Input image size (default: 224)", + ) + parser.add_argument( + "--backend", + type=str, + default="cuda", + choices=["cuda", "cuda-windows"], + help="Backend to export for (default: cuda)", + ) + parser.add_argument( + "--random-weights", + action="store_true", + help="Use random weights instead of pretrained (for pipeline testing)", + ) + args = parser.parse_args() + + os.makedirs(args.output_dir, exist_ok=True) + + # Determine dtype + dtype = None + if args.dtype == "bf16": + dtype = torch.bfloat16 + + print(f"Loading DINOv3 ViT model: {args.model_name}") + model = get_model(args.model_name, random_weights=args.random_weights) + + # Create sample input + sample_input = (torch.randn(1, 3, args.img_size, args.img_size),) + if dtype == torch.bfloat16: + sample_input = (sample_input[0].to(dtype=torch.bfloat16),) + + print(f"Exporting model with torch.export (dtype={args.dtype or 'fp32'})...") + exported = export_model(model, sample_input, dtype=dtype) + + # Metadata to embed in the .pte file + metadata = { + "get_img_size": args.img_size, + "get_num_classes": 1000, + } + + print(f"Lowering to ExecuTorch with {args.backend} backend...") + et = lower_to_executorch(exported, backend=args.backend, metadata=metadata) + + # Save the .pte file + pte_path = os.path.join(args.output_dir, "model.pte") + with open(pte_path, "wb") as f: + et.write_to_file(f) + print(f"Saved model to {pte_path}") + + # Save tensor data (.ptd) + if et._tensor_data: + et.write_tensor_data_to_file(args.output_dir) + print(f"Saved tensor data to {args.output_dir}/") + + print("Export complete!") + + +if __name__ == "__main__": + main() diff --git a/examples/models/dinov3/install_requirements.txt b/examples/models/dinov3/install_requirements.txt new file mode 100644 index 00000000000..8c167f1872e --- /dev/null +++ b/examples/models/dinov3/install_requirements.txt @@ -0,0 +1,2 @@ +transformers +torch diff --git 
a/examples/models/dinov3/main.cpp b/examples/models/dinov3/main.cpp new file mode 100644 index 00000000000..d2ea1519328 --- /dev/null +++ b/examples/models/dinov3/main.cpp @@ -0,0 +1,290 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/** + * DINOv3 ViT image classification runner for ExecuTorch. + * + * Usage: + * ./dinov3_runner --model_path model.pte --data_path aoti_cuda_blob.ptd \ + * --image_path image.jpg + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define STB_IMAGE_IMPLEMENTATION +#include +#define STB_IMAGE_RESIZE_IMPLEMENTATION +#include + +#include + +#include +#include +#include +#include +#include +#include + +DEFINE_string(model_path, "model.pte", "Path to DINOv3 model (.pte)."); +DEFINE_string( + data_path, + "", + "Path to data file (.ptd) for CUDA delegate data."); +DEFINE_string( + image_path, + "", + "Path to input image file (.jpg, .png, .bmp). " + "If empty, uses random input for testing."); +DEFINE_int32(img_size, 224, "Input image size (default: 224)."); +DEFINE_int32(top_k, 5, "Number of top predictions to display (default: 5)."); +DEFINE_bool( + bf16, + true, + "Use bfloat16 input (default: true, matching export dtype)."); + +using ::executorch::extension::from_blob; +using ::executorch::extension::Module; +using ::executorch::runtime::Error; +using ::executorch::runtime::EValue; + +namespace { + +// ImageNet normalization constants +constexpr float kImageNetMean[] = {0.485f, 0.456f, 0.406f}; +constexpr float kImageNetStd[] = {0.229f, 0.224f, 0.225f}; + +/** + * Load an image file, resize to target_size x target_size, and apply + * ImageNet normalization. Returns CHW float data. 
+ */ +std::vector load_image(const std::string& path, int target_size) { + int width, height, channels; + unsigned char* raw = stbi_load(path.c_str(), &width, &height, &channels, 3); + if (!raw) { + ET_LOG(Error, "Failed to load image: %s", path.c_str()); + return {}; + } + + // Resize to target_size x target_size + std::vector resized(target_size * target_size * 3); + stbir_resize_uint8( + raw, width, height, 0, resized.data(), target_size, target_size, 0, 3); + stbi_image_free(raw); + + // Convert to CHW float with ImageNet normalization + size_t spatial = target_size * target_size; + std::vector chw_data(3 * spatial); + for (int h = 0; h < target_size; ++h) { + for (int w = 0; w < target_size; ++w) { + int hwc_idx = (h * target_size + w) * 3; + for (int c = 0; c < 3; ++c) { + float pixel = static_cast(resized[hwc_idx + c]) / 255.0f; + chw_data[c * spatial + h * target_size + w] = + (pixel - kImageNetMean[c]) / kImageNetStd[c]; + } + } + } + return chw_data; +} + +/** + * Generate random input data for testing. + */ +std::vector generate_random_input(size_t size) { + std::vector data(size); + for (size_t i = 0; i < size; ++i) { + data[i] = static_cast(rand()) / RAND_MAX * 2.0f - 1.0f; + } + return data; +} + +/** + * ImageNet-1k class labels (subset for display). 
/**
 * Look up a display label for an ImageNet-1k class index.
 *
 * Only a small subset of the 1000 classes is listed; ids follow the standard
 * ILSVRC2012 class-index ordering.
 *
 * @param class_id Class index predicted by the model.
 * @return The label string, or nullptr if class_id is not in the subset.
 */
const char* get_imagenet_label(int class_id) {
  struct LabelEntry {
    int id;
    const char* name;
  };
  // Compile-time table: no heap allocation or hashing for a few dozen
  // entries. Kept in ascending id order for readability.
  static constexpr LabelEntry kLabels[] = {
      {0, "tench"},
      {1, "goldfish"},
      {2, "great white shark"},
      {6, "stingray"},
      {15, "robin"},
      {65, "sea snake"},
      {99, "goose"},
      {207, "golden retriever"},
      {208, "Labrador retriever"},
      {229, "Old English sheepdog"},
      {232, "Border collie"},
      {243, "bull mastiff"},
      {258, "Samoyed"},
      {281, "tabby cat"},
      {282, "tiger cat"},
      {283, "Persian cat"},
      {285, "Egyptian cat"},
      {291, "lion"},
      {292, "tiger"},
      {340, "zebra"},
      {355, "llama"},
      {360, "otter"},
      {386, "African elephant"},
      {388, "giant panda"},
      {463, "bucket"},
      {508, "computer keyboard"},
      {530, "digital clock"},
      {541, "drum"},
      {620, "laptop"},
      {717, "pickup truck"},
      {752, "racket"},
      {779, "school bus"},
      {817, "sports car"},
      {849, "teapot"},
      {852, "tennis ball"},
      {864, "tow truck"},
      {895, "warplane"},
      {920, "traffic light"},
      {948, "Granny Smith"},
      {950, "orange"},
      {954, "banana"},
      {963, "pizza"},
  };

  // A linear scan is enough for a table this small.
  for (const LabelEntry& entry : kLabels) {
    if (entry.id == class_id) {
      return entry.name;
    }
  }
  return nullptr;
}
+ */ +void print_top_k(const float* logits, int num_classes, int k) { + std::vector indices(num_classes); + std::iota(indices.begin(), indices.end(), 0); + + std::partial_sort( + indices.begin(), + indices.begin() + k, + indices.end(), + [logits](int a, int b) { return logits[a] > logits[b]; }); + + std::cout << "\nTop-" << k << " predictions:" << std::endl; + for (int i = 0; i < k && i < num_classes; ++i) { + int idx = indices[i]; + const char* label = get_imagenet_label(idx); + if (label) { + std::cout << " Class " << idx << " (" << label << "): " << logits[idx] + << std::endl; + } else { + std::cout << " Class " << idx << ": " << logits[idx] << std::endl; + } + } +} + +} // namespace + +int main(int argc, char** argv) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + + // Load model + std::unique_ptr model; + if (!FLAGS_data_path.empty()) { + model = std::make_unique( + FLAGS_model_path, FLAGS_data_path, Module::LoadMode::Mmap); + } else { + model = std::make_unique(FLAGS_model_path, Module::LoadMode::Mmap); + } + + // Prepare input tensor + const int img_size = FLAGS_img_size; + const size_t input_size = 1 * 3 * img_size * img_size; + + std::vector input_data; + if (!FLAGS_image_path.empty()) { + input_data = load_image(FLAGS_image_path, img_size); + if (input_data.empty()) { + ET_LOG(Error, "Failed to load image"); + return 1; + } + } else { + input_data = generate_random_input(input_size); + } + + // Create input tensor: shape (1, 3, img_size, img_size) + std::vector input_shape = {1, 3, img_size, img_size}; + + // Convert to bf16 if needed (model is exported with bf16 by default) + std::vector bf16_data; + executorch::extension::TensorPtr input_tensor; + if (FLAGS_bf16) { + bf16_data.resize(input_size); + for (size_t i = 0; i < input_size; ++i) { + bf16_data[i] = executorch::aten::BFloat16(input_data[i]); + } + input_tensor = from_blob( + bf16_data.data(), + {input_shape.begin(), input_shape.end()}, + executorch::aten::ScalarType::BFloat16); + } else 
{ + input_tensor = from_blob( + input_data.data(), + {input_shape.begin(), input_shape.end()}, + executorch::aten::ScalarType::Float); + } + + // Run inference + std::vector inputs; + inputs.push_back(*input_tensor); + auto result = model->execute("forward", inputs); + + if (!result.ok()) { + ET_LOG(Error, "Inference failed with error: %d", (int)result.error()); + return 1; + } + + // Process output + auto& outputs = result.get(); + if (outputs.empty()) { + ET_LOG(Error, "No outputs from model"); + return 1; + } + + auto& output_evalue = outputs[0]; + if (!output_evalue.isTensor()) { + ET_LOG(Error, "Output is not a tensor"); + return 1; + } + + auto output_tensor = output_evalue.toTensor(); + int num_classes = output_tensor.size(output_tensor.dim() - 1); + + std::cout << "Output shape: (" << output_tensor.size(0) << ", " << num_classes + << ")" << std::endl; + + // Convert output to float for top-k processing + std::vector logits_float(num_classes); + if (output_tensor.scalar_type() == executorch::aten::ScalarType::BFloat16) { + const auto* bf16_ptr = + output_tensor.template const_data_ptr(); + for (int i = 0; i < num_classes; ++i) { + logits_float[i] = static_cast(bf16_ptr[i]); + } + } else { + const float* float_ptr = output_tensor.template const_data_ptr(); + for (int i = 0; i < num_classes; ++i) { + logits_float[i] = float_ptr[i]; + } + } + + // Print top-k predictions + print_top_k(logits_float.data(), num_classes, FLAGS_top_k); + + return 0; +}