diff --git a/backends/arm/scripts/evaluate_model.py b/backends/arm/scripts/evaluate_model.py
new file mode 100644
index 00000000000..c2b17e17c9e
--- /dev/null
+++ b/backends/arm/scripts/evaluate_model.py
@@ -0,0 +1,393 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# Copyright 2023-2026 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import copy
+import json
+import logging
+import os
+import sys
+
+from pathlib import Path
+
+# Add Executorch root to path so this script can be run from anywhere
+_EXECUTORCH_DIR = Path(__file__).resolve().parents[3]
+_EXECUTORCH_DIR_STR = str(_EXECUTORCH_DIR)
+if _EXECUTORCH_DIR_STR not in sys.path:
+    sys.path.insert(0, _EXECUTORCH_DIR_STR)
+
+from typing import Any
+
+import torch
+
+from backends.arm.util.arm_model_evaluator import (
+    Evaluator,
+    FileCompressionEvaluator,
+    ImageNetEvaluator,
+    NumericalModelEvaluator,
+)
+from examples.arm.aot_arm_compiler import (
+    CALIBRATION_MAX_SAMPLES,
+    dump_delegation_info,
+    get_model_and_inputs_from_name,
+    load_calibration_samples,
+    quantize_model,
+    QuantMode,
+)
+
+from examples.models import MODEL_NAME_TO_MODEL
+from executorch.backends.arm.tosa import TosaSpecification
+from executorch.backends.arm.tosa.compile_spec import TosaCompileSpec
+from executorch.backends.arm.util._factory import create_partitioner
+
+from executorch.exir import EdgeCompileConfig, to_edge_transform_and_lower
+from torch.utils.data import DataLoader
+
+
+_EVALUATORS = [
+    "numerical",
+    "imagenet",
+]
+
+_QUANT_MODES = [
+    "int8",
+    "a16w8",
+]
+
+_DTYPE_MAP = {
+    "fp32": torch.float32,
+    "fp16": torch.float16,
+    "bf16": torch.bfloat16,
+}
+
+
+def _get_args():
+    parser = argparse.ArgumentParser(
+        "Evaluate a model quantized and/or delegated for the Arm backend."
+        " Evaluations include numerical comparison to the original model"
+        " and/or top-1/top-5 accuracy if applicable."
+    )
+    parser.add_argument(
+        "-m",
+        "--model_name",
+        required=True,
+        help="Model file .py/.pth/.pt or a model from examples/models."
+        f" Available models from examples/models: {', '.join(MODEL_NAME_TO_MODEL.keys())}",
+    )
+    parser.add_argument(
+        "-t",
+        "--target",
+        action="store",
+        required=True,
+        help=(
+            "For Arm backend delegated models, pick the target."
+            " Examples of valid targets: TOSA-1.0+INT, TOSA-1.0+FP+bf16"
+        ),
+    )
+    parser.add_argument(
+        "-q",
+        "--quant_mode",
+        required=False,
+        default=None,
+        choices=_QUANT_MODES,
+        help="Quantize the model using the requested mode.",
+    )
+    parser.add_argument(
+        "--calibration_data",
+        required=False,
+        default=None,
+        help=(
+            "Optional calibration data file or directory. If a directory is "
+            "provided, up to 1000 samples are used for calibration. "
+            "Supported files: Common image formats (e.g., .png or .jpg) if "
+            "using imagenet evaluator, otherwise .pt/.pth files. If not provided, "
+            "quantized models are calibrated on their example inputs."
+        ),
+    )
+    parser.add_argument(
+        "--no_delegate",
+        action="store_false",
+        dest="delegate",
+        default=True,
+        help=(
+            "Disable delegation for cases where a quantized but non-delegated "
+            "model is to be tested."
+        ),
+    )
+    parser.add_argument(
+        "-e",
+        "--evaluators",
+        required=True,
+        help=(
+            "Comma-separated list of evaluators to use. " f"Valid values: {_EVALUATORS}"
+        ),
+    )
+    parser.add_argument(
+        "--evaluation_dataset",
+        required=False,
+        default=None,
+        help="Provide path to evaluation dataset directory. (only applicable for ImageNet evaluation).",
+    )
+    parser.add_argument(
+        "--batch_size",
+        type=int,
+        required=False,
+        default=1,
+        help="Batch size to use for ImageNet evaluation. (only applicable for ImageNet evaluation).",
+    )
+    parser.add_argument(
+        "-s",
+        "--so_library",
+        required=False,
+        default=None,
+        help="Path to .so library to load custom ops from before evaluation.",
+    )
+    parser.add_argument(
+        "--debug", action="store_true", help="Set the logging level to debug."
+    )
+    parser.add_argument(
+        "--dtype",
+        choices=sorted(_DTYPE_MAP.keys()),
+        default=None,
+        help="Cast the model to evaluate and its inputs to the given dtype.",
+    )
+    parser.add_argument(
+        "-i",
+        "--intermediates",
+        action="store",
+        required=True,
+        help="Store intermediate output (like TOSA artifacts) at the specified directory.",
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        required=False,
+        default=None,
+        help="Path to JSON file where evaluation metrics will be stored.",
+    )
+    args = parser.parse_args()
+
+    LOGGING_FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s"
+    logging_level = logging.DEBUG if args.debug else logging.WARNING
+    logging.basicConfig(level=logging_level, format=LOGGING_FORMAT, force=True)
+
+    if args.quant_mode is None and not args.delegate:
+        raise ValueError(
+            "The model to test must be either quantized or delegated (--quant_mode or --delegate)."
+        )
+
+    if args.calibration_data is not None and args.quant_mode is None:
+        raise ValueError("--calibration_data requires --quant_mode to be enabled.")
+
+    if args.quant_mode is not None and args.dtype is not None:
+        raise ValueError("Cannot specify --dtype when --quant_mode is enabled.")
+
+    evaluators: list[str] = [
+        entry.strip() for entry in args.evaluators.split(",") if entry.strip()
+    ]
+    unknown = [entry for entry in evaluators if entry not in _EVALUATORS]
+    if not evaluators:
+        raise ValueError("At least one evaluator must be specified in --evaluators.")
+    if unknown:
+        raise ValueError(
+            "Unknown evaluators in --evaluators: " f"{', '.join(sorted(set(unknown)))}"
+        )
+    args.evaluators = evaluators
+
+    if "imagenet" in args.evaluators and args.evaluation_dataset is None:
+        raise ValueError("Evaluation dataset must be provided for ImageNet evaluation.")
+
+    # Default output path to intermediates folder with name based on target and extensions
+    if args.output is None:
+        args.output = os.path.join(args.intermediates, f"{args.target}_metrics.json")
+
+    try:
+        TosaSpecification.create_from_string(args.target)
+    except ValueError as e:
+        raise ValueError(f"Invalid target format for --target: {e}") from e
+
+    return args
+
+
+def _get_compile_spec(args) -> TosaCompileSpec:
+    tosa_spec = TosaSpecification.create_from_string(args.target)
+    compile_spec = TosaCompileSpec(tosa_spec)
+
+    if args.intermediates is not None:
+        compile_spec.dump_intermediate_artifacts_to(args.intermediates)
+
+    return compile_spec
+
+
+def _build_imagenet_calibration_samples(
+    calibration_dir: str, max_samples: int
+) -> list[tuple[torch.Tensor, ...]]:
+    dataset = ImageNetEvaluator.load_imagenet_folder(calibration_dir)
+    loader = DataLoader(dataset, batch_size=1, shuffle=False)
+    samples: list[tuple[torch.Tensor, ...]] = []
+    for image, _ in loader:
+        samples.append((image,))
+        if len(samples) >= max_samples:
+            break
+    return samples
+
+
+def _evaluate(
+    args, model_name, ref_model, eval_model, example_inputs
+) -> dict[str, Any]:
+    evaluators: list[Evaluator] = []
+
+    # Add evaluator for compression ratio of TOSA file
+    intermediates_path = Path(args.intermediates)
+    tosa_paths = list(intermediates_path.glob("*.tosa"))
+    if tosa_paths:
+        evaluators.append(FileCompressionEvaluator(model_name, str(tosa_paths[0])))
+    else:
+        logging.warning(
+            f"No TOSA file found in {args.intermediates} for compression evaluation"
+        )
+
+    # Add user-specified evaluators
+    for evaluator_name in args.evaluators:
+        evaluator: Evaluator
+        match evaluator_name:
+            case "numerical":
+                evaluator = NumericalModelEvaluator(
+                    model_name,
+                    ref_model,
+                    eval_model,
+                    example_inputs,
+                    eval_dtype=_DTYPE_MAP.get(args.dtype, None),
+                )
+            case "imagenet":
+                evaluator = ImageNetEvaluator(
+                    model_name,
+                    eval_model,
+                    batch_size=args.batch_size,
+                    validation_dataset_path=args.evaluation_dataset,
+                    eval_dtype=_DTYPE_MAP.get(args.dtype, None),
+                )
+            case _:
+                raise AssertionError(f"Unknown evaluator {evaluator_name}")
+        evaluators.append(evaluator)
+
+    # Run evaluators
+    metrics: dict[str, Any] = {}
+    for evaluator in evaluators:
+        result = evaluator.evaluate()
+        metrics |= result
+
+    return metrics
+
+
+def main() -> None:
+    try:
+        args = _get_args()
+    except ValueError as e:
+        logging.error(f"Argument error: {e}")
+        sys.exit(1)
+
+    # if we have custom ops, register them before processing the model
+    if args.so_library is not None:
+        logging.info(f"Loading custom ops from {args.so_library}")
+        torch.ops.load_library(args.so_library)
+
+    # Get the model and its example inputs
+    original_model, example_inputs = get_model_and_inputs_from_name(
+        args.model_name, None
+    )
+
+    # Use original model as reference to compare against
+    ref_model = original_model.eval()
+    eval_model = ref_model
+    eval_inputs = example_inputs
+
+    # Cast model and inputs to eval_dtype if specified
+    if args.dtype is not None:
+        eval_dtype = _DTYPE_MAP[args.dtype]
+        eval_model = copy.deepcopy(original_model).to(eval_dtype).eval()
+        eval_inputs = tuple(
+            inp.to(eval_dtype) if isinstance(inp, torch.Tensor) else inp
+            for inp in example_inputs
+        )
+
+    # Export the model
+    exported_program = torch.export.export(eval_model, eval_inputs)
+
+    model_name = os.path.basename(os.path.splitext(args.model_name)[0])
+    if args.intermediates:
+        os.makedirs(args.intermediates, exist_ok=True)
+
+        # We only support Python3.10 and above, so use a later pickle protocol
+        torch.export.save(
+            exported_program,
+            f"{args.intermediates}/{model_name}_exported_program.pt2",
+            pickle_protocol=5,
+        )
+
+    compile_spec = _get_compile_spec(args)
+
+    # Quantize the model if requested
+    if args.quant_mode is not None:
+        calibration_samples = None
+        if (
+            "imagenet" in args.evaluators
+            and args.calibration_data is not None
+            and Path(args.calibration_data).is_dir()
+        ):
+            calibration_samples = _build_imagenet_calibration_samples(
+                args.calibration_data, CALIBRATION_MAX_SAMPLES
+            )
+        else:
+            calibration_samples = load_calibration_samples(
+                args.calibration_data, example_inputs
+            )
+
+        match args.quant_mode:
+            case "a16w8":
+                quant_mode = QuantMode.A16W8
+            case "int8":
+                quant_mode = QuantMode.INT8
+            case _:
+                raise AssertionError(f"Unknown quantization mode: {args.quant_mode}")
+
+        eval_model, exported_program = quantize_model(
+            exported_program.module(),
+            eval_inputs,
+            compile_spec,
+            model_name,
+            True,
+            quant_mode,
+            calibration_samples,
+        )
+
+    # Delegate the model to Arm backend if requested
+    if args.delegate:
+        partitioner = create_partitioner(compile_spec)
+        edge = to_edge_transform_and_lower(
+            exported_program,
+            partitioner=[partitioner],
+            compile_config=EdgeCompileConfig(
+                _check_ir_validity=False,
+            ),
+        )
+        exported_program = edge.exported_program()
+        eval_model = exported_program.module()
+
+        dump_delegation_info(edge, args.intermediates)
+
+    # Evaluate the model
+    metrics = _evaluate(args, model_name, ref_model, eval_model, example_inputs)
+
+    # Dump result as JSON
+    output = {"name": model_name, "target": args.target, "metrics": metrics}
+    with open(args.output, "w") as f:
+        json.dump(output, f, indent=4)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/backends/arm/test/misc/test_evaluate_model.py b/backends/arm/test/misc/test_evaluate_model.py
new file mode 100644
index 00000000000..654789c2cf7
--- /dev/null
+++ b/backends/arm/test/misc/test_evaluate_model.py
@@ -0,0 +1,71 @@
+# Copyright 2026 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import json
+import sys
+from pathlib import Path
+
+from backends.arm.scripts import evaluate_model
+
+
+def _run_evaluate_model(*args: str) -> None:
+    previous_argv = sys.argv
+    try:
+        sys.argv = ["evaluate_model.py", *args]
+        evaluate_model.main()
+    finally:
+        sys.argv = previous_argv
+
+
+def test_evaluate_model_tosa_INT(tmp_path: Path) -> None:
+    intermediates = tmp_path / "test_evaluate_model_tosa_INT_intermediates"
+    output = tmp_path / "test_evaluate_model_tosa_INT_metrics.json"
+
+    _run_evaluate_model(
+        "--model_name",
+        "add",
+        "--target",
+        "TOSA-1.0+INT",
+        "--quant_mode",
+        "int8",
+        "--no_delegate",
+        "--evaluators",
+        "numerical",
+        "--intermediates",
+        str(intermediates),
+        "--output",
+        str(output),
+    )
+
+    assert output.exists(), f"Metrics file not created at {output}"
+    data = json.loads(output.read_text())
+    assert data["name"] == "add"
+    assert "metrics" in data
+    assert "mean_absolute_error" in data["metrics"]
+
+
+def test_evaluate_model_tosa_FP(tmp_path: Path) -> None:
+    intermediates = tmp_path / "test_evaluate_model_tosa_FP_intermediates"
+    output = tmp_path / "test_evaluate_model_tosa_FP_metrics.json"
+
+    _run_evaluate_model(
+        "--model_name",
+        "add",
+        "--target",
+        "TOSA-1.0+FP",
+        "--evaluators",
+        "numerical",
+        "--intermediates",
+        str(intermediates),
+        "--output",
+        str(output),
+    )
+
+    assert output.exists(), f"Metrics file not created at {output}"
+    data = json.loads(output.read_text())
+    assert data["name"] == "add"
+    assert "metrics" in data
+    assert "mean_absolute_error" in data["metrics"]
+    assert "compression_ratio" in data["metrics"]
diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py
index ba2fba539dc..f0d9f5d0537 100644
--- a/examples/arm/aot_arm_compiler.py
+++ b/examples/arm/aot_arm_compiler.py
@@ -682,7 +682,8 @@ def _get_args():
 
     if args.evaluate is not None or args.evaluate_config is not None:
         logging.error(
-            "Model evaluation is no longer supported in this script. Ignore and continue."
+            "Model evaluation is no longer supported in this script."
+            " Use evaluate_model.py instead. Ignore and continue."
         )
 
     return args