From f4c87c1146527326424167fec61293f9705e4e8d Mon Sep 17 00:00:00 2001 From: Mergen Nachin Date: Sun, 15 Mar 2026 16:47:49 -0400 Subject: [PATCH 1/3] Update torch pin nightly to 20260310 --- .ci/docker/ci_commit_pins/pytorch.txt | 2 +- .ci/scripts/test_model_e2e.sh | 2 +- .../models/moshi/mimi/install_requirements.sh | 2 +- exir/sym_util.py | 5 ++- .../portable_type/c10/c10/util/complex_math.h | 35 +++++++++++++++++++ .../c10/torch/headeronly/macros/Macros.h | 2 +- torch_pin.py | 4 +-- 7 files changed, 45 insertions(+), 7 deletions(-) diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt index 96c16e31ac4..c90fb19f167 100644 --- a/.ci/docker/ci_commit_pins/pytorch.txt +++ b/.ci/docker/ci_commit_pins/pytorch.txt @@ -1 +1 @@ -659af3c353e49b35c191cdd2dba3b3c79d0e6822 +08b6f48d871affbc7abe9277020aed882fdf110a diff --git a/.ci/scripts/test_model_e2e.sh b/.ci/scripts/test_model_e2e.sh index cb7785036d3..0791b7b2df5 100755 --- a/.ci/scripts/test_model_e2e.sh +++ b/.ci/scripts/test_model_e2e.sh @@ -248,7 +248,7 @@ if [ "$AUDIO_URL" != "" ]; then elif [[ "$MODEL_NAME" == *whisper* ]] || [ "$MODEL_NAME" = "voxtral_realtime" ]; then conda install -y -c conda-forge "ffmpeg<8" pip install datasets soundfile - pip install torchcodec==0.11.0.dev20260217 --extra-index-url https://download.pytorch.org/whl/nightly/cpu + pip install torchcodec==0.11.0.dev20260312 --extra-index-url https://download.pytorch.org/whl/nightly/cpu python -c "from datasets import load_dataset;import soundfile as sf;sample = load_dataset('distil-whisper/librispeech_long', 'clean', split='validation')[0]['audio'];sf.write('${MODEL_DIR}/$AUDIO_FILE', sample['array'][:sample['sampling_rate']*30], sample['sampling_rate'])" fi diff --git a/examples/models/moshi/mimi/install_requirements.sh b/examples/models/moshi/mimi/install_requirements.sh index de179dc8c92..e76fce13128 100755 --- a/examples/models/moshi/mimi/install_requirements.sh +++ 
b/examples/models/moshi/mimi/install_requirements.sh @@ -8,7 +8,7 @@ set -x sudo apt install ffmpeg -y -pip install torchcodec==0.11.0.dev20260217 --extra-index-url https://download.pytorch.org/whl/nightly/cpu +pip install torchcodec==0.11.0.dev20260312 --extra-index-url https://download.pytorch.org/whl/nightly/cpu pip install moshi==0.2.11 pip install bitsandbytes soundfile einops # Run llama2/install requirements for torchao deps diff --git a/exir/sym_util.py b/exir/sym_util.py index 64f4b64a32a..6b7a38ae224 100644 --- a/exir/sym_util.py +++ b/exir/sym_util.py @@ -25,7 +25,10 @@ def eval_expr(symint: Union[int, torch.SymInt]) -> Optional[int]: shape_env = node.shape_env expr = node.expr try: - output = shape_env.size_hint(expr) + if hasattr(shape_env, "guarding_hint_or_throw"): + output = shape_env.guarding_hint_or_throw(expr) + else: + output = shape_env.size_hint(expr) except torch.fx.experimental.symbolic_shapes.GuardOnDataDependentSymNode: return None return int(output) diff --git a/runtime/core/portable_type/c10/c10/util/complex_math.h b/runtime/core/portable_type/c10/c10/util/complex_math.h index 2b591026c94..d369df50592 100644 --- a/runtime/core/portable_type/c10/c10/util/complex_math.h +++ b/runtime/core/portable_type/c10/c10/util/complex_math.h @@ -86,6 +86,41 @@ C10_HOST_DEVICE inline c10::complex pow( #endif } +// Regression in ROCm 7.2. See https://github.com/ROCm/rocm-libraries/pull/3836. +// Specialized version for complex on AMD GPUs to use FMA-based +// multiplication +#if defined(__HIPCC__) +namespace detail { +// FMA-aware complex multiplication for float precision on AMD GPUs. +// This prevents SLP vectorizer from breaking FMA formation, which causes +// numerical precision loss in complex arithmetic. +// The issue occurs when vectorizer packs scalar multiplies before backend +// can form FMA instructions, resulting in double rounding instead of single. 
+C10_HOST_DEVICE inline thrust::complex complex_mul_fma( + thrust::complex a, + thrust::complex b) { + // Complex multiplication: (a.r + a.i*i) * (b.r + b.i*i) + // = (a.r*b.r - a.i*b.i) + (a.r*b.i + a.i*b.r)*i + // Using __builtin_fmaf ensures FMA at source level: + // real: a.r*b.r + (-(a.i*b.i)) = FMA(a.r, b.r, -(a.i*b.i)) + // imag: a.i*b.r + a.r*b.i = FMA(a.r, b.i, a.i*b.r) + float real_part = __builtin_fmaf(a.real(), b.real(), -(a.imag() * b.imag())); + float imag_part = __builtin_fmaf(a.real(), b.imag(), a.imag() * b.real()); + return thrust::complex(real_part, imag_part); +} +} // namespace detail + +template <> +C10_HOST_DEVICE inline c10::complex pow( + const c10::complex& x, + const c10::complex& y) { + auto log_x = thrust::log(static_cast>(x)); + auto y_log_x = + detail::complex_mul_fma(static_cast>(y), log_x); + return static_cast>(thrust::exp(y_log_x)); +} +#endif + template C10_HOST_DEVICE inline c10::complex pow( const c10::complex& x, diff --git a/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h b/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h index 63aa0d20d8e..880e741abf6 100644 --- a/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h +++ b/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h @@ -629,7 +629,7 @@ __host__ __device__ // This macro is used to find older C++ compilers // that don't support move optimization for return values. 
-#if (defined(__GNUC__) && __GNUC__ < 13) || \ +#if (defined(__GNUC__) && __GNUC__ < 13 && __cplusplus < 202002L) || \ (defined(__clang_major__) && __clang_major__ < 13) #define C10_RETURN_MOVE_IF_OLD_COMPILER 1 #else diff --git a/torch_pin.py b/torch_pin.py index 2dd1ac62f51..337054b5dd5 100644 --- a/torch_pin.py +++ b/torch_pin.py @@ -1,2 +1,2 @@ -TORCH_VERSION = "2.11.0" -NIGHTLY_VERSION = "dev20260215" +TORCH_VERSION = "2.12.0" +NIGHTLY_VERSION = "dev20260312" From bfe854dd75036a8d0d2000966aff2b0f05a100e7 Mon Sep 17 00:00:00 2001 From: Manuel Candales <42380156+manuelcandales@users.noreply.github.com> Date: Tue, 17 Mar 2026 12:52:24 -0400 Subject: [PATCH 2/3] fix linear bias decomposition invocation (#18168) Fix parakeet/voxtral realtime export using PyTorch release. In both `export_parakeet_tdt.py` and `export_voxtral_rt.py`, replaced the manual decomposition dictionary with `torch.export.default_decompositions()` and added the custom `_linear_bias_decomposition` for `torch.ops.aten.linear.default`. 
--- examples/models/parakeet/export_parakeet_tdt.py | 8 +++----- examples/models/voxtral_realtime/export_voxtral_rt.py | 6 +++--- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/examples/models/parakeet/export_parakeet_tdt.py b/examples/models/parakeet/export_parakeet_tdt.py index f3ed0d2b070..8dd9accd866 100644 --- a/examples/models/parakeet/export_parakeet_tdt.py +++ b/examples/models/parakeet/export_parakeet_tdt.py @@ -508,13 +508,11 @@ def _create_metal_partitioners(programs): # Run decompositions for non-preprocessor programs updated_programs = {} + decomp_table = torch.export.default_decompositions() + decomp_table[torch.ops.aten.linear.default] = _linear_bias_decomposition for key, ep in programs.items(): - # print(f"Running decompositions for {key}") - # print(ep.graph_module) if key != "preprocessor": - updated_programs[key] = ep.run_decompositions( - {torch.ops.aten.linear.default: _linear_bias_decomposition} - ) + updated_programs[key] = ep.run_decompositions(decomp_table) else: updated_programs[key] = ep diff --git a/examples/models/voxtral_realtime/export_voxtral_rt.py b/examples/models/voxtral_realtime/export_voxtral_rt.py index d3fc9323806..951f1f606d5 100644 --- a/examples/models/voxtral_realtime/export_voxtral_rt.py +++ b/examples/models/voxtral_realtime/export_voxtral_rt.py @@ -394,10 +394,10 @@ def lower_to_executorch(programs, metadata, backend="xnnpack"): # Run decompositions for Metal backend updated_programs = {} + decomp_table = torch.export.default_decompositions() + decomp_table[torch.ops.aten.linear.default] = _linear_bias_decomposition for key, ep in programs.items(): - updated_programs[key] = ep.run_decompositions( - {torch.ops.aten.linear.default: _linear_bias_decomposition} - ) + updated_programs[key] = ep.run_decompositions(decomp_table) programs = updated_programs partitioner = {} From 96b20df51d216ab409e526563d007681d3df04cc Mon Sep 17 00:00:00 2001 From: Mergen Nachin Date: Wed, 18 Mar 2026 07:05:31 -0400 Subject: 
[PATCH 3/3] Update to 3.18 --- .ci/docker/ci_commit_pins/pytorch.txt | 2 +- .ci/scripts/test_model_e2e.sh | 2 +- examples/models/moshi/mimi/install_requirements.sh | 2 +- torch_pin.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt index c90fb19f167..bfe95e4c6c5 100644 --- a/.ci/docker/ci_commit_pins/pytorch.txt +++ b/.ci/docker/ci_commit_pins/pytorch.txt @@ -1 +1 @@ -08b6f48d871affbc7abe9277020aed882fdf110a +56676a60b250217621ce514d8a3f3cf576cead70 diff --git a/.ci/scripts/test_model_e2e.sh b/.ci/scripts/test_model_e2e.sh index 0791b7b2df5..f6e7333d9f2 100755 --- a/.ci/scripts/test_model_e2e.sh +++ b/.ci/scripts/test_model_e2e.sh @@ -248,7 +248,7 @@ if [ "$AUDIO_URL" != "" ]; then elif [[ "$MODEL_NAME" == *whisper* ]] || [ "$MODEL_NAME" = "voxtral_realtime" ]; then conda install -y -c conda-forge "ffmpeg<8" pip install datasets soundfile - pip install torchcodec==0.11.0.dev20260312 --extra-index-url https://download.pytorch.org/whl/nightly/cpu + pip install torchcodec==0.11.0.dev20260318 --extra-index-url https://download.pytorch.org/whl/nightly/cpu python -c "from datasets import load_dataset;import soundfile as sf;sample = load_dataset('distil-whisper/librispeech_long', 'clean', split='validation')[0]['audio'];sf.write('${MODEL_DIR}/$AUDIO_FILE', sample['array'][:sample['sampling_rate']*30], sample['sampling_rate'])" fi diff --git a/examples/models/moshi/mimi/install_requirements.sh b/examples/models/moshi/mimi/install_requirements.sh index e76fce13128..ab494ba4388 100755 --- a/examples/models/moshi/mimi/install_requirements.sh +++ b/examples/models/moshi/mimi/install_requirements.sh @@ -8,7 +8,7 @@ set -x sudo apt install ffmpeg -y -pip install torchcodec==0.11.0.dev20260312 --extra-index-url https://download.pytorch.org/whl/nightly/cpu +pip install torchcodec==0.11.0.dev20260318 --extra-index-url https://download.pytorch.org/whl/nightly/cpu pip install 
moshi==0.2.11 pip install bitsandbytes soundfile einops # Run llama2/install requirements for torchao deps diff --git a/torch_pin.py b/torch_pin.py index 337054b5dd5..1b812045c99 100644 --- a/torch_pin.py +++ b/torch_pin.py @@ -1,2 +1,2 @@ TORCH_VERSION = "2.12.0" -NIGHTLY_VERSION = "dev20260312" +NIGHTLY_VERSION = "dev20260318"