@@ -191,7 +191,7 @@ void aicpu_orchestration_entry(PTO2Runtime* rt, uint64_t* args, int arg_count, i
     params_up.add_input(oi_new_b);
     params_up.add_inout(mi_batch);
     params_up.add_inout(li_batch);
-    params_up.add_output(oi_batch);
+    params_up.add_inout(oi_batch);
     params_up.add_output(out);
     params_up.add_scalar(is_first);
     params_up.add_scalar(is_last);
@@ -0,0 +1,167 @@
# Multi-TensorMap Rewrite

Date: 2026-03-19

## Background

The old multi-tensormap direction had two persistent problems:

- fallback history did not share the same lifecycle model as owner history
- owner and fallback logic were drifting toward separate implementations

The rewrite fixes both by making producer retirement the only lifecycle
source and forcing owner/fallback to share one shard implementation.

## Goals

- Keep same-ring owner history on a ring-local fast path
- Support cross-ring `INOUT` and external tensors through fallback storage
- Bind stale/cleanup semantics to real producer retirement
- Keep `sync_tensormap()` interface unchanged
- Force owner and fallback to share one core implementation

## Tensor Model

`Tensor.ring_id` identifies the tensor's owner ring.

- `ring_id in [0, PTO2_MAX_RING_DEPTH)`: internal tensor
- `ring_id == TENSOR_RING_ID_NONE`: external tensor

Submit rules:

- internal `OUTPUT` must satisfy `tensor.ring_id == submit_ring`
- `INOUT` and `INPUT` must not rewrite owner ring at submit time
- external tensors stay external; runtime must not silently assign an
internal owner ring
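As a sketch, the submit rules reduce to one predicate. The helper name, the `ParamType` enum, and the concrete `TENSOR_RING_ID_NONE` value below are illustrative assumptions, not the runtime's actual API:

```cpp
#include <cassert>
#include <cstdint>

constexpr uint8_t kMaxRingDepth = 4;     // PTO2_MAX_RING_DEPTH
constexpr uint8_t kRingIdNone   = 0xFF;  // TENSOR_RING_ID_NONE (assumed value)

enum class ParamType { INPUT, OUTPUT, INOUT };

// True when submitting this tensor as `type` on `submit_ring` is legal.
bool submit_ring_ok(uint8_t tensor_ring, ParamType type, uint8_t submit_ring) {
  bool internal = tensor_ring < kMaxRingDepth;
  if (internal && type == ParamType::OUTPUT)
    return tensor_ring == submit_ring;  // internal OUTPUT: owner ring only
  // INPUT/INOUT never rewrite the owner ring, and external tensors are
  // never silently assigned one, so no other combination is rejected here.
  return true;
}
```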

## Entry Model

Each tensormap entry stores:

- `producer_task_id`: the real producer task
- `tensor_owner_ring`: the tensor owner ring, or `TENSOR_RING_ID_NONE`
- `storage_domain`: `OWNER_MAP` or `FALLBACK_MAP`
- overlap metadata: address, version, shape, offsets
- `with_alloc`: whether this history entry came from runtime allocation

The entry does not store a separate fallback lifecycle key.

Two derived values drive lifecycle handling:

- `producer_ring = producer_task_id.ring()`
- `producer_local = producer_task_id.local()`
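A minimal sketch of the entry and its derived values; the task-id packing (ring in the high byte, local index in the low 24 bits) is an assumption for illustration, not the committed encoding:

```cpp
#include <cassert>
#include <cstdint>

// Assumed packing: ring in the top 8 bits, local index in the low 24.
struct TaskId {
  uint32_t raw;
  uint8_t ring()  const { return static_cast<uint8_t>(raw >> 24); }
  int32_t local() const { return static_cast<int32_t>(raw & 0xFFFFFF); }
};

enum class StorageDomain : uint8_t { OWNER_MAP, FALLBACK_MAP };

struct TensorMapEntry {
  TaskId producer_task_id;       // the real producer task
  uint8_t tensor_owner_ring;     // or TENSOR_RING_ID_NONE
  StorageDomain storage_domain;  // OWNER_MAP or FALLBACK_MAP
  bool with_alloc;               // entry came from runtime allocation
  // overlap metadata (address, version, shape, offsets) elided
};
```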

## Shared Shard Core

Owner and fallback both use the same template:

```cpp
template <int32_t NumCleanupDomains, bool BreakOnStale>
struct TensorMapShardImpl;
```

Concrete instances:

- `OwnerTensorMapShard = TensorMapShardImpl<1, true>`
- `FallbackTensorMapShard = TensorMapShardImpl<PTO2_MAX_RING_DEPTH, false>`

This keeps one method body for:

- `init`
- `destroy`
- `lookup`
- `insert`
- `remove_entry`
- `cleanup_range`

Differences are expressed only through template parameters and entry
metadata, not through specialized method bodies.

## Cleanup Domains

`cleanup_domain` is a shard-local concept, not a stored field.

For owner shards:

- there is exactly one cleanup domain
- every entry maps to cleanup domain `0`

For the fallback shard:

- there is one cleanup domain per producer ring
- an entry maps to `producer_task_id.ring()`

This is why fallback mirrors `last_task_alive[ring]` for every producer
ring instead of maintaining a fake global frontier.
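The mapping can be sketched as a single derived function; `NumCleanupDomains` mirrors the shard core's template parameter, and the free function itself is illustrative rather than an actual shard member:

```cpp
#include <cassert>
#include <cstdint>

// cleanup_domain is computed, never stored. Owner shards
// (NumCleanupDomains == 1) collapse every entry to domain 0; the fallback
// shard keys each entry by its producer ring.
template <int NumCleanupDomains>
int cleanup_domain_of(uint8_t producer_ring) {
  return NumCleanupDomains == 1 ? 0 : producer_ring;
}
```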

## Routing Rules

### Lookup

- internal tensor: query owner shard first, then fallback shard
- external tensor: query fallback shard only

### Insert

- internal `OUTPUT`: owner shard of the submit ring
- same-ring internal `INOUT`: owner shard of the submit ring
- cross-ring internal `INOUT`: fallback shard
- external `OUTPUT` / `INOUT`: fallback shard
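The insert table above can be sketched as a routing function. The `Route` enum and standalone helper are illustrative; in the committed code the decision happens inside `tensor_map.insert()`:

```cpp
#include <cassert>
#include <cstdint>

constexpr uint8_t kMaxRingDepth = 4;  // PTO2_MAX_RING_DEPTH

enum class ParamType { INPUT, OUTPUT, INOUT };
enum class Route { OWNER_SHARD_OF_SUBMIT_RING, FALLBACK_SHARD };

Route route_insert(uint8_t tensor_ring, ParamType type, uint8_t submit_ring) {
  if (tensor_ring >= kMaxRingDepth)               // external OUTPUT / INOUT
    return Route::FALLBACK_SHARD;
  if (type == ParamType::OUTPUT)                  // internal OUTPUT
    return Route::OWNER_SHARD_OF_SUBMIT_RING;
  return tensor_ring == submit_ring               // internal INOUT:
             ? Route::OWNER_SHARD_OF_SUBMIT_RING  //   same-ring
             : Route::FALLBACK_SHARD;             //   cross-ring
}
```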

### Remove

`remove_entry()` routes by `storage_domain`:

- `OWNER_MAP`: remove from the owner shard indexed by `tensor_owner_ring`
- `FALLBACK_MAP`: remove from fallback shard

## Cleanup Semantics

Stale is defined only by producer retirement.

Shared validity rule:

```cpp
entry.producer_task_id.local() >=
shard.last_task_alives[cleanup_domain_of(entry)]
```

Lookup behavior:

- owner shards may `break` on first stale entry because each owner shard is
a single lifecycle domain
- fallback shard must continue scanning because its bucket chains mix
producer rings
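The `BreakOnStale` difference can be sketched as a bucket-chain scan. This is simplified to one frontier per call; the fallback shard really checks each entry against its own producer ring's frontier. In an owner chain, everything after the first stale entry is also stale, which is what makes `break` safe there; the test chain below is fallback-style, where a live entry can follow a stale one:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Count entries still valid in one bucket chain, given the frontier for the
// relevant cleanup domain. BreakOnStale mirrors the shard template parameter.
template <bool BreakOnStale>
int count_live(const std::vector<int32_t>& producer_locals,
               int32_t last_task_alive) {
  int live = 0;
  for (int32_t local : producer_locals) {
    if (local < last_task_alive) {  // stale by the shared validity rule
      if (BreakOnStale) break;      // owner shard: one lifecycle domain
      continue;                     // fallback shard: chains mix rings
    }
    ++live;
  }
  return live;
}
```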

Cleanup behavior:

- `sync_tensormap()` reads real `last_task_alive` values from shared memory
- owner shard `R` cleans retired range on domain `0`
- fallback shard cleans retired range on domain `R`

No fallback-private lifecycle frontier exists.
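The fan-out can be sketched as follows; the `Shard` stand-in and the free function are illustrative, and the real `cleanup_range` also evicts the entries below the new frontier:

```cpp
#include <cassert>
#include <cstdint>

constexpr int kMaxRingDepth = 4;  // PTO2_MAX_RING_DEPTH

// Minimal stand-in for the shard core: cleanup_range advances one domain's
// lifecycle frontier.
struct Shard {
  int32_t last_task_alives[kMaxRingDepth] = {};
  void cleanup_range(int domain, int32_t frontier) {
    last_task_alives[domain] = frontier;
  }
};

// When ring R retires up to sm_last_task_alive, owner shard R cleans its
// single domain 0 and the fallback shard cleans its domain R.
void sync_tensormap_sketch(Shard owner_shards[], Shard& fallback_shard,
                           uint8_t ring_id, int32_t sm_last_task_alive) {
  owner_shards[ring_id].cleanup_range(/*domain=*/0, sm_last_task_alive);
  fallback_shard.cleanup_range(/*domain=*/ring_id, sm_last_task_alive);
}
```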

## Main Invariants

Owner shard entry:

- `storage_domain == OWNER_MAP`
- `tensor_owner_ring == producer_task_id.ring()`

Fallback shard entry:

- `storage_domain == FALLBACK_MAP`
- cleanup is driven only by `producer_task_id.ring()`

Global invariant:

- owner and fallback share one core implementation
- differences must not grow into two independent algorithms

## Current Implementation Notes

The committed implementation also keeps two important user-facing choices:

- `sync_tensormap(uint8_t ring_id, int32_t sm_last_task_alive)` stays
unchanged
- `with_alloc` follows allocation semantics, not `PTOParamType` alone
@@ -23,9 +23,18 @@

 // Type headers needed by orchestration
 #include "pto_types.h" // PTOParam, PTOTensorEntry, PTOParamType
-#include "tensor.h" // Tensor, make_tensor, make_tensor_external
+#include "tensor.h" // Tensor struct
 #include "pto_submit_types.h" // MixedKernels, INVALID_KERNEL_ID, subtask slots
+
+// Multi-ring: number of independent ring layers (HeapRing + TaskRing + DepPool per layer)
+// Scope depth maps to ring index via: min(scope_depth, PTO2_MAX_RING_DEPTH - 1)
+#define PTO2_MAX_RING_DEPTH 4
+
+// Thread-local scope depth for tensor factory functions.
+// Incremented/decremented by PTO2ScopeGuard and standalone scope wrappers.
+// Tensor ring selection clamps this depth to the runtime's valid ring range.
+static thread_local uint8_t __pto2_ring_id = 0;

Review comment (severity: high):

The static thread_local variable __pto2_ring_id ties tensor ring selection to the thread's scope depth. thread_local avoids cross-thread interference, but this is still global mutable state within a single thread: incorrect nesting of PTO2_SCOPE blocks, or direct manipulation of the variable outside the intended RAII guards, can leave an inconsistent ring_id on newly created tensors. Such tensors would be routed to the wrong owner or fallback shard, causing subtle, hard-to-debug correctness issues in the tensor-mapping logic.
// =============================================================================
// Ops Table and Opaque Runtime
// =============================================================================
@@ -99,10 +108,12 @@ static inline void pto2_rt_submit_aiv_task(PTO2Runtime* rt, int32_t kernel_id,

 static inline void pto2_rt_scope_begin(PTO2Runtime* rt) {
   rt->ops->scope_begin(rt);
+  __pto2_ring_id++;
 }
 
 static inline void pto2_rt_scope_end(PTO2Runtime* rt) {
   rt->ops->scope_end(rt);
+  __pto2_ring_id--;
 }

static inline void pto2_rt_orchestration_done(PTO2Runtime* rt) {
@@ -113,6 +124,59 @@ static inline bool pto2_rt_is_fatal(PTO2Runtime* rt) {
return rt->ops->is_fatal(rt);
}

// =============================================================================
// Tensor Factory Functions
// =============================================================================

/**
* Create a Tensor for pre-allocated external memory.
*/
static inline Tensor make_tensor_external(void* addr,
const uint32_t shapes[],
uint32_t ndims,
DataType dtype = DataType::FLOAT32,
bool manual_dep = false,
int32_t version = 0) {
static uint32_t zero_offsets[RUNTIME_MAX_TENSOR_DIMS] = {};
uint64_t total = 1;
for (uint32_t i = 0; i < ndims; i++) {
total *= shapes[i];
}
return Tensor(addr, total * get_element_size(dtype), shapes, shapes, zero_offsets, ndims, dtype, version,
/*is_all_offset_zero=*/true, /*is_raw_eq_shapes=*/true, manual_dep,
TENSOR_RING_ID_NONE);
}

static inline Tensor make_tensor_with_ring(const uint32_t shapes[],
uint32_t ndims,
DataType dtype,
bool manual_dep,
int32_t version,
uint8_t ring_id) {
static uint32_t zero_offsets[RUNTIME_MAX_TENSOR_DIMS] = {};
uint64_t total = 1;
for (uint32_t i = 0; i < ndims; i++) {
total *= shapes[i];
}
return Tensor(0, total * get_element_size(dtype), shapes, shapes, zero_offsets, ndims, dtype, version,
/*is_all_offset_zero=*/true, /*is_raw_eq_shapes=*/true, manual_dep, ring_id);
}

static inline uint8_t current_tensor_ring_id() {
return __pto2_ring_id < PTO2_MAX_RING_DEPTH ? __pto2_ring_id : PTO2_MAX_RING_DEPTH - 1;
}

/**
* Create a Tensor for runtime-allocated output (addr=0).
* Uses the thread-local scope depth set by PTO2ScopeGuard, clamped to the
* runtime ring range to match PTO2OrchestratorState::current_ring_id().
*/
static inline Tensor make_tensor(const uint32_t shapes[], uint32_t ndims,
DataType dtype = DataType::FLOAT32, bool manual_dep = false,
int32_t version = 0) {
return make_tensor_with_ring(shapes, ndims, dtype, manual_dep, version, current_tensor_ring_id());
}

// =============================================================================
// Logging Macros for Orchestration (call through ops table)
// =============================================================================
@@ -133,10 +197,10 @@
 class PTO2ScopeGuard {
 public:
   PTO2ScopeGuard(PTO2Runtime* rt) : rt_(rt) {
-    rt_->ops->scope_begin(rt_);
+    pto2_rt_scope_begin(rt_);
   }
   ~PTO2ScopeGuard() {
-    rt_->ops->scope_end(rt_);
+    pto2_rt_scope_end(rt_);
   }
 private:
   PTO2Runtime* rt_;
@@ -390,10 +390,12 @@ void pto2_submit_mixed_task(
   CYCLE_COUNT_LAP_RECORD(g_orch_alloc_cycle, AicpuPhaseId::ORCH_ALLOC, local_id);
 
   // === STEP 2: Calculate output size + heap alloc (read from params only, no GM access) ===
+  bool needs_alloc[PTO2_MAX_TENSOR_PARAMS] = {};
   int32_t total_output_size = 0;
   for (int i = 0; i < params.tensor_count; i++) {
     if (params.tensor_types[i] == PTOParamType::OUTPUT
         && params.tensors[i]->buffer.addr == 0) {
+      needs_alloc[i] = true;
       total_output_size += PTO2_ALIGN_UP(params.tensors[i]->buffer.size, PTO2_PACKED_OUTPUT_ALIGN);
     }
   }
@@ -491,7 +493,7 @@ void pto2_submit_mixed_task(
     PTOParamType ptype = params.tensor_types[i];
     if (ptype == PTOParamType::OUTPUT || ptype == PTOParamType::INOUT) {
       if (!params.tensors[i]->manual_dep) {
-        orch->tensor_map.insert(*params.tensors[i], mixed_task_id, ptype == PTOParamType::OUTPUT);
+        orch->tensor_map.insert(*params.tensors[i], mixed_task_id, ptype, needs_alloc[i]);
       }
     }
   }