[HAL/AMDGPU] Track upload ring reclaim positions

Extend notification-ring reclaim entries so queue-owned upload bytes can
retire through the same completion epoch as kernargs. The new
reclaim-position API reports both kernarg and upload ring watermarks,
while the existing kernarg-only wrappers remain for current callers and
tests that do not care about upload storage.

Thread the upload watermark through kernel-shaped host queue
submissions, including the failed-submission noop path that plugs
already-reserved AQL slots. Host queue drain and teardown now reclaim
all queue-owned ring positions through one helper; the upload watermark
stays zero until a submission path actually allocates upload spans, so
static command-buffer replay remains untouched.

Add notification-ring coverage for reporting both queue-owned
watermarks across zero-signal epochs.

diff --git a/runtime/src/iree/hal/drivers/amdgpu/host_queue.c b/runtime/src/iree/hal/drivers/amdgpu/host_queue.c
index 44d659f..d5dff81 100644
--- a/runtime/src/iree/hal/drivers/amdgpu/host_queue.c
+++ b/runtime/src/iree/hal/drivers/amdgpu/host_queue.c

@@ -89,6 +89,23 @@
       queue, queue_device_reservation);
 }
 
+static void iree_hal_amdgpu_host_queue_reclaim_queue_owned_positions(
+    iree_hal_amdgpu_host_queue_t* queue,
+    iree_hal_amdgpu_reclaim_positions_t reclaim_positions) {
+  if (reclaim_positions.kernarg_write_position > 0) {  // 0: none retired
+    iree_hal_amdgpu_kernarg_ring_reclaim(
+        &queue->kernarg_ring, reclaim_positions.kernarg_write_position);
+  }
+  if (reclaim_positions.queue_upload_write_position > 0) {  // 0: none retired
+    IREE_ASSERT(queue->queue_upload_ring.base,
+                "queue upload bytes retired without an initialized upload "
+                "ring");
+    iree_hal_amdgpu_queue_upload_ring_reclaim(
+        &queue->queue_upload_ring,
+        reclaim_positions.queue_upload_write_position);
+  }
+}
+
 //===----------------------------------------------------------------------===//
 // Initialization / deinitialization
 //===----------------------------------------------------------------------===//
@@ -141,20 +158,20 @@
       &queue->error_status, iree_memory_order_acquire);
   const uint64_t previous_epoch = (uint64_t)iree_atomic_load(
       &queue->notification_ring.epoch.last_drained, iree_memory_order_relaxed);
-  uint64_t kernarg_reclaim_position = 0;
+  iree_hal_amdgpu_reclaim_positions_t reclaim_positions = {0};
   iree_host_size_t count = 0;
   if (IREE_UNLIKELY(error)) {
-    count = iree_hal_amdgpu_notification_ring_fail_all(
-        &queue->notification_ring, error, &kernarg_reclaim_position);
+    count = iree_hal_amdgpu_notification_ring_fail_all_reclaim_positions(
+        &queue->notification_ring, error, &reclaim_positions);
     iree_hal_amdgpu_host_queue_clear_profile_events(queue);
     iree_async_frontier_tracker_fail_axis(
         queue->frontier_tracker, queue->axis,
         iree_status_from_code(iree_status_code(error)));
   } else {
-    count = iree_hal_amdgpu_notification_ring_drain(
+    count = iree_hal_amdgpu_notification_ring_drain_reclaim_positions(
         &queue->notification_ring,
         /*fallback_frontier=*/NULL, iree_hal_amdgpu_host_queue_reclaim_retired,
-        queue, &kernarg_reclaim_position);
+        queue, &reclaim_positions);
     const uint64_t current_epoch =
         (uint64_t)iree_atomic_load(&queue->notification_ring.epoch.last_drained,
                                    iree_memory_order_acquire);
@@ -163,10 +180,8 @@
                                           current_epoch);
     }
   }
-  if (kernarg_reclaim_position > 0) {
-    iree_hal_amdgpu_kernarg_ring_reclaim(&queue->kernarg_ring,
-                                         kernarg_reclaim_position);
-  }
+  iree_hal_amdgpu_host_queue_reclaim_queue_owned_positions(queue,
+                                                           reclaim_positions);
   iree_hal_amdgpu_host_queue_run_post_drain_actions(queue);
   return count;
 }
@@ -539,22 +554,20 @@
   // error. Otherwise drain normally (entries completed but not yet processed).
   iree_status_t error = (iree_status_t)iree_atomic_load(
       &queue->error_status, iree_memory_order_acquire);
-  uint64_t kernarg_reclaim_position = 0;
+  iree_hal_amdgpu_reclaim_positions_t reclaim_positions = {0};
   if (!iree_status_is_ok(error)) {
-    iree_hal_amdgpu_notification_ring_fail_all(&queue->notification_ring, error,
-                                               &kernarg_reclaim_position);
+    iree_hal_amdgpu_notification_ring_fail_all_reclaim_positions(
+        &queue->notification_ring, error, &reclaim_positions);
     iree_hal_amdgpu_host_queue_clear_profile_events(queue);
     iree_status_free(error);
   } else {
-    iree_hal_amdgpu_notification_ring_drain(
+    iree_hal_amdgpu_notification_ring_drain_reclaim_positions(
         &queue->notification_ring,
         /*fallback_frontier=*/NULL, iree_hal_amdgpu_host_queue_reclaim_retired,
-        queue, &kernarg_reclaim_position);
+        queue, &reclaim_positions);
   }
-  if (kernarg_reclaim_position > 0) {
-    iree_hal_amdgpu_kernarg_ring_reclaim(&queue->kernarg_ring,
-                                         kernarg_reclaim_position);
-  }
+  iree_hal_amdgpu_host_queue_reclaim_queue_owned_positions(queue,
+                                                           reclaim_positions);
   iree_hal_amdgpu_host_queue_run_post_drain_actions(queue);
 
   // Deregister from the epoch signal table before destroying the notification
@@ -577,6 +590,11 @@
 
   iree_hal_amdgpu_notification_ring_deinitialize(&queue->notification_ring);
 
+  if (queue->queue_upload_ring.base) {
+    iree_hal_amdgpu_queue_upload_ring_deinitialize(queue->libhsa,
+                                                   &queue->queue_upload_ring);
+  }
+
   iree_hal_amdgpu_kernarg_ring_deinitialize(queue->libhsa,
                                             &queue->kernarg_ring);
 

diff --git a/runtime/src/iree/hal/drivers/amdgpu/host_queue.h b/runtime/src/iree/hal/drivers/amdgpu/host_queue.h
index 9d90c8a..5fbdcd3 100644
--- a/runtime/src/iree/hal/drivers/amdgpu/host_queue.h
+++ b/runtime/src/iree/hal/drivers/amdgpu/host_queue.h

@@ -24,6 +24,7 @@
 #include "iree/hal/drivers/amdgpu/util/libhsa.h"
 #include "iree/hal/drivers/amdgpu/util/notification_ring.h"
 #include "iree/hal/drivers/amdgpu/util/pm4_capabilities.h"
+#include "iree/hal/drivers/amdgpu/util/queue_upload_ring.h"
 #include "iree/hal/drivers/amdgpu/virtual_queue.h"
 #include "iree/hal/pool.h"
 #include "iree/hal/profile_schema.h"
@@ -172,6 +173,10 @@
   // Per-queue kernarg bump allocator backed by HSA kernarg-init memory.
   iree_hal_amdgpu_kernarg_ring_t kernarg_ring;
 
+  // Optional per-queue upload ring for device-visible control records.
+  // Initialized when a submission path first needs device-side fixup inputs.
+  iree_hal_amdgpu_queue_upload_ring_t queue_upload_ring;
+
   // Optional per-AQL-slot PM4 IB buffer used by PM4-backed wait, transfer, and
   // profiling snippets. This is not an independent scheduling ring: each slot
   // is indexed by the matching AQL packet id and inherits the AQL ring's

diff --git a/runtime/src/iree/hal/drivers/amdgpu/host_queue_submission.c b/runtime/src/iree/hal/drivers/amdgpu/host_queue_submission.c
index 7054100..52dc842 100644
--- a/runtime/src/iree/hal/drivers/amdgpu/host_queue_submission.c
+++ b/runtime/src/iree/hal/drivers/amdgpu/host_queue_submission.c

@@ -151,8 +151,10 @@
 static void iree_hal_amdgpu_host_queue_emit_reclaim_noop_packets(
     iree_hal_amdgpu_host_queue_t* queue,
     iree_hal_amdgpu_reclaim_entry_t* reclaim_entry, uint64_t first_packet_id,
-    uint32_t packet_count, uint64_t kernarg_write_position) {
+    uint32_t packet_count, uint64_t kernarg_write_position,
+    uint64_t queue_upload_write_position) {
   reclaim_entry->kernarg_write_position = kernarg_write_position;
+  reclaim_entry->queue_upload_write_position = queue_upload_write_position;
   reclaim_entry->count = 0;
   iree_hal_amdgpu_notification_ring_advance_epoch(&queue->notification_ring);
   for (uint32_t i = 0; i < packet_count; ++i) {
@@ -547,7 +549,8 @@
     iree_hal_amdgpu_host_queue_kernel_submission_t* submission) {
   iree_hal_amdgpu_host_queue_emit_reclaim_noop_packets(
       queue, submission->reclaim_entry, submission->first_packet_id,
-      submission->packet_count, submission->kernarg_write_position);
+      submission->packet_count, submission->kernarg_write_position,
+      submission->queue_upload_write_position);
   memset(submission, 0, sizeof(*submission));
 }
 
@@ -855,6 +858,8 @@
   }
   submission->reclaim_entry->kernarg_write_position =
       submission->kernarg_write_position;
+  submission->reclaim_entry->queue_upload_write_position =
+      submission->queue_upload_write_position;
   submission->reclaim_entry->count = submission->reclaim_resource_count;
   submission->reclaim_entry->pre_signal_action = submission->pre_signal_action;
   iree_hal_amdgpu_host_queue_merge_barrier_axes(queue, resolution);

diff --git a/runtime/src/iree/hal/drivers/amdgpu/host_queue_submission.h b/runtime/src/iree/hal/drivers/amdgpu/host_queue_submission.h
index 2aebff2..0166543 100644
--- a/runtime/src/iree/hal/drivers/amdgpu/host_queue_submission.h
+++ b/runtime/src/iree/hal/drivers/amdgpu/host_queue_submission.h

@@ -68,6 +68,8 @@
   uint64_t first_packet_id;
   // Kernarg ring write position to reclaim after this submission completes.
   uint64_t kernarg_write_position;
+  // Queue upload ring write position to reclaim after completion.
+  uint64_t queue_upload_write_position;
   // Number of AQL packets reserved starting at |first_packet_id|.
   uint32_t packet_count;
   // Number of valid entries in |reclaim_resources|.

diff --git a/runtime/src/iree/hal/drivers/amdgpu/util/notification_ring.c b/runtime/src/iree/hal/drivers/amdgpu/util/notification_ring.c
index 831a8de..032de7b 100644
--- a/runtime/src/iree/hal/drivers/amdgpu/util/notification_ring.c
+++ b/runtime/src/iree/hal/drivers/amdgpu/util/notification_ring.c

@@ -56,6 +56,7 @@
   entry->queue_device_event_count = 0;
   entry->resource_set = NULL;
   entry->kernarg_write_position = 0;
+  entry->queue_upload_write_position = 0;
   entry->count = 0;
   if (count <= IREE_HAL_AMDGPU_RECLAIM_INLINE_CAPACITY) {
     entry->resources = entry->inline_resources;
@@ -107,6 +108,7 @@
   entry->queue_device_event_count = 0;
   entry->resource_set = NULL;
   entry->kernarg_write_position = 0;
+  entry->queue_upload_write_position = 0;
   entry->count = 0;
 }
 
@@ -541,15 +543,15 @@
                                                                   fallback);
 }
 
-iree_host_size_t iree_hal_amdgpu_notification_ring_drain(
+iree_host_size_t iree_hal_amdgpu_notification_ring_drain_reclaim_positions(
     iree_hal_amdgpu_notification_ring_t* ring,
     const iree_async_frontier_t* fallback_frontier,
     iree_hal_amdgpu_reclaim_retire_fn_t retire_fn, void* retire_user_data,
-    uint64_t* out_kernarg_reclaim_position) {
+    iree_hal_amdgpu_reclaim_positions_t* out_reclaim_positions) {
   IREE_ASSERT_ARGUMENT(ring);
-  IREE_ASSERT_ARGUMENT(out_kernarg_reclaim_position);
+  IREE_ASSERT_ARGUMENT(out_reclaim_positions);
 
-  *out_kernarg_reclaim_position = 0;
+  memset(out_reclaim_positions, 0, sizeof(*out_reclaim_positions));
 
   // Early out if the ring was never initialized or already deinitialized.
   if (!ring->epoch.signal.handle) return 0;
@@ -649,6 +651,7 @@
 
   // Release retained resources for all completed epochs.
   uint64_t highest_kernarg_position = 0;
+  uint64_t highest_queue_upload_position = 0;
   for (uint64_t epoch = previous_drained; epoch < current_epoch; ++epoch) {
     uint32_t reclaim_index = (uint32_t)(epoch & (ring->capacity - 1));
     uint64_t kernarg_write_position =
@@ -656,6 +659,11 @@
     if (kernarg_write_position > highest_kernarg_position) {
       highest_kernarg_position = kernarg_write_position;
     }
+    uint64_t queue_upload_write_position =
+        ring->reclaim_entries[reclaim_index].queue_upload_write_position;
+    if (queue_upload_write_position > highest_queue_upload_position) {
+      highest_queue_upload_position = queue_upload_write_position;
+    }
     iree_hal_amdgpu_reclaim_entry_release(&ring->reclaim_entries[reclaim_index],
                                           ring->block_pool);
   }
@@ -665,17 +673,34 @@
   iree_hal_amdgpu_notification_ring_store_position(&ring->read, read,
                                                    iree_memory_order_release);
 
-  *out_kernarg_reclaim_position = highest_kernarg_position;
+  out_reclaim_positions->kernarg_write_position = highest_kernarg_position;
+  out_reclaim_positions->queue_upload_write_position =
+      highest_queue_upload_position;
   return drained_count;
 }
 
-iree_host_size_t iree_hal_amdgpu_notification_ring_fail_all(
-    iree_hal_amdgpu_notification_ring_t* ring, iree_status_t error_status,
+iree_host_size_t iree_hal_amdgpu_notification_ring_drain(
+    iree_hal_amdgpu_notification_ring_t* ring,
+    const iree_async_frontier_t* fallback_frontier,
+    iree_hal_amdgpu_reclaim_retire_fn_t retire_fn, void* retire_user_data,
     uint64_t* out_kernarg_reclaim_position) {
-  IREE_ASSERT_ARGUMENT(ring);
   IREE_ASSERT_ARGUMENT(out_kernarg_reclaim_position);
+  iree_hal_amdgpu_reclaim_positions_t reclaim_positions = {0};
+  iree_host_size_t drained_count =
+      iree_hal_amdgpu_notification_ring_drain_reclaim_positions(
+          ring, fallback_frontier, retire_fn, retire_user_data,
+          &reclaim_positions);
+  *out_kernarg_reclaim_position = reclaim_positions.kernarg_write_position;
+  return drained_count;
+}
 
-  *out_kernarg_reclaim_position = 0;
+iree_host_size_t iree_hal_amdgpu_notification_ring_fail_all_reclaim_positions(
+    iree_hal_amdgpu_notification_ring_t* ring, iree_status_t error_status,
+    iree_hal_amdgpu_reclaim_positions_t* out_reclaim_positions) {
+  IREE_ASSERT_ARGUMENT(ring);
+  IREE_ASSERT_ARGUMENT(out_reclaim_positions);
+
+  memset(out_reclaim_positions, 0, sizeof(*out_reclaim_positions));
 
   iree_host_size_t failed_count = 0;
   uint64_t read = iree_hal_amdgpu_notification_ring_load_position(
@@ -704,6 +729,7 @@
 
   // Release retained resources for all epochs.
   uint64_t highest_kernarg_position = 0;
+  uint64_t highest_queue_upload_position = 0;
   uint64_t last_drained = (uint64_t)iree_atomic_load(&ring->epoch.last_drained,
                                                      iree_memory_order_relaxed);
   for (uint64_t epoch = last_drained; epoch < ring->epoch.next_submission;
@@ -714,6 +740,11 @@
     if (kernarg_write_position > highest_kernarg_position) {
       highest_kernarg_position = kernarg_write_position;
     }
+    uint64_t queue_upload_write_position =
+        ring->reclaim_entries[reclaim_index].queue_upload_write_position;
+    if (queue_upload_write_position > highest_queue_upload_position) {
+      highest_queue_upload_position = queue_upload_write_position;
+    }
     iree_hal_amdgpu_reclaim_entry_execute_pre_signal_action(
         &ring->reclaim_entries[reclaim_index], error_status);
     iree_hal_amdgpu_reclaim_entry_release(&ring->reclaim_entries[reclaim_index],
@@ -726,6 +757,20 @@
   iree_hal_amdgpu_notification_ring_store_position(&ring->read, read,
                                                    iree_memory_order_release);
 
-  *out_kernarg_reclaim_position = highest_kernarg_position;
+  out_reclaim_positions->kernarg_write_position = highest_kernarg_position;
+  out_reclaim_positions->queue_upload_write_position =
+      highest_queue_upload_position;
+  return failed_count;
+}
+
+iree_host_size_t iree_hal_amdgpu_notification_ring_fail_all(
+    iree_hal_amdgpu_notification_ring_t* ring, iree_status_t error_status,
+    uint64_t* out_kernarg_reclaim_position) {
+  IREE_ASSERT_ARGUMENT(out_kernarg_reclaim_position);
+  iree_hal_amdgpu_reclaim_positions_t reclaim_positions = {0};
+  iree_host_size_t failed_count =
+      iree_hal_amdgpu_notification_ring_fail_all_reclaim_positions(
+          ring, error_status, &reclaim_positions);
+  *out_kernarg_reclaim_position = reclaim_positions.kernarg_write_position;
   return failed_count;
 }

diff --git a/runtime/src/iree/hal/drivers/amdgpu/util/notification_ring.h b/runtime/src/iree/hal/drivers/amdgpu/util/notification_ring.h
index 2a0e593..d02119a 100644
--- a/runtime/src/iree/hal/drivers/amdgpu/util/notification_ring.h
+++ b/runtime/src/iree/hal/drivers/amdgpu/util/notification_ring.h

@@ -163,6 +163,14 @@
   void* user_data;
 } iree_hal_amdgpu_reclaim_action_t;
 
+// Queue-owned ring watermarks reported by drain/fail_all for retired epochs.
+typedef struct iree_hal_amdgpu_reclaim_positions_t {
+  // Highest retired kernarg ring write position; 0 when none to reclaim.
+  uint64_t kernarg_write_position;
+  // Highest retired queue upload ring write position; 0 when none to reclaim.
+  uint64_t queue_upload_write_position;
+} iree_hal_amdgpu_reclaim_positions_t;
+
 // Optional callback invoked for one completed epoch after pre-signal actions
 // execute and before user-visible semaphore signals publish.
 typedef void(IREE_API_PTR* iree_hal_amdgpu_reclaim_retire_fn_t)(
@@ -196,6 +204,10 @@
   // report the highest position across retired epochs so the caller can reclaim
   // kernarg blocks. 0 means no kernarg was allocated.
   uint64_t kernarg_write_position;
+  // Queue upload ring write position at the time of this submission.
+  // Drain/fail_all report the highest position across retired epochs so the
+  // caller can reclaim upload bytes. 0 means no upload bytes were allocated.
+  uint64_t queue_upload_write_position;
   // Number of dispatch profiling events reserved by this epoch.
   uint32_t profile_event_count;
   // Number of queue device profiling events reserved by this epoch.
@@ -212,8 +224,8 @@
 // Prepares a reclaim entry for |count| resources. If count fits inline,
 // sets |*out_resources| to the entry's inline storage. Otherwise acquires
 // a block from |block_pool| and sets |*out_resources| to point into it.
-// The caller fills the array with retained resource pointers, sets
-// entry->kernarg_write_position, and sets entry->count before advancing the
+// The caller fills the array with retained resource pointers, sets any
+// queue-owned ring write positions, and sets entry->count before advancing the
 // submission epoch.
 iree_status_t iree_hal_amdgpu_reclaim_entry_prepare(
     iree_hal_amdgpu_reclaim_entry_t* entry, iree_arena_block_pool_t* block_pool,
@@ -401,6 +413,20 @@
 // caller has already merged frontier state into the semaphore at submission
 // time and only needs completion-time timeline advancement/untainting.
 //
+// Stores the highest queue-owned ring positions across all retired epochs in
+// |out_reclaim_positions|. Positions are set to 0 if no epochs were retired.
+//
+// |retire_fn|, when provided, is called once per retired epoch before
+// user-visible semaphore publication. It must not publish user-visible
+// completion itself.
+//
+// Returns the number of entries drained.
+iree_host_size_t iree_hal_amdgpu_notification_ring_drain_reclaim_positions(
+    iree_hal_amdgpu_notification_ring_t* ring,
+    const iree_async_frontier_t* fallback_frontier,
+    iree_hal_amdgpu_reclaim_retire_fn_t retire_fn, void* retire_user_data,
+    iree_hal_amdgpu_reclaim_positions_t* out_reclaim_positions);
+
 // Stores the highest kernarg_write_position across all retired epochs in
 // |out_kernarg_reclaim_position|. Set to 0 if no epochs were retired.
 //
@@ -422,6 +448,14 @@
 //
 // |error_status| is borrowed, not consumed — the caller retains ownership.
 //
+// Stores the highest queue-owned ring positions across all failed entries in
+// |out_reclaim_positions| (same semantics as drain_reclaim_positions).
+//
+// Returns the number of entries failed.
+iree_host_size_t iree_hal_amdgpu_notification_ring_fail_all_reclaim_positions(
+    iree_hal_amdgpu_notification_ring_t* ring, iree_status_t error_status,
+    iree_hal_amdgpu_reclaim_positions_t* out_reclaim_positions);
+
 // Stores the highest kernarg_write_position across all failed entries in
 // |out_kernarg_reclaim_position| (same semantics as drain).
 //

diff --git a/runtime/src/iree/hal/drivers/amdgpu/util/notification_ring_test.cc b/runtime/src/iree/hal/drivers/amdgpu/util/notification_ring_test.cc
index 3d166c8..25f5701 100644
--- a/runtime/src/iree/hal/drivers/amdgpu/util/notification_ring_test.cc
+++ b/runtime/src/iree/hal/drivers/amdgpu/util/notification_ring_test.cc

@@ -157,10 +157,12 @@
 
   static iree_hal_amdgpu_reclaim_entry_t* ReclaimEntryForNextEpoch(
       iree_hal_amdgpu_notification_ring_t* ring,
-      uint64_t kernarg_write_position = 0) {
+      uint64_t kernarg_write_position = 0,
+      uint64_t queue_upload_write_position = 0) {
     iree_hal_amdgpu_reclaim_entry_t* reclaim_entry =
         iree_hal_amdgpu_notification_ring_reclaim_entry(ring);
     reclaim_entry->kernarg_write_position = kernarg_write_position;
+    reclaim_entry->queue_upload_write_position = queue_upload_write_position;
     return reclaim_entry;
   }
 
@@ -644,6 +646,29 @@
   iree_async_semaphore_release(semaphore);
 }
 
+TEST_F(NotificationRingTest, QueueOwnedReclaimPositionReporting) {
+  IREE_ASSERT_OK_AND_ASSIGN(auto ring, InitializeRing());
+
+  // Epoch 1: no user-visible signals, but both queue-owned rings must retire.
+  ReclaimEntryForNextEpoch(ring.get(), /*kernarg_write_position=*/64,
+                           /*queue_upload_write_position=*/256);
+  EXPECT_EQ(iree_hal_amdgpu_notification_ring_advance_epoch(ring.get()), 1u);
+
+  // Epoch 2: later kernargs but an earlier upload watermark.
+  ReclaimEntryForNextEpoch(ring.get(), /*kernarg_write_position=*/192,
+                           /*queue_upload_write_position=*/128);
+  EXPECT_EQ(iree_hal_amdgpu_notification_ring_advance_epoch(ring.get()), 2u);
+
+  SimulateCompletions(ring.get(), 2);
+  iree_hal_amdgpu_reclaim_positions_t reclaim_positions = {0};
+  EXPECT_EQ(
+      iree_hal_amdgpu_notification_ring_drain_reclaim_positions(
+          ring.get(), &kEmptyFrontier, nullptr, nullptr, &reclaim_positions),
+      0u);
+  EXPECT_EQ(reclaim_positions.kernarg_write_position, 192u);
+  EXPECT_EQ(reclaim_positions.queue_upload_write_position, 256u);
+}
+
 TEST_F(NotificationRingTest, KernargPositionReportingForZeroSignalEpochs) {
   IREE_ASSERT_OK_AND_ASSIGN(auto ring, InitializeRing());