Rollup of minor runtime fixes/cleanup from the AMDGPU branch. (#19621)

diff --git a/build_tools/cmake/iree_bitcode_library.cmake b/build_tools/cmake/iree_bitcode_library.cmake
index 4ae800d..63b0d03 100644
--- a/build_tools/cmake/iree_bitcode_library.cmake
+++ b/build_tools/cmake/iree_bitcode_library.cmake
@@ -62,7 +62,7 @@
     # Enable inline asm.
     "-fasm"
 
-    # Object file only in bitcode format:
+    # Object file only in bitcode format.
     "-c"
     "-emit-llvm"
 
@@ -175,7 +175,7 @@
     # Optimized and unstamped.
     "-O3"
 
-    # Object file only in bitcode format:
+    # Object file only in bitcode format.
     "-c"
     "-emit-llvm"
   )
@@ -239,7 +239,6 @@
 #       supported.
 # COPTS: Additional flags to pass to clang.
 # OUT: Output file name. Defaults to {source.c}.{gpu_arch}.bc.
-#
 function(iree_amdgpu_bitcode_library)
   cmake_parse_arguments(
     _RULE
diff --git a/build_tools/cmake/iree_hal_cts_test_suite.cmake b/build_tools/cmake/iree_hal_cts_test_suite.cmake
index ce229cb..7780922 100644
--- a/build_tools/cmake/iree_hal_cts_test_suite.cmake
+++ b/build_tools/cmake/iree_hal_cts_test_suite.cmake
@@ -19,10 +19,12 @@
 #       other parameters.
 #   DRIVER_REGISTRATION_HDR: The C #include path for `DRIVER_REGISTRATION_FN`.
 #   DRIVER_REGISTRATION_FN: The C function which registers `DRIVER_NAME`.
-#   COMPILER_TARGET_BACKEND: Optional target backend name to pass to the
-#       `-iree-hal-target-backends` option of `iree-compile` to use for
+#   COMPILER_TARGET_BACKEND: Optional target backend name used for
 #       executable generation. If this is omitted, or the associated compiler
 #       target is not enabled, tests which use executables will be disabled.
+#   COMPILER_TARGET_DEVICE: Optional target device name to pass to the
+#       `--iree-hal-target-device` option of `iree-compile` used for
+#       executable generation. If omitted the target backend name will be used.
 #   COMPILER_FLAGS: Additional compiler flags.
 #       Example: "--iree-llvmcpu-target-float-abi=hard --iree-llvmcpu-loop-unrolling"
 #   EXECUTABLE_FORMAT: Executable format identifier. Will be interpreted
@@ -47,7 +49,7 @@
   cmake_parse_arguments(
     _RULE
     ""
-    "DRIVER_NAME;VARIANT_SUFFIX;DRIVER_REGISTRATION_HDR;DRIVER_REGISTRATION_FN;COMPILER_TARGET_BACKEND;EXECUTABLE_FORMAT"
+    "DRIVER_NAME;VARIANT_SUFFIX;DRIVER_REGISTRATION_HDR;DRIVER_REGISTRATION_FN;COMPILER_TARGET_BACKEND;COMPILER_TARGET_DEVICE;EXECUTABLE_FORMAT"
     "DEPS;ARGS;COMPILER_FLAGS;INCLUDED_TESTS;EXCLUDED_TESTS;LABELS;"
     ${ARGN}
   )
@@ -84,10 +86,15 @@
 
     set(_TRANSLATE_FLAGS
       "--compile-mode=hal-executable"
-      "--iree-hal-target-backends=${_RULE_COMPILER_TARGET_BACKEND}"
       ${_RULE_COMPILER_FLAGS}
     )
 
+    if(DEFINED _RULE_COMPILER_TARGET_DEVICE)
+      list(APPEND _TRANSLATE_FLAGS "--iree-hal-target-device=${_RULE_COMPILER_TARGET_DEVICE}")
+    else()
+      list(APPEND _TRANSLATE_FLAGS "--iree-hal-target-backends=${_RULE_COMPILER_TARGET_BACKEND}")
+    endif()
+
     # Skip if already created (multiple suites using the same compiler setting).
     iree_package_name(_PACKAGE_NAME)
     if(NOT TARGET ${_PACKAGE_NAME}_${_EXECUTABLES_TESTDATA_NAME}_c)
@@ -183,6 +190,7 @@
     set(IREE_CTS_DRIVER_REGISTRATION_FN "${_RULE_DRIVER_REGISTRATION_FN}")
     set(IREE_CTS_DRIVER_NAME "${_RULE_DRIVER_NAME}")
     set(IREE_CTS_TARGET_BACKEND "${_RULE_COMPILER_TARGET_BACKEND}")
+    set(IREE_CTS_TARGET_DEVICE "${_RULE_COMPILER_TARGET_DEVICE}")
 
     configure_file(
       "${IREE_ROOT_DIR}/runtime/src/iree/hal/cts/cts_test_template.cc.in"
diff --git a/compiler/plugins/target/ROCM/ROCMTarget.cpp b/compiler/plugins/target/ROCM/ROCMTarget.cpp
index 851b6ed..8558f0b 100644
--- a/compiler/plugins/target/ROCM/ROCMTarget.cpp
+++ b/compiler/plugins/target/ROCM/ROCMTarget.cpp
@@ -665,7 +665,9 @@
       }
       int64_t ordinal = ordinalAttr.getInt();
 
-      auto symbolNameRef = builder.createString(exportOp.getName());
+      // Symbol names include a `.kd` suffix as that's what HSA expects.
+      auto symbolNameKd = (exportOp.getName() + ".kd").str();
+      auto symbolNameRef = builder.createString(symbolNameKd);
 
       iree_hal_amdgpu_Dims_t workgroupSize = {0};
       if (auto workgroupSizeAttr = exportOp.getWorkgroupSize()) {
diff --git a/runtime/src/iree/base/allocator.h b/runtime/src/iree/base/allocator.h
index 480e9ad..0d5f7a9 100644
--- a/runtime/src/iree/base/allocator.h
+++ b/runtime/src/iree/base/allocator.h
@@ -97,6 +97,18 @@
   return iree_make_byte_span((uint8_t*)span.data, span.data_length);
 }
 
+// Copies |size| bytes from |src| to |dst|, intended for streamed data that is
+// not read again. Currently a plain memcpy, so the ranges must not overlap.
+static inline void iree_memcpy_stream_dst(void* IREE_RESTRICT dst,
+                                          const void* IREE_RESTRICT src,
+                                          iree_host_size_t size) {
+  // TODO(benvanik): implement a proper non-temporal copy. This will be
+  // architecture-specific and may have compiler-specific paths in order to emit
+  // the proper instructions. On x64 this should be using MOVNTDQ (or something
+  // in that family).
+  memcpy(dst, src, size);
+}
+
 //===----------------------------------------------------------------------===//
 // Totally shady stack allocation
 //===----------------------------------------------------------------------===//
diff --git a/runtime/src/iree/base/assert.h b/runtime/src/iree/base/assert.h
index f6fd83e..0d6b4eb 100644
--- a/runtime/src/iree/base/assert.h
+++ b/runtime/src/iree/base/assert.h
@@ -57,7 +57,7 @@
 
 // Assertions enabled:
 
-#define IREE_ASSERT(condition, ...) assert(condition)
+#define IREE_ASSERT(condition, ...) assert(IREE_LIKELY(condition))
 
 // TODO(#2843): better logging of status assertions.
 // #define IREE_ASSERT_OK(status) IREE_ASSERT(iree_status_is_ok(status))
diff --git a/runtime/src/iree/base/internal/arena.c b/runtime/src/iree/base/internal/arena.c
index ba8f9e8..17f361d 100644
--- a/runtime/src/iree/base/internal/arena.c
+++ b/runtime/src/iree/base/internal/arena.c
@@ -41,6 +41,27 @@
   IREE_TRACE_ZONE_END(z0);
 }
 
+iree_status_t iree_arena_block_pool_preallocate(
+    iree_arena_block_pool_t* block_pool, iree_host_size_t count) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, count);
+
+  for (iree_host_size_t i = 0; i < count; ++i) {
+    uint8_t* block_base = NULL;
+    IREE_RETURN_AND_END_ZONE_IF_ERROR(
+        z0, iree_allocator_malloc_uninitialized(block_pool->block_allocator,
+                                                block_pool->total_block_size,
+                                                (void**)&block_base));
+    iree_arena_block_t* block =
+        iree_arena_block_trailer(block_pool, block_base);
+    iree_atomic_arena_block_slist_concat(&block_pool->available_slist, block,
+                                         block);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
+
 void iree_arena_block_pool_trim(iree_arena_block_pool_t* block_pool) {
   IREE_TRACE_ZONE_BEGIN(z0);
 
@@ -125,6 +146,7 @@
     } while (head);
     arena->allocation_head = NULL;
   }
+
   if (arena->block_head != NULL) {
 #if defined(IREE_SANITIZER_ADDRESS)
     iree_arena_block_t* block = arena->block_head;
@@ -141,6 +163,10 @@
     arena->block_tail = NULL;
   }
 
+  arena->total_allocation_size = 0;
+  arena->used_allocation_size = 0;
+  arena->block_bytes_remaining = 0;
+
   IREE_TRACE_ZONE_END(z0);
 }
 
diff --git a/runtime/src/iree/base/internal/arena.h b/runtime/src/iree/base/internal/arena.h
index 199e3fc..ad252b5 100644
--- a/runtime/src/iree/base/internal/arena.h
+++ b/runtime/src/iree/base/internal/arena.h
@@ -77,6 +77,10 @@
 // back to it.
 void iree_arena_block_pool_deinitialize(iree_arena_block_pool_t* block_pool);
 
+// Preallocates |count| blocks and adds them to the pool free list.
+iree_status_t iree_arena_block_pool_preallocate(
+    iree_arena_block_pool_t* block_pool, iree_host_size_t count);
+
 // Trims the pool by freeing unused blocks back to the allocator.
 // Acquired blocks are not freed and remain valid.
 void iree_arena_block_pool_trim(iree_arena_block_pool_t* block_pool);
diff --git a/runtime/src/iree/base/string_view.h b/runtime/src/iree/base/string_view.h
index 01e243c..048b468 100644
--- a/runtime/src/iree/base/string_view.h
+++ b/runtime/src/iree/base/string_view.h
@@ -89,6 +89,20 @@
   return v;
 }
 
+// A list of string key-value pairs.
+typedef struct iree_string_pair_list_t {
+  // Total number of pairs in the list.
+  iree_host_size_t count;
+  // Pair list or NULL if no pairs.
+  const iree_string_pair_t* pairs;
+} iree_string_pair_list_t;
+
+// Returns an empty string pair list.
+static inline iree_string_pair_list_t iree_string_pair_list_empty(void) {
+  iree_string_pair_list_t v = {0, NULL};
+  return v;
+}
+
 #define iree_string_view_literal(str) \
   { .data = (str), .size = IREE_ARRAYSIZE(str) - 1 }
 
@@ -106,6 +120,12 @@
   const iree_string_view_t* values;
 } iree_string_view_list_t;
 
+// Returns an empty string list.
+static inline iree_string_view_list_t iree_string_view_list_empty(void) {
+  iree_string_view_list_t v = {0, NULL};
+  return v;
+}
+
 // Returns true if the two strings are equal (compare == 0).
 IREE_API_EXPORT bool iree_string_view_equal(iree_string_view_t lhs,
                                             iree_string_view_t rhs);
diff --git a/runtime/src/iree/base/time.h b/runtime/src/iree/base/time.h
index 89cad70..aaaa85e 100644
--- a/runtime/src/iree/base/time.h
+++ b/runtime/src/iree/base/time.h
@@ -173,6 +173,14 @@
              : iree_relative_timeout_to_deadline_ns(timeout.nanos);
 }
 
+// Returns a relative timeout duration in nanoseconds from the given timeout.
+static inline iree_duration_t iree_timeout_as_duration_ns(
+    iree_timeout_t timeout) {
+  return timeout.type == IREE_TIMEOUT_ABSOLUTE
+             ? iree_absolute_deadline_to_timeout_ns(timeout.nanos)
+             : timeout.nanos;
+}
+
 // Returns the earliest timeout between |lhs| and |rhs|.
 static inline iree_timeout_t iree_timeout_min(iree_timeout_t lhs,
                                               iree_timeout_t rhs) {
diff --git a/runtime/src/iree/base/tracing.h b/runtime/src/iree/base/tracing.h
index e0c339a..c0fbed9 100644
--- a/runtime/src/iree/base/tracing.h
+++ b/runtime/src/iree/base/tracing.h
@@ -165,14 +165,14 @@
 //===----------------------------------------------------------------------===//
 
 // Matches Tracy's PlotFormatType enum.
-enum {
+typedef enum {
   // Values will be displayed as plain numbers.
   IREE_TRACING_PLOT_TYPE_NUMBER = 0,
   // Treats the values as memory sizes. Will display kilobytes, megabytes, etc.
   IREE_TRACING_PLOT_TYPE_MEMORY = 1,
   // Values will be displayed as percentage with value 100 being equal to 100%.
   IREE_TRACING_PLOT_TYPE_PERCENTAGE = 2,
-};
+} iree_tracing_plot_type_t;
 
 // Colors used for messages based on the level provided to the macro.
 enum {
diff --git a/runtime/src/iree/hal/command_buffer.h b/runtime/src/iree/hal/command_buffer.h
index e6523f2..82fdd61 100644
--- a/runtime/src/iree/hal/command_buffer.h
+++ b/runtime/src/iree/hal/command_buffer.h
@@ -443,10 +443,15 @@
 
 // An RGBA color.
 typedef struct iree_hal_label_color_t {
-  uint8_t r;
-  uint8_t g;
-  uint8_t b;
-  uint8_t a;
+  union {
+    struct {
+      uint8_t r;
+      uint8_t g;
+      uint8_t b;
+      uint8_t a;
+    };
+    uint32_t value;
+  };
 } iree_hal_label_color_t;
 
 // A source location attached to debug labels.
@@ -457,7 +462,7 @@
 
 // An unspecified color; debugging tools are to choose their own.
 static inline iree_hal_label_color_t iree_hal_label_color_unspecified() {
-  iree_hal_label_color_t color = {0, 0, 0, 0};
+  iree_hal_label_color_t color = {{{0, 0, 0, 0}}};
   return color;
 }
 
diff --git a/runtime/src/iree/hal/cts/cts_test_template.cc.in b/runtime/src/iree/hal/cts/cts_test_template.cc.in
index 15fb53f..15ad6cc 100644
--- a/runtime/src/iree/hal/cts/cts_test_template.cc.in
+++ b/runtime/src/iree/hal/cts/cts_test_template.cc.in
@@ -12,6 +12,7 @@
 #cmakedefine IREE_CTS_EXECUTABLE_FORMAT @IREE_CTS_EXECUTABLE_FORMAT@
 #cmakedefine IREE_CTS_EXECUTABLES_TESTDATA_HDR "@IREE_CTS_EXECUTABLES_TESTDATA_HDR@"
 #cmakedefine IREE_CTS_TARGET_BACKEND "@IREE_CTS_TARGET_BACKEND@"
+#cmakedefine IREE_CTS_TARGET_DEVICE "@IREE_CTS_TARGET_DEVICE@"
 
 // clang-format on
 
diff --git a/runtime/src/iree/hal/cts/driver_test.h b/runtime/src/iree/hal/cts/driver_test.h
index 515244f..5c276c0 100644
--- a/runtime/src/iree/hal/cts/driver_test.h
+++ b/runtime/src/iree/hal/cts/driver_test.h
@@ -139,15 +139,17 @@
   std::cout << "Driver has " << device_info_count << " device(s)\n";
   if (device_info_count == 0) GTEST_SKIP() << "No available devices";
 
-  // Check creation via empty path.
-  iree_string_view_t name = device_infos[0].name;
-  CheckCreateDeviceViaPath(device_infos[0].name, iree_string_view_empty());
-
-  // Check creation via index path.
+  // Check creation via explicit path.
+  bool tested_empty_path = false;
   for (iree_host_size_t i = 0; i < device_info_count; ++i) {
-    char index[8];
-    snprintf(index, 8, "%d", i);
-    CheckCreateDeviceViaPath(device_infos[i].name, IREE_SV(index));
+    tested_empty_path |= iree_string_view_is_empty(device_infos[i].path);
+    CheckCreateDeviceViaPath(device_infos[i].name, device_infos[i].path);
+  }
+
+  // Check creation via empty path if we didn't already.
+  if (!tested_empty_path) {
+    iree_string_view_t name = device_infos[0].name;
+    CheckCreateDeviceViaPath(name, iree_string_view_empty());
   }
 
   iree_allocator_free(iree_allocator_system(), device_infos);
diff --git a/runtime/src/iree/hal/drivers/null/api.h b/runtime/src/iree/hal/drivers/null/api.h
index ad47173..2ae6ecd 100644
--- a/runtime/src/iree/hal/drivers/null/api.h
+++ b/runtime/src/iree/hal/drivers/null/api.h
@@ -27,11 +27,11 @@
   int reserved;
 } iree_hal_null_device_options_t;
 
-// Initializes |out_params| to default values.
+// Initializes |out_options| to default values.
 IREE_API_EXPORT void iree_hal_null_device_options_initialize(
-    iree_hal_null_device_options_t* out_params);
+    iree_hal_null_device_options_t* out_options);
 
-// Creates a {Null} HAL device with the given |params|.
+// Creates a {Null} HAL device with the given |options|.
 //
 // The provided |identifier| will be used by programs to distinguish the device
 // type from other HAL implementations. If compiling programs with the IREE
diff --git a/runtime/src/iree/hal/drivers/null/device.c b/runtime/src/iree/hal/drivers/null/device.c
index d91c23e..937b213 100644
--- a/runtime/src/iree/hal/drivers/null/device.c
+++ b/runtime/src/iree/hal/drivers/null/device.c
@@ -18,6 +18,26 @@
 #include "iree/hal/utils/file_transfer.h"
 
 //===----------------------------------------------------------------------===//
+// iree_hal_null_device_options_t
+//===----------------------------------------------------------------------===//
+
+IREE_API_EXPORT void iree_hal_null_device_options_initialize(
+    iree_hal_null_device_options_t* out_options) {
+  memset(out_options, 0, sizeof(*out_options));
+  // TODO(null): set defaults based on compiler configuration. Flags should not
+  // be used as multiple devices may be configured within the process or the
+  // hosting application may be authored in python/etc that does not use a flags
+  // mechanism accessible here.
+}
+
+static iree_status_t iree_hal_null_device_options_verify(
+    const iree_hal_null_device_options_t* options) {
+  // TODO(null): verify that the parameters are within expected ranges and any
+  // requested features are supported.
+  return iree_ok_status();
+}
+
+//===----------------------------------------------------------------------===//
 // iree_hal_null_device_t
 //===----------------------------------------------------------------------===//
 
@@ -42,22 +62,6 @@
   return (iree_hal_null_device_t*)base_value;
 }
 
-void iree_hal_null_device_options_initialize(
-    iree_hal_null_device_options_t* out_options) {
-  memset(out_options, 0, sizeof(*out_options));
-  // TODO(null): set defaults based on compiler configuration. Flags should not
-  // be used as multiple devices may be configured within the process or the
-  // hosting application may be authored in python/etc that does not use a flags
-  // mechanism accessible here.
-}
-
-static iree_status_t iree_hal_null_device_options_verify(
-    const iree_hal_null_device_options_t* options) {
-  // TODO(null): verify that the parameters are within expected ranges and any
-  // requested features are supported.
-  return iree_ok_status();
-}
-
 iree_status_t iree_hal_null_device_create(
     iree_string_view_t identifier,
     const iree_hal_null_device_options_t* options,
diff --git a/runtime/src/iree/hal/drivers/null/driver.c b/runtime/src/iree/hal/drivers/null/driver.c
index 78cf511..d9fd2a3 100644
--- a/runtime/src/iree/hal/drivers/null/driver.c
+++ b/runtime/src/iree/hal/drivers/null/driver.c
@@ -9,6 +9,30 @@
 #include "iree/hal/drivers/null/api.h"
 
 //===----------------------------------------------------------------------===//
+// iree_hal_null_driver_options_t
+//===----------------------------------------------------------------------===//
+
+IREE_API_EXPORT void iree_hal_null_driver_options_initialize(
+    iree_hal_null_driver_options_t* out_options) {
+  memset(out_options, 0, sizeof(*out_options));
+
+  // TODO(null): set defaults based on compiler configuration. Flags should not
+  // be used as multiple devices may be configured within the process or the
+  // hosting application may be authored in python/etc that does not use a flags
+  // mechanism accessible here.
+
+  iree_hal_null_device_options_initialize(&out_options->default_device_options);
+}
+
+static iree_status_t iree_hal_null_driver_options_verify(
+    const iree_hal_null_driver_options_t* options) {
+  // TODO(null): verify that the parameters are within expected ranges and any
+  // requested features are supported.
+
+  return iree_ok_status();
+}
+
+//===----------------------------------------------------------------------===//
 // iree_hal_null_driver_t
 //===----------------------------------------------------------------------===//
 
@@ -35,26 +59,6 @@
   return (iree_hal_null_driver_t*)base_value;
 }
 
-void iree_hal_null_driver_options_initialize(
-    iree_hal_null_driver_options_t* out_options) {
-  memset(out_options, 0, sizeof(*out_options));
-
-  // TODO(null): set defaults based on compiler configuration. Flags should not
-  // be used as multiple devices may be configured within the process or the
-  // hosting application may be authored in python/etc that does not use a flags
-  // mechanism accessible here.
-
-  iree_hal_null_device_options_initialize(&out_options->default_device_options);
-}
-
-static iree_status_t iree_hal_null_driver_options_verify(
-    const iree_hal_null_driver_options_t* options) {
-  // TODO(null): verify that the parameters are within expected ranges and any
-  // requested features are supported.
-
-  return iree_ok_status();
-}
-
 IREE_API_EXPORT iree_status_t iree_hal_null_driver_create(
     iree_string_view_t identifier,
     const iree_hal_null_driver_options_t* options,
diff --git a/runtime/src/iree/hal/queue.h b/runtime/src/iree/hal/queue.h
index 4e54e0b..b52627e 100644
--- a/runtime/src/iree/hal/queue.h
+++ b/runtime/src/iree/hal/queue.h
@@ -36,6 +36,60 @@
 #define IREE_HAL_QUEUE_AFFINITY_ANY ((iree_hal_queue_affinity_t)(-1))
 #define IREE_HAL_MAX_QUEUES (sizeof(iree_hal_queue_affinity_t) * 8)
 
+// Returns true if the |queue_affinity| is empty (none specified).
+#define iree_hal_queue_affinity_is_empty(queue_affinity) ((queue_affinity) == 0)
+
+// Returns true if the |queue_affinity| is indicating any/all queues.
+#define iree_hal_queue_affinity_is_any(queue_affinity) \
+  ((queue_affinity) == IREE_HAL_QUEUE_AFFINITY_ANY)
+
+// Returns the total number of queues specified in the |queue_affinity| mask.
+#define iree_hal_queue_affinity_count(queue_affinity) \
+  iree_math_count_ones_u64(queue_affinity)
+
+// Returns the index of the first set bit in |queue_affinity|.
+// Requires that at least one bit be set.
+#define iree_hal_queue_affinity_find_first_set(queue_affinity) \
+  iree_math_count_trailing_zeros_u64(queue_affinity)
+
+// Logically shifts the queue affinity to the right by the given amount.
+#define iree_hal_queue_affinity_shr(queue_affinity, amount) \
+  iree_shr((queue_affinity), (amount))
+
+// Updates |inout_affinity| to only include those bits set in |mask_affinity|.
+#define iree_hal_queue_affinity_and_into(inout_affinity, mask_affinity) \
+  (inout_affinity) = ((inout_affinity) & (mask_affinity))
+
+// Updates |inout_affinity| to include bits set in |mask_affinity|.
+#define iree_hal_queue_affinity_or_into(inout_affinity, mask_affinity) \
+  (inout_affinity) = ((inout_affinity) | (mask_affinity))
+
+// Loops over each queue in the given |queue_affinity| bitmap.
+//
+// The following variables are available within the loop:
+//     queue_count: total number of queues used
+//     queue_index: loop index (0 to queue_count)
+//   queue_ordinal: queue ordinal (0 to the total number of queues)
+//
+// Example:
+//  IREE_HAL_FOR_QUEUE_AFFINITY(my_queue_affinity) {
+//    compact_queue_list[queue_index];     // 0 to my_queue_affinity count
+//    full_queue_list[queue_ordinal];      // 0 to available queues
+//  }
+#define IREE_HAL_FOR_QUEUE_AFFINITY(queue_affinity)                           \
+  iree_hal_queue_affinity_t _queue_bits = (queue_affinity);                   \
+  for (int queue_index = 0, _queue_ordinal_base = 0,                          \
+           queue_count = iree_hal_queue_affinity_count(_queue_bits),          \
+           _bit_offset =                                                      \
+               iree_hal_queue_affinity_find_first_set(_queue_bits),           \
+           queue_ordinal = _bit_offset;                                       \
+       queue_index < queue_count;                                             \
+       ++queue_index, _queue_ordinal_base += _bit_offset + 1,                 \
+           _queue_bits =                                                      \
+               iree_hal_queue_affinity_shr(_queue_bits, _bit_offset + 1),     \
+           _bit_offset = iree_hal_queue_affinity_find_first_set(_queue_bits), \
+           queue_ordinal = _queue_ordinal_base + _bit_offset)
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/runtime/src/iree/hal/semaphore.h b/runtime/src/iree/hal/semaphore.h
index 52571ed..5320666 100644
--- a/runtime/src/iree/hal/semaphore.h
+++ b/runtime/src/iree/hal/semaphore.h
@@ -102,7 +102,8 @@
 }
 
 // Frees an iree_status_t encoded in a semaphore |value|, if any.
-static inline void iree_hal_semaphore_failure_free(uint64_t value) {
+IREE_ATTRIBUTE_ALWAYS_INLINE static inline void iree_hal_semaphore_failure_free(
+    uint64_t value) {
   if (value & IREE_HAL_SEMAPHORE_FAILURE_VALUE_STATUS_BIT) {
     iree_status_free((iree_status_t)(((int64_t)value << 1) >> 1));
   }
diff --git a/runtime/src/iree/hal/utils/fd_file.c b/runtime/src/iree/hal/utils/fd_file.c
index 6941e2e..a502c1b 100644
--- a/runtime/src/iree/hal/utils/fd_file.c
+++ b/runtime/src/iree/hal/utils/fd_file.c
@@ -223,17 +223,16 @@
   // Verify that the requested access can be satisfied.
   if (iree_all_bits_set(access, IREE_HAL_MEMORY_ACCESS_READ) &&
       !iree_all_bits_set(allowed_access, IREE_HAL_MEMORY_ACCESS_READ)) {
-    IREE_RETURN_AND_END_ZONE_IF_ERROR(
-        z0,
-        iree_make_status(
-            IREE_STATUS_PERMISSION_DENIED,
-            "read access requested on a file descriptor that is not readable"));
+    IREE_TRACE_ZONE_END(z0);
+    return iree_make_status(
+        IREE_STATUS_PERMISSION_DENIED,
+        "read access requested on a file descriptor that is not readable");
   } else if (iree_all_bits_set(access, IREE_HAL_MEMORY_ACCESS_WRITE) &&
              !iree_all_bits_set(allowed_access, IREE_HAL_MEMORY_ACCESS_WRITE)) {
-    IREE_RETURN_AND_END_ZONE_IF_ERROR(
-        z0, iree_make_status(IREE_STATUS_PERMISSION_DENIED,
-                             "write access requested on a file descriptor that "
-                             "is not writable"));
+    IREE_TRACE_ZONE_END(z0);
+    return iree_make_status(IREE_STATUS_PERMISSION_DENIED,
+                            "write access requested on a file descriptor that "
+                            "is not writable");
   }
 
   // Allocate object that retains the underlying file handle and our opened
diff --git a/runtime/src/iree/io/formats/irpa/irpa_builder.c b/runtime/src/iree/io/formats/irpa/irpa_builder.c
index 330691f..e1459e8 100644
--- a/runtime/src/iree/io/formats/irpa/irpa_builder.c
+++ b/runtime/src/iree/io/formats/irpa/irpa_builder.c
@@ -189,10 +189,10 @@
         break;
       }
       default: {
-        IREE_RETURN_AND_END_ZONE_IF_ERROR(
-            z0, iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
-                                 "unhandled entry type %d",
-                                 (int)source_entry->type));
+        IREE_TRACE_ZONE_END(z0);
+        return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                                "unhandled entry type %d",
+                                (int)source_entry->type);
       }
     }
 
diff --git a/runtime/src/iree/io/memory_stream.c b/runtime/src/iree/io/memory_stream.c
index 84043c0..f16d300 100644
--- a/runtime/src/iree/io/memory_stream.c
+++ b/runtime/src/iree/io/memory_stream.c
@@ -199,15 +199,14 @@
       z0, iree_io_stream_validate_fixed_range(stream->offset, stream->length,
                                               buffer_capacity, &read_length));
   if (!out_buffer_length && read_length != buffer_capacity) {
-    IREE_RETURN_AND_END_ZONE_IF_ERROR(
-        z0,
-        iree_make_status(IREE_STATUS_OUT_OF_RANGE,
-                         "read of range [%" PRIu64 ", %" PRIu64 ") (%" PRIu64
-                         " bytes) out of range; stream offset %" PRIu64
-                         " and length %" PRIu64 " insufficient",
-                         stream->offset, stream->offset + buffer_capacity,
-                         (iree_io_stream_pos_t)buffer_capacity, stream->offset,
-                         stream->length));
+    IREE_TRACE_ZONE_END(z0);
+    return iree_make_status(IREE_STATUS_OUT_OF_RANGE,
+                            "read of range [%" PRIu64 ", %" PRIu64 ") (%" PRIu64
+                            " bytes) out of range; stream offset %" PRIu64
+                            " and length %" PRIu64 " insufficient",
+                            stream->offset, stream->offset + buffer_capacity,
+                            (iree_io_stream_pos_t)buffer_capacity,
+                            stream->offset, stream->length);
   }
 
   memcpy(buffer, stream->contents + stream->offset,
diff --git a/runtime/src/iree/schemas/amdgpu_executable_def.fbs b/runtime/src/iree/schemas/amdgpu_executable_def.fbs
index 43efdb0..7c0510c 100644
--- a/runtime/src/iree/schemas/amdgpu_executable_def.fbs
+++ b/runtime/src/iree/schemas/amdgpu_executable_def.fbs
@@ -28,6 +28,7 @@
 // Information about an exported function on the executable.
 table ExportDef {
   // String name of the exported function symbol in the module.
+  // Includes a `.kd` suffix as that's what HSA expects.
   symbol_name:string;
 
   // Workgroup size for the export.
diff --git a/runtime/src/iree/tooling/context_util.c b/runtime/src/iree/tooling/context_util.c
index f088ee8..8dbd959 100644
--- a/runtime/src/iree/tooling/context_util.c
+++ b/runtime/src/iree/tooling/context_util.c
@@ -66,10 +66,10 @@
     } else if (strcmp(FLAG_module_mode, "preload") == 0) {
       read_flags |= IREE_FILE_READ_FLAG_PRELOAD;
     } else {
-      IREE_RETURN_AND_END_ZONE_IF_ERROR(
-          z0, iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
-                               "unrecognized --module_mode= value '%s'",
-                               FLAG_module_mode));
+      IREE_TRACE_ZONE_END(z0);
+      return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                              "unrecognized --module_mode= value '%s'",
+                              FLAG_module_mode);
     }
     IREE_RETURN_AND_END_ZONE_IF_ERROR(
         z0, iree_file_read_contents(path_str, read_flags, host_allocator,