Updating VMVX to use per-worker contexts and support workgroup state.
Previously one context was shared by all workers, which made it
impossible to safely support rwdata inside the modules. Now that each
worker has a dedicated context, workgroup information can be stored
directly on the VMVX module during execution, where it is available to
anything VMVX uses to run work. This does have a memory cost when
multithreading, but it's on the order of 128B/worker. This required
plumbing the expected worker count through local executables, but that's
likely to be useful with other loaders in the future (wasm/etc).
diff --git a/runtime/src/iree/hal/drivers/local_sync/sync_device.c b/runtime/src/iree/hal/drivers/local_sync/sync_device.c
index 89cd155..633df84 100644
--- a/runtime/src/iree/hal/drivers/local_sync/sync_device.c
+++ b/runtime/src/iree/hal/drivers/local_sync/sync_device.c
@@ -231,7 +231,7 @@
     iree_loop_t loop, iree_hal_executable_cache_t** out_executable_cache) {
   iree_hal_sync_device_t* device = iree_hal_sync_device_cast(base_device);
   return iree_hal_local_executable_cache_create(
-      identifier, device->loader_count, device->loaders,
+      identifier, /*worker_capacity=*/1, device->loader_count, device->loaders,
       iree_hal_device_host_allocator(base_device), out_executable_cache);
 }
 
diff --git a/runtime/src/iree/hal/drivers/local_task/task_command_buffer.c b/runtime/src/iree/hal/drivers/local_task/task_command_buffer.c
index da2f628..da96b12 100644
--- a/runtime/src/iree/hal/drivers/local_task/task_command_buffer.c
+++ b/runtime/src/iree/hal/drivers/local_task/task_command_buffer.c
@@ -848,7 +848,8 @@
           .local_memory_size = (size_t)tile_context->local_memory.data_length,
       };
   iree_status_t status = iree_hal_local_executable_issue_call(
-      cmd->executable, cmd->ordinal, &dispatch_state, &workgroup_state);
+      cmd->executable, cmd->ordinal, &dispatch_state, &workgroup_state,
+      tile_context->worker_id);
 
   IREE_TRACE_ZONE_END(z0);
   return status;
diff --git a/runtime/src/iree/hal/drivers/local_task/task_device.c b/runtime/src/iree/hal/drivers/local_task/task_device.c
index 27daf97..eade286 100644
--- a/runtime/src/iree/hal/drivers/local_task/task_device.c
+++ b/runtime/src/iree/hal/drivers/local_task/task_device.c
@@ -272,7 +272,8 @@
     iree_loop_t loop, iree_hal_executable_cache_t** out_executable_cache) {
   iree_hal_task_device_t* device = iree_hal_task_device_cast(base_device);
   return iree_hal_local_executable_cache_create(
-      identifier, device->loader_count, device->loaders,
+      identifier, iree_task_executor_worker_count(device->executor),
+      device->loader_count, device->loaders,
       iree_hal_device_host_allocator(base_device), out_executable_cache);
 }
 
diff --git a/runtime/src/iree/hal/local/executable_library_benchmark.c b/runtime/src/iree/hal/local/executable_library_benchmark.c
index 5ee322f..f08dfa2 100644
--- a/runtime/src/iree/hal/local/executable_library_benchmark.c
+++ b/runtime/src/iree/hal/local/executable_library_benchmark.c
@@ -171,8 +171,9 @@
   // Perform the load, which will fail if the executable cannot be loaded or
   // there was an issue with the layouts.
   iree_hal_executable_t* executable = NULL;
-  IREE_RETURN_IF_ERROR(iree_hal_executable_loader_try_load(
-      executable_loader, &executable_params, &executable));
+  IREE_RETURN_IF_ERROR(
+      iree_hal_executable_loader_try_load(executable_loader, &executable_params,
+                                          /*worker_capacity=*/1, &executable));
   iree_hal_local_executable_t* local_executable =
       iree_hal_local_executable_cast(executable);
 
diff --git a/runtime/src/iree/hal/local/executable_loader.c b/runtime/src/iree/hal/local/executable_loader.c
index a159cc0..955572b 100644
--- a/runtime/src/iree/hal/local/executable_loader.c
+++ b/runtime/src/iree/hal/local/executable_loader.c
@@ -87,7 +87,7 @@
 iree_status_t iree_hal_executable_loader_try_load(
     iree_hal_executable_loader_t* executable_loader,
     const iree_hal_executable_params_t* executable_params,
-    iree_hal_executable_t** out_executable) {
+    iree_host_size_t worker_capacity, iree_hal_executable_t** out_executable) {
   IREE_ASSERT_ARGUMENT(executable_loader);
   IREE_ASSERT_ARGUMENT(executable_params);
   IREE_ASSERT_ARGUMENT(!executable_params->pipeline_layout_count ||
@@ -95,6 +95,6 @@
   IREE_ASSERT_ARGUMENT(!executable_params->executable_data.data_length ||
                        executable_params->executable_data.data);
   IREE_ASSERT_ARGUMENT(out_executable);
-  return executable_loader->vtable->try_load(executable_loader,
-                                             executable_params, out_executable);
+  return executable_loader->vtable->try_load(
+      executable_loader, executable_params, worker_capacity, out_executable);
 }
diff --git a/runtime/src/iree/hal/local/executable_loader.h b/runtime/src/iree/hal/local/executable_loader.h
index ae8f6dc..426d6a4 100644
--- a/runtime/src/iree/hal/local/executable_loader.h
+++ b/runtime/src/iree/hal/local/executable_loader.h
@@ -122,7 +122,7 @@
 iree_status_t iree_hal_executable_loader_try_load(
     iree_hal_executable_loader_t* executable_loader,
     const iree_hal_executable_params_t* executable_params,
-    iree_hal_executable_t** out_executable);
+    iree_host_size_t worker_capacity, iree_hal_executable_t** out_executable);
 
 //===----------------------------------------------------------------------===//
 // iree_hal_executable_loader_t implementation details
@@ -139,7 +139,7 @@
   iree_status_t(IREE_API_PTR* try_load)(
       iree_hal_executable_loader_t* executable_loader,
       const iree_hal_executable_params_t* executable_params,
-      iree_hal_executable_t** out_executable);
+      iree_host_size_t worker_capacity, iree_hal_executable_t** out_executable);
 } iree_hal_executable_loader_vtable_t;
 
 #ifdef __cplusplus
diff --git a/runtime/src/iree/hal/local/loaders/embedded_elf_loader.c b/runtime/src/iree/hal/local/loaders/embedded_elf_loader.c
index 31bda30..ddcc1f4 100644
--- a/runtime/src/iree/hal/local/loaders/embedded_elf_loader.c
+++ b/runtime/src/iree/hal/local/loaders/embedded_elf_loader.c
@@ -246,7 +246,8 @@
 static iree_status_t iree_hal_elf_executable_issue_call(
     iree_hal_local_executable_t* base_executable, iree_host_size_t ordinal,
     const iree_hal_executable_dispatch_state_v0_t* dispatch_state,
-    const iree_hal_executable_workgroup_state_v0_t* workgroup_state) {
+    const iree_hal_executable_workgroup_state_v0_t* workgroup_state,
+    uint32_t worker_id) {
   iree_hal_elf_executable_t* executable =
       (iree_hal_elf_executable_t*)base_executable;
   const iree_hal_executable_library_v0_t* library = executable->library.v0;
@@ -369,7 +370,7 @@
 static iree_status_t iree_hal_embedded_elf_loader_try_load(
     iree_hal_executable_loader_t* base_executable_loader,
     const iree_hal_executable_params_t* executable_params,
-    iree_hal_executable_t** out_executable) {
+    iree_host_size_t worker_capacity, iree_hal_executable_t** out_executable) {
   iree_hal_embedded_elf_loader_t* executable_loader =
       (iree_hal_embedded_elf_loader_t*)base_executable_loader;
   IREE_TRACE_ZONE_BEGIN(z0);
diff --git a/runtime/src/iree/hal/local/loaders/static_library_loader.c b/runtime/src/iree/hal/local/loaders/static_library_loader.c
index bad62ae..875acab 100644
--- a/runtime/src/iree/hal/local/loaders/static_library_loader.c
+++ b/runtime/src/iree/hal/local/loaders/static_library_loader.c
@@ -117,7 +117,8 @@
 static iree_status_t iree_hal_static_executable_issue_call(
     iree_hal_local_executable_t* base_executable, iree_host_size_t ordinal,
     const iree_hal_executable_dispatch_state_v0_t* dispatch_state,
-    const iree_hal_executable_workgroup_state_v0_t* workgroup_state) {
+    const iree_hal_executable_workgroup_state_v0_t* workgroup_state,
+    uint32_t worker_id) {
   iree_hal_static_executable_t* executable =
       (iree_hal_static_executable_t*)base_executable;
   const iree_hal_executable_library_v0_t* library = executable->library.v0;
@@ -286,7 +287,7 @@
 static iree_status_t iree_hal_static_library_loader_try_load(
     iree_hal_executable_loader_t* base_executable_loader,
     const iree_hal_executable_params_t* executable_params,
-    iree_hal_executable_t** out_executable) {
+    iree_host_size_t worker_capacity, iree_hal_executable_t** out_executable) {
   iree_hal_static_library_loader_t* executable_loader =
       (iree_hal_static_library_loader_t*)base_executable_loader;
 
diff --git a/runtime/src/iree/hal/local/loaders/system_library_loader.c b/runtime/src/iree/hal/local/loaders/system_library_loader.c
index 3711927..14d2c04 100644
--- a/runtime/src/iree/hal/local/loaders/system_library_loader.c
+++ b/runtime/src/iree/hal/local/loaders/system_library_loader.c
@@ -366,7 +366,8 @@
 static iree_status_t iree_hal_system_executable_issue_call(
     iree_hal_local_executable_t* base_executable, iree_host_size_t ordinal,
     const iree_hal_executable_dispatch_state_v0_t* dispatch_state,
-    const iree_hal_executable_workgroup_state_v0_t* workgroup_state) {
+    const iree_hal_executable_workgroup_state_v0_t* workgroup_state,
+    uint32_t worker_id) {
   iree_hal_system_executable_t* executable =
       (iree_hal_system_executable_t*)base_executable;
   const iree_hal_executable_library_v0_t* library = executable->library.v0;
@@ -499,7 +500,7 @@
 static iree_status_t iree_hal_system_library_loader_try_load(
     iree_hal_executable_loader_t* base_executable_loader,
     const iree_hal_executable_params_t* executable_params,
-    iree_hal_executable_t** out_executable) {
+    iree_host_size_t worker_capacity, iree_hal_executable_t** out_executable) {
   iree_hal_system_library_loader_t* executable_loader =
       (iree_hal_system_library_loader_t*)base_executable_loader;
   IREE_TRACE_ZONE_BEGIN(z0);
diff --git a/runtime/src/iree/hal/local/loaders/vmvx_module_loader.c b/runtime/src/iree/hal/local/loaders/vmvx_module_loader.c
index 92307cf..8e0b3e4 100644
--- a/runtime/src/iree/hal/local/loaders/vmvx_module_loader.c
+++ b/runtime/src/iree/hal/local/loaders/vmvx_module_loader.c
@@ -18,55 +18,24 @@
 #include "iree/modules/vmvx/module.h"
 #include "iree/vm/bytecode_module.h"
 
-//===----------------------------------------------------------------------===//
-// iree_hal_vmvx_executable_t
-//===----------------------------------------------------------------------===//
-
 #define IREE_VMVX_ENTRY_SIGNATURE "0rrriiiiiiiii_v"
 
-typedef struct iree_hal_vmvx_executable_t {
-  iree_hal_local_executable_t base;
+// Index of the module in the context_modules list.
+// This should always be first so that it can be overridden by user modules.
+#define IREE_VMVX_MODULE_INDEX 0
 
-  // Context containing both the VMVX module and the loaded executable.
-  // This context may also contain custom user modules available for the
-  // generated VMVX modules to use.
-  iree_vm_context_t* context;
+//===----------------------------------------------------------------------===//
+// Built-in executable helpers
+//===----------------------------------------------------------------------===//
 
-  // Resolved entry function export ordinals from the bytecode module.
-  iree_vm_module_t* bytecode_module;
-  iree_host_size_t entry_fn_count;
-  uint16_t entry_fn_ordinals[];
-} iree_hal_vmvx_executable_t;
-
-static const iree_hal_local_executable_vtable_t iree_hal_vmvx_executable_vtable;
-
-// Verifies that an entry point function exported by the bytecode module matches
-// the calling convention we expect. This avoids the need to check it during
-// dispatch (where returning errors is hard and it'd be expensive).
-static iree_status_t iree_hal_vmvx_executable_verify_entry_point(
-    iree_vm_function_t* entry_fn) {
-  iree_vm_function_signature_t signature = iree_vm_function_signature(entry_fn);
-  if (!iree_string_view_equal(
-          signature.calling_convention,
-          iree_make_cstring_view(IREE_VMVX_ENTRY_SIGNATURE))) {
-    return iree_make_status(
-        IREE_STATUS_INVALID_ARGUMENT,
-        "executable entry point does not match the expected calling "
-        "convention; expected '" IREE_VMVX_ENTRY_SIGNATURE
-        "' but got '%.*s', possible ABI version mismatch",
-        (int)signature.calling_convention.size,
-        signature.calling_convention.data);
-  }
-  return iree_ok_status();
-}
-
-// Calls the __set_constants method on |executable| with the given |constants|.
-// We wrap the data in VM buffer and require that it is not retained by the
-// module; the constant values should be extracted and stored in globals.
-// Fails if the constant table is not of the required size.
+// Calls the __set_constants method in |bytecode_module| with the given
+// |constants|. We wrap the data in VM buffer and require that it is not
+// retained by the module; the constant values should be extracted and stored in
+// globals. Fails if the constant table is not of the required size.
 static iree_status_t iree_hal_vmvx_executable_set_constants(
-    iree_hal_vmvx_executable_t* executable, iree_vm_module_t* bytecode_module,
-    iree_host_size_t constant_count, const uint32_t* constants) {
+    iree_vm_context_t* context, iree_vm_module_t* bytecode_module,
+    iree_host_size_t constant_count, const uint32_t* constants,
+    iree_allocator_t host_allocator) {
   // Look for the exported function. If it's not present then no constants are
   // required and if it is then we must have at least one constant.
   iree_vm_function_t set_function;
@@ -119,10 +88,10 @@
 
   // Copy the executable constants into the module state.
   if (iree_status_is_ok(status)) {
-    status = iree_vm_invoke(executable->context, set_function,
+    status = iree_vm_invoke(context, set_function,
                             IREE_VM_INVOCATION_FLAG_TRACE_INLINE,
                             /*policy=*/NULL, inputs,
-                            /*outputs=*/NULL, executable->base.host_allocator);
+                            /*outputs=*/NULL, host_allocator);
   }
 
   // Inputs *must* be released here as we allocated it on the stack.
@@ -138,11 +107,120 @@
   return status;
 }
 
+//===----------------------------------------------------------------------===//
+// iree_hal_vmvx_worker_state_t
+//===----------------------------------------------------------------------===//
+
+typedef struct iree_hal_vmvx_worker_state_t {
+  // Context containing both the VMVX module and the loaded executable.
+  // This context may also contain custom user modules available for the
+  // generated VMVX modules to use.
+  iree_vm_context_t* context;
+
+  // Pointer into the VMVX module state for the worker context.
+  // This is used to update module state directly.
+  iree_vm_module_state_t* vmvx_module_state;
+} iree_hal_vmvx_worker_state_t;
+
+static iree_status_t iree_hal_vmvx_worker_state_initialize(
+    iree_vm_instance_t* instance, iree_host_size_t module_count,
+    iree_vm_module_t** modules, iree_vm_module_t* bytecode_module,
+    const iree_hal_executable_params_t* executable_params,
+    iree_allocator_t host_allocator, iree_hal_vmvx_worker_state_t* out_state) {
+  IREE_ASSERT_ARGUMENT(out_state);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  memset(out_state, 0, sizeof(*out_state));
+
+  // Create the context unique to this worker.
+  iree_vm_context_t* context = NULL;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_vm_context_create_with_modules(
+              instance, IREE_VM_CONTEXT_FLAG_NONE, module_count, modules,
+              host_allocator, &context));
+
+  // Fetch the VMVX module state so that we can quickly access it to set
+  // per-call state.
+  iree_vm_module_t* vmvx_module = modules[IREE_VMVX_MODULE_INDEX];
+  iree_vm_module_state_t* vmvx_module_state = NULL;
+  iree_status_t status = iree_vm_context_resolve_module_state(
+      context, vmvx_module, &vmvx_module_state);
+
+  // Set executable-level constants.
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_vmvx_executable_set_constants(
+        context, bytecode_module, executable_params->constant_count,
+        executable_params->constants, host_allocator);
+  }
+
+  if (iree_status_is_ok(status)) {
+    out_state->context = context;
+    out_state->vmvx_module_state = vmvx_module_state;
+  } else {
+    iree_vm_context_release(context);
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+static void iree_hal_vmvx_worker_state_deinitialize(
+    iree_hal_vmvx_worker_state_t* state) {
+  IREE_ASSERT_ARGUMENT(state);
+  IREE_TRACE_ZONE_BEGIN(z0);
+  if (state->context) {
+    iree_vm_context_release(state->context);
+    state->context = NULL;
+  }
+  IREE_TRACE_ZONE_END(z0);
+}
+
+//===----------------------------------------------------------------------===//
+// iree_hal_vmvx_executable_t
+//===----------------------------------------------------------------------===//
+
+typedef struct iree_hal_vmvx_executable_t {
+  iree_hal_local_executable_t base;
+
+  // Loaded VMVX module shared across all workers.
+  iree_vm_module_t* bytecode_module;
+
+  // Preallocated per-worker states that are used to emulate TLS.
+  iree_host_size_t worker_capacity;
+  iree_hal_vmvx_worker_state_t* worker_states;
+
+  // Resolved entry function export ordinals from the bytecode module.
+  iree_host_size_t entry_fn_count;
+  uint16_t entry_fn_ordinals[];
+} iree_hal_vmvx_executable_t;
+
+static const iree_hal_local_executable_vtable_t iree_hal_vmvx_executable_vtable;
+
+// Verifies that an entry point function exported by the bytecode module matches
+// the calling convention we expect. This avoids the need to check it during
+// dispatch (where returning errors is hard and it'd be expensive).
+static iree_status_t iree_hal_vmvx_executable_verify_entry_point(
+    iree_vm_function_t* entry_fn) {
+  iree_vm_function_signature_t signature = iree_vm_function_signature(entry_fn);
+  if (!iree_string_view_equal(
+          signature.calling_convention,
+          iree_make_cstring_view(IREE_VMVX_ENTRY_SIGNATURE))) {
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "executable entry point does not match the expected calling "
+        "convention; expected '" IREE_VMVX_ENTRY_SIGNATURE
+        "' but got '%.*s', possible ABI version mismatch",
+        (int)signature.calling_convention.size,
+        signature.calling_convention.data);
+  }
+  return iree_ok_status();
+}
+
 static iree_status_t iree_hal_vmvx_executable_create(
-    iree_vm_context_t* context, iree_vm_module_t* bytecode_module,
+    iree_vm_instance_t* instance, iree_host_size_t module_count,
+    iree_vm_module_t** modules, iree_vm_module_t* bytecode_module,
+    iree_host_size_t worker_capacity,
     const iree_hal_executable_params_t* executable_params,
     iree_allocator_t host_allocator, iree_hal_executable_t** out_executable) {
-  IREE_ASSERT_ARGUMENT(context);
+  IREE_ASSERT_ARGUMENT(instance);
   IREE_ASSERT_ARGUMENT(bytecode_module);
   IREE_ASSERT_ARGUMENT(executable_params);
   IREE_ASSERT_ARGUMENT(!executable_params->pipeline_layout_count ||
@@ -164,30 +242,40 @@
   }
 
   iree_hal_vmvx_executable_t* executable = NULL;
-  iree_host_size_t total_size =
-      sizeof(*executable) +
-      entry_count * sizeof(*executable->entry_fn_ordinals) +
-      entry_count * sizeof(*executable->base.dispatch_attrs) +
-      executable_params->pipeline_layout_count *
-          sizeof(iree_hal_pipeline_layout_t*);
+  const iree_host_size_t entry_fn_ordinals_size =
+      iree_host_align(entry_count * sizeof(*executable->entry_fn_ordinals), 8);
+  const iree_host_size_t dispatch_attrs_size = iree_host_align(
+      entry_count * sizeof(*executable->base.dispatch_attrs), 8);
+  const iree_host_size_t pipeline_layouts_size =
+      iree_host_align(executable_params->pipeline_layout_count *
+                          sizeof(iree_hal_pipeline_layout_t*),
+                      8);
+  const iree_host_size_t worker_states_size =
+      iree_host_align(worker_capacity * sizeof(*executable->worker_states), 8);
+  const iree_host_size_t total_size =
+      sizeof(*executable) + entry_fn_ordinals_size + dispatch_attrs_size +
+      pipeline_layouts_size + worker_states_size;
   iree_status_t status =
       iree_allocator_malloc(host_allocator, total_size, (void**)&executable);
   iree_hal_executable_dispatch_attrs_v0_t* dispatch_attrs = NULL;
   if (iree_status_is_ok(status)) {
-    uint8_t* ptr = (uint8_t*)executable + sizeof(*executable) +
-                   entry_count * sizeof(*executable->entry_fn_ordinals);
+    uint8_t* ptr =
+        (uint8_t*)executable + sizeof(*executable) + entry_fn_ordinals_size;
     dispatch_attrs = (iree_hal_executable_dispatch_attrs_v0_t*)ptr;
-    ptr += entry_count * sizeof(*executable->base.dispatch_attrs);
+    ptr += dispatch_attrs_size;
     iree_hal_pipeline_layout_t** pipeline_layouts_ptr =
         (iree_hal_pipeline_layout_t**)ptr;
+    ptr += pipeline_layouts_size;
     iree_hal_local_executable_initialize(
         &iree_hal_vmvx_executable_vtable,
         executable_params->pipeline_layout_count,
         executable_params->pipeline_layouts, pipeline_layouts_ptr,
         host_allocator, &executable->base);
-    executable->context = context;
     executable->base.dispatch_attrs = dispatch_attrs;
-    iree_vm_context_retain(executable->context);
+
+    executable->worker_capacity = worker_capacity;
+    executable->worker_states = (iree_hal_vmvx_worker_state_t*)ptr;
+    ptr += worker_states_size;
 
     executable->bytecode_module = bytecode_module;
     executable->entry_fn_count = entry_count;
@@ -227,11 +315,14 @@
     }
   }
 
-  // Provide executable constants to the module.
+  // Initialize a context per worker requested.
   if (iree_status_is_ok(status)) {
-    status = iree_hal_vmvx_executable_set_constants(
-        executable, bytecode_module, executable_params->constant_count,
-        executable_params->constants);
+    for (iree_host_size_t i = 0; i < worker_capacity; ++i) {
+      status = iree_hal_vmvx_worker_state_initialize(
+          instance, module_count, modules, bytecode_module, executable_params,
+          host_allocator, &executable->worker_states[i]);
+      if (!iree_status_is_ok(status)) break;
+    }
   }
 
   if (iree_status_is_ok(status)) {
@@ -250,7 +341,9 @@
   iree_allocator_t host_allocator = executable->base.host_allocator;
   IREE_TRACE_ZONE_BEGIN(z0);
 
-  iree_vm_context_release(executable->context);
+  for (iree_host_size_t i = 0; i < executable->worker_capacity; ++i) {
+    iree_hal_vmvx_worker_state_deinitialize(&executable->worker_states[i]);
+  }
   iree_hal_local_executable_deinitialize(
       (iree_hal_local_executable_t*)base_executable);
   iree_allocator_free(host_allocator, executable);
@@ -261,10 +354,12 @@
 static iree_status_t iree_hal_vmvx_executable_issue_call(
     iree_hal_local_executable_t* base_executable, iree_host_size_t ordinal,
     const iree_hal_executable_dispatch_state_v0_t* dispatch_state,
-    const iree_hal_executable_workgroup_state_v0_t* workgroup_state) {
+    const iree_hal_executable_workgroup_state_v0_t* workgroup_state,
+    uint32_t worker_id) {
   iree_hal_vmvx_executable_t* executable =
       (iree_hal_vmvx_executable_t*)base_executable;
 
+  // Map the export ordinal to the exported function in the bytecode module.
   if (IREE_UNLIKELY(ordinal >= executable->entry_fn_count)) {
     return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
                             "entry point ordinal out of bounds");
@@ -275,14 +370,16 @@
       .ordinal = executable->entry_fn_ordinals[ordinal],
   };
 
-#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
-  iree_string_view_t entry_point_name = iree_vm_function_name(&entry_fn);
-  if (iree_string_view_is_empty(entry_point_name)) {
-    entry_point_name = iree_make_cstring_view("unknown_vmvx_call");
+  // Fetch worker-local state. This caller is the only one able to access it so
+  // no synchronization is required.
+  if (IREE_UNLIKELY(worker_id >= executable->worker_capacity)) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "worker_id out of bounds");
   }
-  IREE_TRACE_ZONE_BEGIN_NAMED_DYNAMIC(z0, entry_point_name.data,
-                                      entry_point_name.size);
-#endif  // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+  iree_hal_vmvx_worker_state_t* worker_state =
+      &executable->worker_states[worker_id];
+  iree_vmvx_module_state_update_workgroup_state(worker_state->vmvx_module_state,
+                                                workgroup_state->processor_id);
 
   // On-stack interface local to this invocation.
   // Note that we _could_ share this across all invocations in a dispatch, but
@@ -296,13 +393,12 @@
       iree_vm_list_storage_size(&buffer_type, dispatch_state->binding_count);
   void* binding_list_storage = iree_alloca(binding_list_size);
   iree_vm_list_t* binding_list = NULL;
-  IREE_RETURN_AND_END_ZONE_IF_ERROR(
-      z0, iree_vm_list_initialize(
-              iree_make_byte_span(binding_list_storage, binding_list_size),
-              &buffer_type, dispatch_state->binding_count, &binding_list));
-  iree_vm_list_retain(binding_list);  // for call
+  IREE_RETURN_IF_ERROR(iree_vm_list_initialize(
+      iree_make_byte_span(binding_list_storage, binding_list_size),
+      &buffer_type, dispatch_state->binding_count, &binding_list));
 
   // Map bindings into on-stack VMVX buffers.
+  iree_status_t status = iree_ok_status();
   iree_vm_buffer_t* binding_buffers = (iree_vm_buffer_t*)iree_alloca(
       dispatch_state->binding_count * sizeof(iree_vm_buffer_t));
   for (iree_host_size_t i = 0; i < dispatch_state->binding_count; ++i) {
@@ -318,11 +414,15 @@
                             dispatch_state->binding_lengths[i]),
         iree_allocator_null(), binding_buffer);
     iree_vm_ref_t ref = {0};
-    IREE_RETURN_AND_END_ZONE_IF_ERROR(
-        z0, iree_vm_ref_wrap_assign(binding_buffer, iree_vm_buffer_type_id(),
-                                    &ref));
-    IREE_RETURN_AND_END_ZONE_IF_ERROR(
-        z0, iree_vm_list_push_ref_retain(binding_list, &ref));
+    status =
+        iree_vm_ref_wrap_assign(binding_buffer, iree_vm_buffer_type_id(), &ref);
+    if (!iree_status_is_ok(status)) break;
+    status = iree_vm_list_push_ref_retain(binding_list, &ref);
+    if (!iree_status_is_ok(status)) break;
+  }
+  if (!iree_status_is_ok(status)) {
+    iree_vm_list_deinitialize(binding_list);
+    return status;
   }
 
   // Acquire workgroup local memory for the dispatch.
@@ -332,7 +432,6 @@
       iree_make_byte_span(workgroup_state->local_memory,
                           workgroup_state->local_memory_size),
       iree_allocator_null(), &local_memory_buffer);
-  iree_vm_buffer_retain(&local_memory_buffer);  // for call
 
   // Map the push constant memory directly from the dispatch state.
   iree_vm_buffer_t constants_buffer;
@@ -342,7 +441,6 @@
           (void*)dispatch_state->push_constants,
           sizeof(uint32_t) * dispatch_state->push_constant_count),
       iree_allocator_null(), &constants_buffer);
-  iree_vm_buffer_retain(&constants_buffer);  // for call
 
   // Prepare call argument buffer. We've verified the signature on creation and
   // know the exact format we can assume here.
@@ -407,12 +505,18 @@
       .workgroup_count_z = dispatch_state->workgroup_count_z,
   };
 
-  // On-stack stack. We really do abuse the stack too much here.
+  // Call arguments are retained by the caller.
+  iree_vm_list_retain(binding_list);            // for call
+  iree_vm_buffer_retain(&local_memory_buffer);  // for call
+  iree_vm_buffer_retain(&constants_buffer);     // for call
+
+  // VM stack stored on native stack. We really do abuse the stack too much
+  // here but it's 8KB and that should be reasonable given that there isn't too
+  // much above us in the stack.
   // TODO(benvanik): pass in an iree_arena_t that can be used for this.
-  // TODO(benvanik): invocation flag that prevents global stores.
   IREE_VM_INLINE_STACK_INITIALIZE(
       stack, IREE_VM_INVOCATION_FLAG_TRACE_INLINE,
-      iree_vm_context_state_resolver(executable->context),
+      iree_vm_context_state_resolver(worker_state->context),
       executable->base.host_allocator);
 
   // Direct call interface.
@@ -423,9 +527,9 @@
   call.function = entry_fn;
   call.arguments = iree_make_byte_span(&call_args, sizeof(call_args));
   call.results = iree_make_byte_span(NULL, 0);
-  iree_status_t status =
-      entry_fn.module->begin_call(entry_fn.module->self, stack, call);
+  status = entry_fn.module->begin_call(entry_fn.module->self, stack, call);
 
+  // Clean up the stack if needed, such as when the call fails.
   iree_vm_stack_deinitialize(stack);
 
   iree_vm_buffer_deinitialize(&local_memory_buffer);
@@ -435,7 +539,6 @@
     iree_vm_buffer_deinitialize(&binding_buffers[i]);
   }
 
-  IREE_TRACE_ZONE_END(z0);
   return status;
 }
 
@@ -497,7 +600,7 @@
     // This yields a single ordered list of modules to pass into contexts with
     // the generated module coming last so it can resolve imports from all.
     executable_loader->common_module_count = common_module_count;
-    executable_loader->common_modules[0] = vmvx_module;
+    executable_loader->common_modules[IREE_VMVX_MODULE_INDEX] = vmvx_module;
     iree_vm_module_retain(vmvx_module);
     for (iree_host_size_t i = 0; i < user_module_count; ++i) {
       executable_loader->common_modules[1 + i] = user_modules[i];
@@ -560,7 +663,7 @@
 static iree_status_t iree_hal_vmvx_module_loader_try_load(
     iree_hal_executable_loader_t* base_executable_loader,
     const iree_hal_executable_params_t* executable_params,
-    iree_hal_executable_t** out_executable) {
+    iree_host_size_t worker_capacity, iree_hal_executable_t** out_executable) {
   iree_hal_vmvx_module_loader_t* executable_loader =
       (iree_hal_vmvx_module_loader_t*)base_executable_loader;
   IREE_TRACE_ZONE_BEGIN(z0);
@@ -593,12 +696,12 @@
       bytecode_module_allocator, executable_loader->host_allocator,
       &bytecode_module);
 
-  // Create the context tying together the shared VMVX module and the
-  // user-provided module that references it. We always link the compiled module
-  // in last so that it can use the VMVX module as well as any user-provided
-  // modules.
-  iree_vm_context_t* context = NULL;
+  // Executable takes ownership of the entire context (including the bytecode
+  // module, which itself may own the underlying allocation).
   if (iree_status_is_ok(status)) {
+    // Merge the context modules into a single flat list (as we have to pass
+    // that down the API chain). If we had more than 2 modules this would be
+    // worth fixing.
     iree_host_size_t context_module_count =
         executable_loader->common_module_count + 1;
     iree_vm_module_t** context_modules = (iree_vm_module_t**)iree_alloca(
@@ -607,21 +710,14 @@
            executable_loader->common_module_count *
                sizeof(executable_loader->common_modules[0]));
     context_modules[context_module_count - 1] = bytecode_module;
-    status = iree_vm_context_create_with_modules(
-        executable_loader->instance, IREE_VM_CONTEXT_FLAG_CONCURRENT,
-        context_module_count, context_modules,
-        executable_loader->host_allocator, &context);
-  }
 
-  // Executable takes ownership of the entire context (including the bytecode
-  // module, which itself may own the underlying allocation).
-  if (iree_status_is_ok(status)) {
+    // Create the executable, including the VM contexts for each worker.
     status = iree_hal_vmvx_executable_create(
-        context, bytecode_module, executable_params,
+        executable_loader->instance, context_module_count, context_modules,
+        bytecode_module, worker_capacity, executable_params,
         executable_loader->host_allocator, out_executable);
   }
 
-  iree_vm_context_release(context);
   iree_vm_module_release(bytecode_module);
 
   IREE_TRACE_ZONE_END(z0);
diff --git a/runtime/src/iree/hal/local/local_executable.c b/runtime/src/iree/hal/local/local_executable.c
index ea44886..1ae0626 100644
--- a/runtime/src/iree/hal/local/local_executable.c
+++ b/runtime/src/iree/hal/local/local_executable.c
@@ -50,13 +50,15 @@
 iree_status_t iree_hal_local_executable_issue_call(
     iree_hal_local_executable_t* executable, iree_host_size_t ordinal,
     const iree_hal_executable_dispatch_state_v0_t* dispatch_state,
-    const iree_hal_executable_workgroup_state_v0_t* workgroup_state) {
+    const iree_hal_executable_workgroup_state_v0_t* workgroup_state,
+    uint32_t worker_id) {
   IREE_ASSERT_ARGUMENT(executable);
   IREE_ASSERT_ARGUMENT(dispatch_state);
   IREE_ASSERT_ARGUMENT(workgroup_state);
   return ((const iree_hal_local_executable_vtable_t*)
               executable->resource.vtable)
-      ->issue_call(executable, ordinal, dispatch_state, workgroup_state);
+      ->issue_call(executable, ordinal, dispatch_state, workgroup_state,
+                   worker_id);
 }
 
 iree_status_t iree_hal_local_executable_issue_dispatch_inline(
@@ -95,7 +97,8 @@
       for (uint32_t x = 0; x < workgroup_count_x; ++x) {
         workgroup_state.workgroup_id_x = x;
         status = iree_hal_local_executable_issue_call(
-            executable, ordinal, dispatch_state, &workgroup_state);
+            executable, ordinal, dispatch_state, &workgroup_state,
+            /*worker_id=*/0);
         if (!iree_status_is_ok(status)) break;
       }
     }
diff --git a/runtime/src/iree/hal/local/local_executable.h b/runtime/src/iree/hal/local/local_executable.h
index 819ef27..6eeb038 100644
--- a/runtime/src/iree/hal/local/local_executable.h
+++ b/runtime/src/iree/hal/local/local_executable.h
@@ -45,7 +45,8 @@
   iree_status_t(IREE_API_PTR* issue_call)(
       iree_hal_local_executable_t* executable, iree_host_size_t ordinal,
       const iree_hal_executable_dispatch_state_v0_t* dispatch_state,
-      const iree_hal_executable_workgroup_state_v0_t* workgroup_state);
+      const iree_hal_executable_workgroup_state_v0_t* workgroup_state,
+      uint32_t worker_id);
 } iree_hal_local_executable_vtable_t;
 
 // Initializes the local executable base type.
@@ -69,7 +70,8 @@
 iree_status_t iree_hal_local_executable_issue_call(
     iree_hal_local_executable_t* executable, iree_host_size_t ordinal,
     const iree_hal_executable_dispatch_state_v0_t* dispatch_state,
-    const iree_hal_executable_workgroup_state_v0_t* workgroup_state);
+    const iree_hal_executable_workgroup_state_v0_t* workgroup_state,
+    uint32_t worker_id);
 
 iree_status_t iree_hal_local_executable_issue_dispatch_inline(
     iree_hal_local_executable_t* executable, iree_host_size_t ordinal,
diff --git a/runtime/src/iree/hal/local/local_executable_cache.c b/runtime/src/iree/hal/local/local_executable_cache.c
index c446c28..f496e2d 100644
--- a/runtime/src/iree/hal/local/local_executable_cache.c
+++ b/runtime/src/iree/hal/local/local_executable_cache.c
@@ -15,6 +15,7 @@
   iree_hal_resource_t resource;
   iree_allocator_t host_allocator;
   iree_string_view_t identifier;
+  iree_host_size_t worker_capacity;
   iree_host_size_t loader_count;
   iree_hal_executable_loader_t* loaders[];
 } iree_hal_local_executable_cache_t;
@@ -29,8 +30,9 @@
 }
 
 iree_status_t iree_hal_local_executable_cache_create(
-    iree_string_view_t identifier, iree_host_size_t loader_count,
-    iree_hal_executable_loader_t** loaders, iree_allocator_t host_allocator,
+    iree_string_view_t identifier, iree_host_size_t worker_capacity,
+    iree_host_size_t loader_count, iree_hal_executable_loader_t** loaders,
+    iree_allocator_t host_allocator,
     iree_hal_executable_cache_t** out_executable_cache) {
   IREE_ASSERT_ARGUMENT(!loader_count || loaders);
   IREE_ASSERT_ARGUMENT(out_executable_cache);
@@ -51,6 +53,7 @@
     iree_string_view_append_to_buffer(
         identifier, &executable_cache->identifier,
         (char*)executable_cache + total_size - identifier.size);
+    executable_cache->worker_capacity = worker_capacity;
 
     executable_cache->loader_count = loader_count;
     for (iree_host_size_t i = 0; i < executable_cache->loader_count; ++i) {
@@ -112,7 +115,8 @@
     // supported then the try will fail with IREE_STATUS_CANCELLED and we should
     // continue trying other loaders.
     iree_status_t status = iree_hal_executable_loader_try_load(
-        executable_cache->loaders[i], executable_params, out_executable);
+        executable_cache->loaders[i], executable_params,
+        executable_cache->worker_capacity, out_executable);
     if (iree_status_is_ok(status)) {
       // Executable was successfully loaded.
       return status;
diff --git a/runtime/src/iree/hal/local/local_executable_cache.h b/runtime/src/iree/hal/local/local_executable_cache.h
index 0bec265..7721aeb 100644
--- a/runtime/src/iree/hal/local/local_executable_cache.h
+++ b/runtime/src/iree/hal/local/local_executable_cache.h
@@ -24,8 +24,9 @@
 // situations we're likely to want that isolation _and_ sharing.
 
 iree_status_t iree_hal_local_executable_cache_create(
-    iree_string_view_t identifier, iree_host_size_t loader_count,
-    iree_hal_executable_loader_t** loaders, iree_allocator_t host_allocator,
+    iree_string_view_t identifier, iree_host_size_t worker_capacity,
+    iree_host_size_t loader_count, iree_hal_executable_loader_t** loaders,
+    iree_allocator_t host_allocator,
     iree_hal_executable_cache_t** out_executable_cache);
 
 #ifdef __cplusplus
diff --git a/runtime/src/iree/modules/hal/loader/module.c b/runtime/src/iree/modules/hal/loader/module.c
index cb4cc10..d1b8be4 100644
--- a/runtime/src/iree/modules/hal/loader/module.c
+++ b/runtime/src/iree/modules/hal/loader/module.c
@@ -156,7 +156,7 @@
     // supported then the try will fail with IREE_STATUS_CANCELLED and we should
     // continue trying other loaders.
     iree_status_t status = iree_hal_executable_loader_try_load(
-        loader, executable_params, out_executable);
+        loader, executable_params, /*worker_capacity=*/1, out_executable);
     if (iree_status_is_ok(status)) {
       // Executable was successfully loaded.
       return status;
diff --git a/runtime/src/iree/modules/vmvx/module.c b/runtime/src/iree/modules/vmvx/module.c
index 9f3e998..4123156 100644
--- a/runtime/src/iree/modules/vmvx/module.c
+++ b/runtime/src/iree/modules/vmvx/module.c
@@ -44,6 +44,11 @@
 typedef struct iree_vmvx_module_state_t {
   iree_allocator_t host_allocator;
 
+  // Logical processor identifier used to index into processor info fields.
+  // Depending on the implementation this may be an ordinal, a bitfield, or an
+  // opaque unique identifier.
+  uint32_t processor_id;
+
   // If we have any external libraries we want to interact with that are
   // stateful we could store their state here. Note that VMVX invocations may
   // happen from any thread and concurrently and if the state is not thread-safe
@@ -888,3 +893,9 @@
   *out_module = base_module;
   return iree_ok_status();
 }
+
+IREE_API_EXPORT void iree_vmvx_module_state_update_workgroup_state(
+    iree_vm_module_state_t* module_state, uint32_t processor_id) {
+  iree_vmvx_module_state_t* state = (iree_vmvx_module_state_t*)module_state;
+  state->processor_id = processor_id;
+}
diff --git a/runtime/src/iree/modules/vmvx/module.h b/runtime/src/iree/modules/vmvx/module.h
index 5861a86..38ced39 100644
--- a/runtime/src/iree/modules/vmvx/module.h
+++ b/runtime/src/iree/modules/vmvx/module.h
@@ -21,6 +21,10 @@
     iree_vm_instance_t* instance, iree_allocator_t host_allocator,
     iree_vm_module_t** out_module);
 
+// Stores |processor_id| in the context-local |module_state| of the module.
+IREE_API_EXPORT void iree_vmvx_module_state_update_workgroup_state(
+    iree_vm_module_state_t* module_state, uint32_t processor_id);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/runtime/src/iree/task/affinity_set.h b/runtime/src/iree/task/affinity_set.h
index e81e549..bf2bf6a 100644
--- a/runtime/src/iree/task/affinity_set.h
+++ b/runtime/src/iree/task/affinity_set.h
@@ -44,6 +44,8 @@
   return UINT64_MAX;
 }
 
+#define iree_task_affinity_set_count_leading_zeros \
+  iree_math_count_leading_zeros_u64
 #define iree_task_affinity_set_count_trailing_zeros \
   iree_math_count_trailing_zeros_u64
 #define iree_task_affinity_set_count_ones iree_math_count_ones_u64
diff --git a/runtime/src/iree/task/task.c b/runtime/src/iree/task/task.c
index aaa8999..c125251 100644
--- a/runtime/src/iree/task/task.c
+++ b/runtime/src/iree/task/task.c
@@ -709,7 +709,7 @@
 
 void iree_task_dispatch_shard_execute(
     iree_task_dispatch_shard_t* task, iree_cpu_processor_id_t processor_id,
-    iree_byte_span_t worker_local_memory,
+    uint32_t worker_id, iree_byte_span_t worker_local_memory,
     iree_task_submission_t* pending_submission) {
   IREE_TRACE_ZONE_BEGIN(z0);
 
@@ -745,6 +745,7 @@
          sizeof(tile_context.workgroup_count));
   uint32_t workgroup_count_x = tile_context.workgroup_count[0];
   uint32_t workgroup_count_y = tile_context.workgroup_count[1];
+  tile_context.worker_id = worker_id;
   tile_context.local_memory = local_memory;
 
   // We perform all our shard statistics work locally here and only push back to
diff --git a/runtime/src/iree/task/task.h b/runtime/src/iree/task/task.h
index aeef180..b6c5ec9 100644
--- a/runtime/src/iree/task/task.h
+++ b/runtime/src/iree/task/task.h
@@ -536,6 +536,9 @@
   // May be slightly out of date or 0 if the processor could not be queried.
   iree_cpu_processor_id_t processor_id;
 
+  // Worker that is processing the tile, [0, worker_capacity).
+  uint32_t worker_id;
+
   // Tile-local memory that is pinned to each worker ensuring no cache
   // thrashing. Aligned to at least the natural pointer size of the machine.
   // Contents are (today) undefined upon entry.
diff --git a/runtime/src/iree/task/task_impl.h b/runtime/src/iree/task/task_impl.h
index ee1b5a3..b114c9d 100644
--- a/runtime/src/iree/task/task_impl.h
+++ b/runtime/src/iree/task/task_impl.h
@@ -122,7 +122,7 @@
 // all shards have completed.
 void iree_task_dispatch_shard_execute(
     iree_task_dispatch_shard_t* task, iree_cpu_processor_id_t processor_id,
-    iree_byte_span_t worker_local_memory,
+    uint32_t worker_id, iree_byte_span_t worker_local_memory,
     iree_task_submission_t* pending_submission);
 
 #ifdef __cplusplus
diff --git a/runtime/src/iree/task/worker.c b/runtime/src/iree/task/worker.c
index 5252bec..72e3377 100644
--- a/runtime/src/iree/task/worker.c
+++ b/runtime/src/iree/task/worker.c
@@ -182,6 +182,7 @@
     case IREE_TASK_TYPE_DISPATCH_SHARD: {
       iree_task_dispatch_shard_execute(
           (iree_task_dispatch_shard_t*)task, worker->processor_id,
+          iree_task_affinity_set_count_trailing_zeros(worker->worker_bit),
           worker->local_memory, pending_submission);
       break;
     }
diff --git a/runtime/src/iree/vm/stack.c b/runtime/src/iree/vm/stack.c
index a805a7b..f3aa4c4 100644
--- a/runtime/src/iree/vm/stack.c
+++ b/runtime/src/iree/vm/stack.c
@@ -230,13 +230,24 @@
   return iree_ok_status();
 }
 
-IREE_API_EXPORT void iree_vm_stack_deinitialize(iree_vm_stack_t* stack) {
+IREE_API_EXPORT void iree_vm_stack_reset(iree_vm_stack_t* stack) {
   IREE_TRACE_ZONE_BEGIN(z0);
 
+  // Pop each frame of the stack in reverse.
   while (stack->top) {
     iree_status_ignore(iree_vm_stack_function_leave(stack));
   }
 
+  IREE_TRACE_ZONE_END(z0);
+}
+
+IREE_API_EXPORT void iree_vm_stack_deinitialize(iree_vm_stack_t* stack) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Release stack frame resources.
+  iree_vm_stack_reset(stack);
+
+  // Drop allocated frame storage.
   if (stack->owns_frame_storage) {
     iree_allocator_free(stack->allocator, stack->frame_storage);
   }
diff --git a/runtime/src/iree/vm/stack.h b/runtime/src/iree/vm/stack.h
index bb77123..7bd6663 100644
--- a/runtime/src/iree/vm/stack.h
+++ b/runtime/src/iree/vm/stack.h
@@ -215,6 +215,9 @@
     iree_vm_state_resolver_t state_resolver, iree_allocator_t allocator,
     iree_vm_stack_t** out_stack);
 
+// Resets the stack to its initial state by popping all stack frames.
+IREE_API_EXPORT void iree_vm_stack_reset(iree_vm_stack_t* stack);
+
 // Deinitializes a statically-allocated |stack| previously initialized with
 // iree_vm_stack_initialize.
 IREE_API_EXPORT void iree_vm_stack_deinitialize(iree_vm_stack_t* stack);