Enabling bytecode module coroutine begin/resume. (#9497)

iree_vm_bytecode_dispatch_begin now begins a call and, if the call does not
complete prior to returning, the caller can repeatedly issue
iree_vm_bytecode_dispatch_resume until it does.

Invocations are marked with fiber enter/leave to allow Tracy to
visualize them. Unfortunately, due to proper nesting requirements,
we can't do the fiber management at a more common level; instead,
each implementation will need to manage it around its own execution.

Future changes will update iree/vm/invocation.h to support stateful
coroutine-style invocations with a prettier API. This is just the
internals to verify that the bytecode dispatch supports it.

Progress on #8093.
diff --git a/runtime/src/iree/base/tracing.h b/runtime/src/iree/base/tracing.h
index 9ee22e5..784a62a 100644
--- a/runtime/src/iree/base/tracing.h
+++ b/runtime/src/iree/base/tracing.h
@@ -104,28 +104,29 @@
 // IREE_TRACING_MODE = 4: same as 3 with callstacks for all instrumentation
 #if !defined(IREE_TRACING_FEATURES)
 #if defined(IREE_TRACING_MODE) && IREE_TRACING_MODE == 1
-#define IREE_TRACING_FEATURES \
-  (IREE_TRACING_FEATURE_INSTRUMENTATION | IREE_TRACING_FEATURE_LOG_MESSAGES)
+#define IREE_TRACING_FEATURES                                                 \
+  (IREE_TRACING_FEATURE_INSTRUMENTATION | IREE_TRACING_FEATURE_LOG_MESSAGES | \
+   IREE_TRACING_FEATURE_FIBERS)
 #undef IREE_TRACING_MAX_CALLSTACK_DEPTH
 #define IREE_TRACING_MAX_CALLSTACK_DEPTH 0
 #elif defined(IREE_TRACING_MODE) && IREE_TRACING_MODE == 2
 #define IREE_TRACING_FEATURES                 \
   (IREE_TRACING_FEATURE_INSTRUMENTATION |     \
    IREE_TRACING_FEATURE_ALLOCATION_TRACKING | \
-   IREE_TRACING_FEATURE_LOG_MESSAGES)
+   IREE_TRACING_FEATURE_LOG_MESSAGES | IREE_TRACING_FEATURE_FIBERS)
 #elif defined(IREE_TRACING_MODE) && IREE_TRACING_MODE == 3
 #define IREE_TRACING_FEATURES                   \
   (IREE_TRACING_FEATURE_INSTRUMENTATION |       \
    IREE_TRACING_FEATURE_ALLOCATION_TRACKING |   \
    IREE_TRACING_FEATURE_ALLOCATION_CALLSTACKS | \
-   IREE_TRACING_FEATURE_LOG_MESSAGES)
+   IREE_TRACING_FEATURE_LOG_MESSAGES | IREE_TRACING_FEATURE_FIBERS)
 #elif defined(IREE_TRACING_MODE) && IREE_TRACING_MODE >= 4
 #define IREE_TRACING_FEATURES                        \
   (IREE_TRACING_FEATURE_INSTRUMENTATION |            \
    IREE_TRACING_FEATURE_INSTRUMENTATION_CALLSTACKS | \
    IREE_TRACING_FEATURE_ALLOCATION_TRACKING |        \
    IREE_TRACING_FEATURE_ALLOCATION_CALLSTACKS |      \
-   IREE_TRACING_FEATURE_LOG_MESSAGES)
+   IREE_TRACING_FEATURE_LOG_MESSAGES | IREE_TRACING_FEATURE_FIBERS)
 #else
 #define IREE_TRACING_FEATURES 0
 #endif  // IREE_TRACING_MODE
@@ -292,7 +293,7 @@
 #if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_FIBERS
 // Enters a fiber context.
 // |fiber| must be unique and remain live for the process lifetime.
-#define IREE_TRACE_FIBER_ENTER(fiber) ___tracy_fiber_enter(fiber)
+#define IREE_TRACE_FIBER_ENTER(fiber) ___tracy_fiber_enter((const char*)fiber)
 // Exits a fiber context.
 #define IREE_TRACE_FIBER_LEAVE() ___tracy_fiber_leave()
 #else
diff --git a/runtime/src/iree/hal/local/loaders/vmvx_module_loader.c b/runtime/src/iree/hal/local/loaders/vmvx_module_loader.c
index 3fd16b3..42618fd 100644
--- a/runtime/src/iree/hal/local/loaders/vmvx_module_loader.c
+++ b/runtime/src/iree/hal/local/loaders/vmvx_module_loader.c
@@ -395,6 +395,7 @@
   // TODO(benvanik): pass in an iree_arena_t that can be used for this.
   IREE_VM_INLINE_STACK_INITIALIZE(
       stack, IREE_VM_INVOCATION_FLAG_NONE,
+      iree_vm_context_id(executable->context),
       iree_vm_context_state_resolver(executable->context),
       executable->base.host_allocator);
 
diff --git a/runtime/src/iree/runtime/session.c b/runtime/src/iree/runtime/session.c
index 1f5d9d9..cbbc9ca 100644
--- a/runtime/src/iree/runtime/session.c
+++ b/runtime/src/iree/runtime/session.c
@@ -293,10 +293,11 @@
   IREE_TRACE_ZONE_BEGIN(z0);
 
   // Allocate a VM stack on the host stack and initialize it.
-  IREE_VM_INLINE_STACK_INITIALIZE(
-      stack, IREE_VM_INVOCATION_FLAG_NONE,
-      iree_vm_context_state_resolver(iree_runtime_session_context(session)),
-      iree_runtime_session_host_allocator(session));
+  iree_vm_context_t* context = iree_runtime_session_context(session);
+  IREE_VM_INLINE_STACK_INITIALIZE(stack, IREE_VM_INVOCATION_FLAG_NONE,
+                                  iree_vm_context_id(context),
+                                  iree_vm_context_state_resolver(context),
+                                  iree_runtime_session_host_allocator(session));
 
   // Issue the call.
   iree_vm_execution_result_t result;
diff --git a/runtime/src/iree/vm/BUILD b/runtime/src/iree/vm/BUILD
index 465206e..2615571 100644
--- a/runtime/src/iree/vm/BUILD
+++ b/runtime/src/iree/vm/BUILD
@@ -233,13 +233,10 @@
 iree_runtime_cc_test(
     name = "bytecode_module_test",
     srcs = [
+        "bytecode_dispatch_async_test.cc",
         "bytecode_dispatch_test.cc",
         "bytecode_module_test.cc",
     ],
-    tags = [
-        # TODO(benvanik): Fix type casting errors for --config=android_arm.
-        "notap",
-    ],
     deps = [
         ":bytecode_module",
         ":vm",
@@ -248,6 +245,7 @@
         "//runtime/src/iree/testing:gtest",
         "//runtime/src/iree/testing:gtest_main",
         "//runtime/src/iree/vm/test:all_bytecode_modules_c",
+        "//runtime/src/iree/vm/test:async_bytecode_modules_c",
     ],
 )
 
diff --git a/runtime/src/iree/vm/CMakeLists.txt b/runtime/src/iree/vm/CMakeLists.txt
index 2fd7c3c..50afe3d 100644
--- a/runtime/src/iree/vm/CMakeLists.txt
+++ b/runtime/src/iree/vm/CMakeLists.txt
@@ -197,6 +197,7 @@
   NAME
     bytecode_module_test
   SRCS
+    "bytecode_dispatch_async_test.cc"
     "bytecode_dispatch_test.cc"
     "bytecode_module_test.cc"
   DEPS
@@ -207,8 +208,7 @@
     iree::testing::gtest
     iree::testing::gtest_main
     iree::vm::test::all_bytecode_modules_c
-  LABELS
-    "notap"
+    iree::vm::test::async_bytecode_modules_c
 )
 
 iree_cc_binary_benchmark(
diff --git a/runtime/src/iree/vm/bytecode_dispatch.c b/runtime/src/iree/vm/bytecode_dispatch.c
index f81314c..2527a9d 100644
--- a/runtime/src/iree/vm/bytecode_dispatch.c
+++ b/runtime/src/iree/vm/bytecode_dispatch.c
@@ -103,7 +103,7 @@
 
 static iree_status_t iree_vm_bytecode_function_enter(
     iree_vm_stack_t* stack, const iree_vm_function_t function,
-    iree_vm_stack_frame_t** out_callee_frame,
+    iree_string_view_t cconv_results, iree_vm_stack_frame_t** out_callee_frame,
     iree_vm_registers_t* out_callee_registers) {
   iree_vm_bytecode_module_t* module =
       (iree_vm_bytecode_module_t*)function.module->self;
@@ -160,6 +160,7 @@
   iree_vm_bytecode_frame_storage_t* stack_storage =
       (iree_vm_bytecode_frame_storage_t*)iree_vm_stack_frame_storage(
           *out_callee_frame);
+  stack_storage->cconv_results = cconv_results;
   stack_storage->i32_register_count = i32_register_count;
   stack_storage->ref_register_count = ref_register_count;
   stack_storage->i32_register_offset = header_size;
@@ -180,11 +181,11 @@
 static iree_status_t iree_vm_bytecode_external_enter(
     iree_vm_stack_t* stack, const iree_vm_function_t function,
     iree_string_view_t cconv_arguments, iree_byte_span_t arguments,
-    iree_vm_stack_frame_t** out_callee_frame,
+    iree_string_view_t cconv_results, iree_vm_stack_frame_t** out_callee_frame,
     iree_vm_registers_t* out_callee_registers) {
   // Enter the bytecode function and allocate registers.
   IREE_RETURN_IF_ERROR(iree_vm_bytecode_function_enter(
-      stack, function, out_callee_frame, out_callee_registers));
+      stack, function, cconv_results, out_callee_frame, out_callee_registers));
 
   // Marshal arguments from the ABI format to the VM registers.
   iree_vm_registers_t callee_registers = *out_callee_registers;
@@ -232,8 +233,13 @@
     iree_vm_stack_t* stack, iree_vm_stack_frame_t* callee_frame,
     const iree_vm_registers_t* IREE_RESTRICT callee_registers,
     const iree_vm_register_list_t* IREE_RESTRICT src_reg_list,
-    iree_string_view_t cconv_results, iree_byte_span_t results) {
+    iree_byte_span_t results) {
+  const iree_vm_bytecode_frame_storage_t* stack_storage =
+      (iree_vm_bytecode_frame_storage_t*)iree_vm_stack_frame_storage(
+          callee_frame);
+
   // Marshal results from registers to the ABI results buffer.
+  iree_string_view_t cconv_results = stack_storage->cconv_results;
   uint8_t* p = results.data;
   for (iree_host_size_t i = 0; i < cconv_results.size; ++i) {
     uint16_t src_reg = src_reg_list->registers[i];
@@ -289,8 +295,9 @@
   function.module = module;
   function.linkage = IREE_VM_FUNCTION_LINKAGE_INTERNAL;
   function.ordinal = function_ordinal;
-  IREE_RETURN_IF_ERROR(iree_vm_bytecode_function_enter(
-      stack, function, out_callee_frame, out_callee_registers));
+  IREE_RETURN_IF_ERROR(
+      iree_vm_bytecode_function_enter(stack, function, iree_string_view_empty(),
+                                      out_callee_frame, out_callee_registers));
 
   // Remaps argument/result registers from a source list in the caller/callee
   // frame to the 0-N ABI registers in the callee/caller frame.
@@ -634,24 +641,49 @@
 // Main interpreter dispatch routine
 //===----------------------------------------------------------------------===//
 
-iree_status_t iree_vm_bytecode_dispatch(
+static iree_status_t iree_vm_bytecode_dispatch(
+    iree_vm_stack_t* stack, iree_vm_bytecode_module_t* module,
+    iree_vm_stack_frame_t* current_frame, iree_vm_registers_t regs,
+    iree_byte_span_t call_results, iree_vm_execution_result_t* out_result);
+
+iree_status_t iree_vm_bytecode_dispatch_begin(
     iree_vm_stack_t* stack, iree_vm_bytecode_module_t* module,
     const iree_vm_function_call_t* call, iree_string_view_t cconv_arguments,
     iree_string_view_t cconv_results, iree_vm_execution_result_t* out_result) {
-  memset(out_result, 0, sizeof(*out_result));
-
-  // When required emit the dispatch tables here referencing the labels we are
-  // defining below.
-  DEFINE_DISPATCH_TABLES();
-
   // Enter function (as this is the initial call).
   // The callee's return will take care of storing the output registers when it
   // actually does return, either immediately or in the future via a resume.
   iree_vm_stack_frame_t* current_frame = NULL;
   iree_vm_registers_t regs;
-  IREE_RETURN_IF_ERROR(
-      iree_vm_bytecode_external_enter(stack, call->function, cconv_arguments,
-                                      call->arguments, &current_frame, &regs));
+  IREE_RETURN_IF_ERROR(iree_vm_bytecode_external_enter(
+      stack, call->function, cconv_arguments, call->arguments, cconv_results,
+      &current_frame, &regs));
+
+  return iree_vm_bytecode_dispatch(stack, module, current_frame, regs,
+                                   call->results, out_result);
+}
+
+iree_status_t iree_vm_bytecode_dispatch_resume(
+    iree_vm_stack_t* stack, iree_vm_bytecode_module_t* module,
+    iree_byte_span_t call_results, iree_vm_execution_result_t* out_result) {
+  iree_vm_stack_frame_t* current_frame = iree_vm_stack_current_frame(stack);
+  iree_vm_registers_t regs =
+      iree_vm_bytecode_get_register_storage(current_frame);
+  // TODO(benvanik): assert the module is at the top of the frame? We should
+  // only be coming in from a call based on the current frame.
+  return iree_vm_bytecode_dispatch(stack, module, current_frame, regs,
+                                   call_results, out_result);
+}
+
+static iree_status_t iree_vm_bytecode_dispatch(
+    iree_vm_stack_t* stack, iree_vm_bytecode_module_t* module,
+    iree_vm_stack_frame_t* current_frame, iree_vm_registers_t regs,
+    iree_byte_span_t call_results, iree_vm_execution_result_t* out_result) {
+  memset(out_result, 0, sizeof(*out_result));
+
+  // When required emit the dispatch tables here referencing the labels we are
+  // defining below.
+  DEFINE_DISPATCH_TABLES();
 
   // Primary dispatch state. This is our 'native stack frame' and really
   // just enough to make dereferencing common addresses (like the current
@@ -667,7 +699,6 @@
       module->function_descriptor_table[current_frame->function.ordinal]
           .bytecode_offset;
   iree_vm_source_offset_t pc = current_frame->pc;
-  const int32_t entry_frame_depth = current_frame->depth;
 
   BEGIN_DISPATCH_CORE() {
     //===------------------------------------------------------------------===//
@@ -1703,11 +1734,13 @@
           VM_DecVariadicOperands("operands");
       current_frame->pc = pc;
 
-      if (current_frame->depth <= entry_frame_depth) {
+      // TODO(benvanik): faster check for escaping; this is slow (cache misses).
+      iree_vm_stack_frame_t* parent_frame = iree_vm_stack_parent_frame(stack);
+      if (!parent_frame ||
+          parent_frame->module_state != current_frame->module_state) {
         // Return from the top-level entry frame - return back to call().
         return iree_vm_bytecode_external_leave(stack, current_frame, &regs,
-                                               src_reg_list, cconv_results,
-                                               call->results);
+                                               src_reg_list, call_results);
       }
 
       // Store results into the caller frame and pop back to the parent.
@@ -1757,7 +1790,7 @@
       const iree_vm_register_remap_list_t* remap_list =
           VM_DecBranchOperands("operands");
       iree_vm_bytecode_dispatch_remap_branch_registers(regs, remap_list);
-      pc = block_pc;
+      current_frame->pc = block_pc;
 
       // Return magic status code indicating a yield.
       // This isn't an error, though callers not supporting coroutines will
diff --git a/runtime/src/iree/vm/bytecode_dispatch_async_test.cc b/runtime/src/iree/vm/bytecode_dispatch_async_test.cc
new file mode 100644
index 0000000..3baab60
--- /dev/null
+++ b/runtime/src/iree/vm/bytecode_dispatch_async_test.cc
@@ -0,0 +1,165 @@
+// Copyright 2022 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// Tests covering begin/resume dispatch of bytecode functions that yield.
+//
+// iree/vm/test/async_ops.mlir contains the functions used here for testing. We
+// avoid defining the IR inline here so that we can run this test on platforms
+// that we can't run the full MLIR compiler stack on.
+
+#include "iree/base/logging.h"
+#include "iree/base/status_cc.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+#include "iree/vm/api.h"
+#include "iree/vm/bytecode_module.h"
+
+// Compiled module embedded here to avoid file IO:
+#include "iree/vm/test/async_bytecode_modules.h"
+
+namespace iree {
+namespace {
+
+using iree::testing::status::StatusIs;
+
+class VMBytecodeDispatchAsyncTest : public ::testing::Test {
+ protected:
+  static void SetUpTestSuite() {
+    IREE_CHECK_OK(iree_vm_register_builtin_types());
+  }
+
+  void SetUp() override {
+    IREE_TRACE_SCOPE();
+    const iree_file_toc_t* file = async_bytecode_modules_c_create();
+
+    IREE_CHECK_OK(iree_vm_instance_create(iree_allocator_system(), &instance_));
+
+    IREE_CHECK_OK(iree_vm_bytecode_module_create(
+        iree_const_byte_span_t{reinterpret_cast<const uint8_t*>(file->data),
+                               file->size},
+        iree_allocator_null(), iree_allocator_system(), &bytecode_module_));
+
+    std::vector<iree_vm_module_t*> modules = {bytecode_module_};
+    IREE_CHECK_OK(iree_vm_context_create_with_modules(
+        instance_, IREE_VM_CONTEXT_FLAG_NONE, modules.size(), modules.data(),
+        iree_allocator_system(), &context_));
+  }
+
+  void TearDown() override {
+    IREE_TRACE_SCOPE();
+    iree_vm_module_release(bytecode_module_);
+    iree_vm_context_release(context_);
+    iree_vm_instance_release(instance_);
+  }
+
+  iree_vm_instance_t* instance_ = nullptr;
+  iree_vm_context_t* context_ = nullptr;
+  iree_vm_module_t* bytecode_module_ = nullptr;
+};
+
+// Tests a simple straight-line yield sequence that requires 3 resumes.
+// See iree/vm/test/async_ops.mlir > @yield_sequence
+TEST_F(VMBytecodeDispatchAsyncTest, YieldSequence) {
+  IREE_TRACE_SCOPE();
+
+  iree_vm_function_t function;
+  IREE_ASSERT_OK(iree_vm_module_lookup_function_by_name(
+      bytecode_module_, IREE_VM_FUNCTION_LINKAGE_EXPORT,
+      IREE_SV("yield_sequence"), &function));
+  IREE_VM_INLINE_STACK_INITIALIZE(
+      stack, IREE_VM_INVOCATION_FLAG_NONE, iree_vm_context_id(context_),
+      iree_vm_context_state_resolver(context_), iree_allocator_system());
+
+  uint32_t arg_value = 97;
+  uint32_t ret_value = 0;
+
+  iree_vm_function_call_t call;
+  memset(&call, 0, sizeof(call));
+  call.function = function;
+  call.arguments = iree_make_byte_span(&arg_value, sizeof(arg_value));
+  call.results = iree_make_byte_span(&ret_value, sizeof(ret_value));
+  iree_vm_execution_result_t result;
+
+  // 0/3: begin the call; expected to yield (kDeferred) before completing.
+  ASSERT_THAT(
+      function.module->begin_call(function.module->self, stack, &call, &result),
+      StatusIs(StatusCode::kDeferred));
+
+  // 1/3: first resume; expected to yield again.
+  ASSERT_THAT(function.module->resume_call(function.module->self, stack,
+                                           call.results, &result),
+              StatusIs(StatusCode::kDeferred));
+
+  // 2/3: second resume; expected to yield again.
+  ASSERT_THAT(function.module->resume_call(function.module->self, stack,
+                                           call.results, &result),
+              StatusIs(StatusCode::kDeferred));
+
+  // 3/3: final resume; expected to complete and write the result.
+  IREE_ASSERT_OK(function.module->resume_call(function.module->self, stack,
+                                              call.results, &result));
+
+  ASSERT_EQ(ret_value, arg_value + 3);
+
+  iree_vm_stack_deinitialize(stack);
+}
+
+// Tests a yield with data-dependent control, ensuring that we run the
+// alternating branches and pass along branch args on resume.
+// See iree/vm/test/async_ops.mlir > @yield_divergent
+TEST_F(VMBytecodeDispatchAsyncTest, YieldDivergent) {
+  IREE_TRACE_SCOPE();
+
+  iree_vm_function_t function;
+  IREE_ASSERT_OK(iree_vm_module_lookup_function_by_name(
+      bytecode_module_, IREE_VM_FUNCTION_LINKAGE_EXPORT,
+      IREE_SV("yield_divergent"), &function));
+  IREE_VM_INLINE_STACK_INITIALIZE(
+      stack, IREE_VM_INVOCATION_FLAG_NONE, iree_vm_context_id(context_),
+      iree_vm_context_state_resolver(context_), iree_allocator_system());
+
+  // result = %arg0 ? %arg1 : %arg2
+  struct {
+    uint32_t arg0;
+    uint32_t arg1;
+    uint32_t arg2;
+  } arg_values = {
+      0,
+      100,
+      200,
+  };
+  uint32_t ret_value = 0;
+
+  iree_vm_function_call_t call;
+  memset(&call, 0, sizeof(call));
+  call.function = function;
+  call.arguments = iree_make_byte_span(&arg_values, sizeof(arg_values));
+  call.results = iree_make_byte_span(&ret_value, sizeof(ret_value));
+  iree_vm_execution_result_t result;
+
+  // arg0=0: expect one yield, then on resume the result is %arg2.
+  arg_values.arg0 = 0;
+  ASSERT_THAT(
+      function.module->begin_call(function.module->self, stack, &call, &result),
+      StatusIs(StatusCode::kDeferred));
+  IREE_ASSERT_OK(function.module->resume_call(function.module->self, stack,
+                                              call.results, &result));
+  ASSERT_EQ(ret_value, arg_values.arg2);
+
+  // arg0=1: same stack reused; expect one yield, then the result is %arg1.
+  arg_values.arg0 = 1;
+  ASSERT_THAT(
+      function.module->begin_call(function.module->self, stack, &call, &result),
+      StatusIs(StatusCode::kDeferred));
+  IREE_ASSERT_OK(function.module->resume_call(function.module->self, stack,
+                                              call.results, &result));
+  ASSERT_EQ(ret_value, arg_values.arg1);
+
+  iree_vm_stack_deinitialize(stack);
+}
+
+}  // namespace
+}  // namespace iree
diff --git a/runtime/src/iree/vm/bytecode_dispatch_util.h b/runtime/src/iree/vm/bytecode_dispatch_util.h
index 3109ff6..3b073d2 100644
--- a/runtime/src/iree/vm/bytecode_dispatch_util.h
+++ b/runtime/src/iree/vm/bytecode_dispatch_util.h
@@ -79,6 +79,9 @@
 // NOTE: we cannot store pointers to the stack in here as the stack may be
 // reallocated.
 typedef struct iree_vm_bytecode_frame_storage_t {
+  // Calling convention results fragment.
+  iree_string_view_t cconv_results;
+
   // Pointer to a register list within the stack frame where return registers
   // will be stored by callees upon return.
   const iree_vm_register_list_t* return_registers;
diff --git a/runtime/src/iree/vm/bytecode_module.c b/runtime/src/iree/vm/bytecode_module.c
index 92a10ee..070f129 100644
--- a/runtime/src/iree/vm/bytecode_module.c
+++ b/runtime/src/iree/vm/bytecode_module.c
@@ -1030,8 +1030,31 @@
 
   // Jump into the dispatch routine to execute bytecode until the function
   // either returns (synchronous) or yields (asynchronous).
-  iree_status_t status = iree_vm_bytecode_dispatch(
+  IREE_TRACE_FIBER_ENTER(iree_vm_stack_context_id(stack));
+  iree_status_t status = iree_vm_bytecode_dispatch_begin(
       stack, module, call, cconv_arguments, cconv_results, out_result);
+  IREE_TRACE_FIBER_LEAVE();
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+static iree_status_t iree_vm_bytecode_module_resume_call(
+    void* self, iree_vm_stack_t* stack, iree_byte_span_t call_results,
+    iree_vm_execution_result_t* out_result) {
+  // NOTE: any work here adds directly to the invocation time.
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_ASSERT_ARGUMENT(out_result);
+  memset(out_result, 0, sizeof(iree_vm_execution_result_t));
+
+  iree_vm_bytecode_module_t* module = (iree_vm_bytecode_module_t*)self;
+
+  // Resume the call by jumping back into the bytecode dispatch.
+  IREE_TRACE_FIBER_ENTER(iree_vm_stack_context_id(stack));
+  iree_status_t status =
+      iree_vm_bytecode_dispatch_resume(stack, module, call_results, out_result);
+  IREE_TRACE_FIBER_LEAVE();
+
   IREE_TRACE_ZONE_END(z0);
   return status;
 }
@@ -1120,6 +1143,7 @@
   module->interface.resolve_import = iree_vm_bytecode_module_resolve_import;
   module->interface.notify = iree_vm_bytecode_module_notify;
   module->interface.begin_call = iree_vm_bytecode_module_begin_call;
+  module->interface.resume_call = iree_vm_bytecode_module_resume_call;
   module->interface.get_function_reflection_attr =
       iree_vm_bytecode_module_get_function_reflection_attr;
 
diff --git a/runtime/src/iree/vm/bytecode_module_benchmark.cc b/runtime/src/iree/vm/bytecode_module_benchmark.cc
index 8706665..00dd7ec 100644
--- a/runtime/src/iree/vm/bytecode_module_benchmark.cc
+++ b/runtime/src/iree/vm/bytecode_module_benchmark.cc
@@ -106,9 +106,9 @@
       iree_make_byte_span(iree_alloca(result_count * sizeof(int32_t)),
                           result_count * sizeof(int32_t));
 
-  IREE_VM_INLINE_STACK_INITIALIZE(stack, IREE_VM_INVOCATION_FLAG_NONE,
-                                  iree_vm_context_state_resolver(context),
-                                  iree_allocator_system());
+  IREE_VM_INLINE_STACK_INITIALIZE(
+      stack, IREE_VM_INVOCATION_FLAG_NONE, iree_vm_context_id(context),
+      iree_vm_context_state_resolver(context), iree_allocator_system());
   while (state.KeepRunningBatch(batch_size)) {
     for (iree_host_size_t i = 0; i < i32_args.size(); ++i) {
       reinterpret_cast<int32_t*>(call.arguments.data)[i] = i32_args[i];
diff --git a/runtime/src/iree/vm/bytecode_module_impl.h b/runtime/src/iree/vm/bytecode_module_impl.h
index 14c7547..e045473 100644
--- a/runtime/src/iree/vm/bytecode_module_impl.h
+++ b/runtime/src/iree/vm/bytecode_module_impl.h
@@ -138,15 +138,20 @@
   iree_allocator_t allocator;
 } iree_vm_bytecode_module_state_t;
 
-// Begins (or resumes) execution of the current frame and continues until
-// either a yield or return. |out_result| will contain the result status for
-// continuation, if needed.
-iree_status_t iree_vm_bytecode_dispatch(iree_vm_stack_t* stack,
-                                        iree_vm_bytecode_module_t* module,
-                                        const iree_vm_function_call_t* call,
-                                        iree_string_view_t cconv_arguments,
-                                        iree_string_view_t cconv_results,
-                                        iree_vm_execution_result_t* out_result);
+// Begins execution of the current frame and continues until either a yield or
+// return. |out_result| will contain the result status for continuation, if
+// needed.
+iree_status_t iree_vm_bytecode_dispatch_begin(
+    iree_vm_stack_t* stack, iree_vm_bytecode_module_t* module,
+    const iree_vm_function_call_t* call, iree_string_view_t cconv_arguments,
+    iree_string_view_t cconv_results, iree_vm_execution_result_t* out_result);
+
+// Resumes execution of an in-progress frame and continues until either a yield
+// or return. |out_result| will contain the result status for continuation, if
+// needed.
+iree_status_t iree_vm_bytecode_dispatch_resume(
+    iree_vm_stack_t* stack, iree_vm_bytecode_module_t* module,
+    iree_byte_span_t call_results, iree_vm_execution_result_t* out_result);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/runtime/src/iree/vm/context.c b/runtime/src/iree/vm/context.c
index 3cdec7e..4c965cd 100644
--- a/runtime/src/iree/vm/context.c
+++ b/runtime/src/iree/vm/context.c
@@ -22,7 +22,7 @@
   // An opaque ID unique for the entire process lifetime.
   // If tracing then this points at a NUL-terminated string with process
   // lifetime.
-  intptr_t context_id;
+  iree_vm_context_id_t context_id;
 
   // Context has been frozen and can no longer be modified.
   uint32_t is_frozen : 1;
@@ -43,7 +43,7 @@
 static void iree_vm_context_destroy(iree_vm_context_t* context);
 
 // Allocates a process-unique ID for a context to use.
-static intptr_t iree_vm_context_allocate_id(void) {
+static iree_vm_context_id_t iree_vm_context_allocate_id(void) {
   static iree_atomic_int32_t next_context_id = IREE_ATOMIC_VAR_INIT(1);
   uint32_t context_id = iree_atomic_fetch_add_int32(&next_context_id, 1,
                                                     iree_memory_order_seq_cst);
@@ -54,9 +54,9 @@
   char* name = (char*)malloc(32);
   snprintf(name, 32, "ctx-%04d", context_id - 1);
   IREE_LEAK_CHECK_DISABLE_POP();
-  return (intptr_t)name;
+  return (iree_vm_context_id_t)name;
 #else
-  return context_id;
+  return (iree_vm_context_id_t)context_id;
 #endif  // IREE_TRACING_FEATURE_FIBERS
 }
 
@@ -80,10 +80,8 @@
     return status;
   }
 
-  IREE_TRACE_FIBER_ENTER((char*)iree_vm_context_id(context));
   iree_vm_execution_result_t result;
   status = module->begin_call(module->self, stack, &call, &result);
-  IREE_TRACE_FIBER_LEAVE();
   if (!iree_status_is_ok(status)) {
     status = IREE_VM_STACK_ANNOTATE_BACKTRACE_IF_ENABLED(stack, status);
   }
@@ -207,7 +205,8 @@
       context->flags & IREE_VM_CONTEXT_FLAG_TRACE_EXECUTION
           ? IREE_VM_INVOCATION_FLAG_TRACE_EXECUTION
           : IREE_VM_INVOCATION_FLAG_NONE,
-      iree_vm_context_state_resolver(context), context->allocator);
+      iree_vm_context_id(context), iree_vm_context_state_resolver(context),
+      context->allocator);
   for (int i = (int)end; i >= (int)start; --i) {
     iree_vm_module_t* module = context->list.modules[i];
     iree_vm_module_state_t* module_state = context->list.module_states[i];
@@ -335,7 +334,8 @@
   }
 }
 
-IREE_API_EXPORT intptr_t iree_vm_context_id(const iree_vm_context_t* context) {
+IREE_API_EXPORT iree_vm_context_id_t
+iree_vm_context_id(const iree_vm_context_t* context) {
   if (!context) {
     return -1;
   }
@@ -410,7 +410,8 @@
       context->flags & IREE_VM_CONTEXT_FLAG_TRACE_EXECUTION
           ? IREE_VM_INVOCATION_FLAG_TRACE_EXECUTION
           : IREE_VM_INVOCATION_FLAG_NONE,
-      iree_vm_context_state_resolver(context), context->allocator);
+      iree_vm_context_id(context), iree_vm_context_state_resolver(context),
+      context->allocator);
 
   // Retain all modules and allocate their state.
   assert(context->list.capacity >= context->list.count + module_count);
@@ -628,7 +629,8 @@
       context->flags & IREE_VM_CONTEXT_FLAG_TRACE_EXECUTION
           ? IREE_VM_INVOCATION_FLAG_TRACE_EXECUTION
           : IREE_VM_INVOCATION_FLAG_NONE,
-      iree_vm_context_state_resolver(context), context->allocator);
+      iree_vm_context_id(context), iree_vm_context_state_resolver(context),
+      context->allocator);
 
   // Resumes are walked forward while suspends are walked backward.
   // This follows the expected construction/destruction pattern where for
diff --git a/runtime/src/iree/vm/context.h b/runtime/src/iree/vm/context.h
index a2313fe..7867781 100644
--- a/runtime/src/iree/vm/context.h
+++ b/runtime/src/iree/vm/context.h
@@ -73,7 +73,8 @@
 IREE_API_EXPORT void iree_vm_context_release(iree_vm_context_t* context);
 
 // Returns a process-unique ID for the |context|.
-IREE_API_EXPORT intptr_t iree_vm_context_id(const iree_vm_context_t* context);
+IREE_API_EXPORT iree_vm_context_id_t
+iree_vm_context_id(const iree_vm_context_t* context);
 
 // Returns |context| flags.
 IREE_API_EXPORT iree_vm_context_flags_t
diff --git a/runtime/src/iree/vm/invocation.c b/runtime/src/iree/vm/invocation.c
index a95788e..8622474 100644
--- a/runtime/src/iree/vm/invocation.c
+++ b/runtime/src/iree/vm/invocation.c
@@ -16,6 +16,10 @@
 #include "iree/vm/stack.h"
 #include "iree/vm/value.h"
 
+//===----------------------------------------------------------------------===//
+// Invocation utilities for I/O
+//===----------------------------------------------------------------------===//
+
 // Marshals caller arguments from the variant list to the ABI convention.
 static iree_status_t iree_vm_invoke_marshal_inputs(
     iree_string_view_t cconv_arguments, iree_vm_list_t* inputs,
@@ -144,6 +148,10 @@
   return iree_ok_status();
 }
 
+//===----------------------------------------------------------------------===//
+// Synchronous invocations
+//===----------------------------------------------------------------------===//
+
 // TODO(benvanik): implement this as an iree_vm_invocation_t sequence.
 static iree_status_t iree_vm_invoke_within(
     iree_vm_context_t* context, iree_vm_stack_t* stack,
@@ -214,8 +222,9 @@
   }
 
   // Allocate a VM stack on the host stack and initialize it.
-  IREE_VM_INLINE_STACK_INITIALIZE(
-      stack, flags, iree_vm_context_state_resolver(context), allocator);
+  IREE_VM_INLINE_STACK_INITIALIZE(stack, flags, iree_vm_context_id(context),
+                                  iree_vm_context_state_resolver(context),
+                                  allocator);
   iree_status_t status =
       iree_vm_invoke_within(context, stack, function, policy, inputs, outputs);
   if (!iree_status_is_ok(status)) {
diff --git a/runtime/src/iree/vm/invocation.h b/runtime/src/iree/vm/invocation.h
index 9de07b2..8d92ec1 100644
--- a/runtime/src/iree/vm/invocation.h
+++ b/runtime/src/iree/vm/invocation.h
@@ -21,7 +21,14 @@
 typedef struct iree_vm_invocation_t iree_vm_invocation_t;
 typedef struct iree_vm_invocation_policy_t iree_vm_invocation_policy_t;
 
+//===----------------------------------------------------------------------===//
+// Synchronous invocation
+//===----------------------------------------------------------------------===//
+
 // Synchronously invokes a function in the VM.
+// The function will be run to completion and may block on external resources.
+// If more control is required or callers want to have multiple invocations
+// in-flight then iree_vm_invocation_t should be used.
 //
 // |policy| is used to schedule the invocation relative to other pending or
 // in-flight invocations. It may be omitted to leave the behavior up to the
@@ -40,6 +47,10 @@
     iree_vm_list_t* inputs, iree_vm_list_t* outputs,
     iree_allocator_t allocator);
 
+//===----------------------------------------------------------------------===//
+// Asynchronous stateful invocation
+//===----------------------------------------------------------------------===//
+
 // TODO(benvanik): document and implement.
 IREE_API_EXPORT iree_status_t iree_vm_invocation_create(
     iree_vm_context_t* context, iree_vm_function_t function,
@@ -58,20 +69,20 @@
 // Queries the completion status of the invocation.
 // Returns one of the following:
 //   IREE_STATUS_OK: the invocation completed successfully.
-//   IREE_STATUS_UNAVAILABLE: the invocation has not yet completed.
-//   IREE_STATUS_CANCELLED: the invocation was cancelled internally.
-//   IREE_STATUS_ABORTED: the invocation was aborted.
+//   IREE_STATUS_DEFERRED: the invocation has not yet completed.
+//   IREE_STATUS_CANCELLED: the invocation was cancelled by the user.
+//   IREE_STATUS_ABORTED: the invocation was aborted by the executor.
 //   IREE_STATUS_*: an error occurred during invocation.
 IREE_API_EXPORT iree_status_t
 iree_vm_invocation_query_status(iree_vm_invocation_t* invocation);
 
-// Returns a reference to the output of the invocation.
+// Returns a reference to the outputs of the invocation.
 // The returned structure is valid for the lifetime of the invocation and
 // callers must retain any refs they want to outlive the invocation once
 // released.
 //
-// Returns NULL if the invocation did not complete successfully.
-IREE_API_EXPORT const iree_vm_list_t* iree_vm_invocation_output(
+// Returns NULL if the invocation has not yet completed or if it failed.
+IREE_API_EXPORT const iree_vm_list_t* iree_vm_invocation_outputs(
     iree_vm_invocation_t* invocation);
 
 // Blocks the caller until the invocation completes (successfully or otherwise).
@@ -81,10 +92,11 @@
 IREE_API_EXPORT iree_status_t iree_vm_invocation_await(
     iree_vm_invocation_t* invocation, iree_time_t deadline);
 
-// Attempts to abort the invocation if it is in-flight.
+// Attempts to cancel the invocation if it is in-flight.
+// Cancellation is not guaranteed to work and should be considered a hint.
 // A no-op if the invocation has already completed.
-IREE_API_EXPORT iree_status_t
-iree_vm_invocation_abort(iree_vm_invocation_t* invocation);
+IREE_API_EXPORT void iree_vm_invocation_cancel(
+    iree_vm_invocation_t* invocation);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/runtime/src/iree/vm/module.h b/runtime/src/iree/vm/module.h
index 55c858e..9baef24 100644
--- a/runtime/src/iree/vm/module.h
+++ b/runtime/src/iree/vm/module.h
@@ -397,15 +397,29 @@
                                       iree_vm_signal_t signal);
 
   // Begins a function call with the given |call| arguments.
-  // Execution may yield in the case of asynchronous code and require one or
-  // more calls to the resume method to complete.
+  //
+  // Returns OK if execution completes immediately. If the call completes
+  // immediately the results will be written to |call|->results.
+  //
+  // Returns IREE_STATUS_DEFERRED if execution yielded and the call needs to be
+  // resumed. Depending on the program it may be unsafe to begin any other calls
+  // without first completing prior ones. |out_result| will contain information
+  // for when to reschedule the call.
   iree_status_t(IREE_API_PTR* begin_call)(
       void* self, iree_vm_stack_t* stack, const iree_vm_function_call_t* call,
       iree_vm_execution_result_t* out_result);
 
   // Resumes execution of a previously-yielded call.
+  //
+  // Returns OK if execution completes immediately. If the call completes
+  // immediately the results will be written to |call_results|.
+  //
+  // Returns IREE_STATUS_DEFERRED if execution yielded and the call needs to be
+  // resumed. Depending on the program it may be unsafe to begin any other calls
+  // without first completing prior ones. |out_result| will contain information
+  // for when to reschedule the call.
   iree_status_t(IREE_API_PTR* resume_call)(
-      void* self, iree_vm_stack_t* stack,
+      void* self, iree_vm_stack_t* stack, iree_byte_span_t call_results,
       iree_vm_execution_result_t* out_result);
 
   // TODO(benvanik): move this/refactor.
diff --git a/runtime/src/iree/vm/native_module.c b/runtime/src/iree/vm/native_module.c
index eff076a..8553435 100644
--- a/runtime/src/iree/vm/native_module.c
+++ b/runtime/src/iree/vm/native_module.c
@@ -336,12 +336,13 @@
   return iree_vm_stack_function_leave(stack);
 }
 
-static iree_status_t IREE_API_PTR
-iree_vm_native_module_resume_call(void* self, iree_vm_stack_t* stack,
-                                  iree_vm_execution_result_t* out_result) {
+static iree_status_t IREE_API_PTR iree_vm_native_module_resume_call(
+    void* self, iree_vm_stack_t* stack, iree_byte_span_t call_results,
+    iree_vm_execution_result_t* out_result) {
   iree_vm_native_module_t* module = (iree_vm_native_module_t*)self;
   if (module->user_interface.resume_call) {
-    return module->user_interface.resume_call(module->self, stack, out_result);
+    return module->user_interface.resume_call(module->self, stack, call_results,
+                                              out_result);
   }
   return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
                           "native module does not support resume");
diff --git a/runtime/src/iree/vm/stack.c b/runtime/src/iree/vm/stack.c
index 90efaf2..d7ef366 100644
--- a/runtime/src/iree/vm/stack.c
+++ b/runtime/src/iree/vm/stack.c
@@ -181,6 +181,11 @@
   // may transition to owning it on dynamic growth.
   bool owns_frame_storage;
 
+  // An opaque ID unique for the entire process lifetime.
+  // When tracing is enabled this points at a NUL-terminated string with
+  // process lifetime.
+  iree_vm_context_id_t context_id;
+
   // Resolves a module to a module state within a context.
   // This will be called on function entry whenever module transitions occur.
   iree_vm_state_resolver_t state_resolver;
@@ -196,8 +201,8 @@
 
 IREE_API_EXPORT iree_status_t iree_vm_stack_initialize(
     iree_byte_span_t storage, iree_vm_invocation_flags_t flags,
-    iree_vm_state_resolver_t state_resolver, iree_allocator_t allocator,
-    iree_vm_stack_t** out_stack) {
+    iree_vm_context_id_t context_id, iree_vm_state_resolver_t state_resolver,
+    iree_allocator_t allocator, iree_vm_stack_t** out_stack) {
   IREE_ASSERT_ARGUMENT(out_stack);
   *out_stack = NULL;
   if (storage.data_length < IREE_VM_STACK_MIN_SIZE) {
@@ -213,6 +218,7 @@
   memset(stack, 0, sizeof(iree_vm_stack_t));
   stack->owns_frame_storage = false;
   stack->flags = flags;
+  stack->context_id = context_id;
   stack->state_resolver = state_resolver;
   stack->allocator = allocator;
 
@@ -245,8 +251,9 @@
 }
 
 IREE_API_EXPORT iree_status_t iree_vm_stack_allocate(
-    iree_vm_invocation_flags_t flags, iree_vm_state_resolver_t state_resolver,
-    iree_allocator_t allocator, iree_vm_stack_t** out_stack) {
+    iree_vm_invocation_flags_t flags, iree_vm_context_id_t context_id,
+    iree_vm_state_resolver_t state_resolver, iree_allocator_t allocator,
+    iree_vm_stack_t** out_stack) {
   IREE_TRACE_ZONE_BEGIN(z0);
 
   *out_stack = NULL;
@@ -258,8 +265,8 @@
   iree_vm_stack_t* stack = NULL;
   if (iree_status_is_ok(status)) {
     iree_byte_span_t storage_span = iree_make_byte_span(storage, storage_size);
-    status = iree_vm_stack_initialize(storage_span, flags, state_resolver,
-                                      allocator, &stack);
+    status = iree_vm_stack_initialize(storage_span, flags, context_id,
+                                      state_resolver, allocator, &stack);
   }
 
   *out_stack = stack;
@@ -283,6 +290,11 @@
   return stack->flags;
 }
 
+IREE_API_EXPORT iree_vm_context_id_t
+iree_vm_stack_context_id(const iree_vm_stack_t* stack) {
+  return stack->context_id;
+}
+
 IREE_API_EXPORT iree_vm_stack_frame_t* iree_vm_stack_current_frame(
     iree_vm_stack_t* stack) {
   return stack->top ? &stack->top->frame : NULL;
diff --git a/runtime/src/iree/vm/stack.h b/runtime/src/iree/vm/stack.h
index abd5b75..582c02e 100644
--- a/runtime/src/iree/vm/stack.h
+++ b/runtime/src/iree/vm/stack.h
@@ -52,6 +52,8 @@
 };
 typedef uint32_t iree_vm_invocation_flags_t;
 
+typedef intptr_t iree_vm_context_id_t;
+
 typedef enum iree_vm_stack_frame_type_e {
   // Represents an `[external]` frame that needs to marshal args/results.
   // These frames have no source location and are tracked so that we know when
@@ -126,18 +128,20 @@
 //  IREE_VM_INLINE_STACK_INITIALIZE(
 //      stack,
 //      IREE_VM_INVOCATION_FLAG_NONE,
+//      iree_vm_context_id(context),
 //      iree_vm_context_state_resolver(context),
 //      iree_allocator_system());
 //  ...
 //  iree_vm_stack_deinitialize(stack);
-#define IREE_VM_INLINE_STACK_INITIALIZE(stack, flags, state_resolver, \
-                                        allocator)                    \
-  uint8_t __stack_storage[IREE_VM_STACK_DEFAULT_SIZE];                \
-  iree_byte_span_t __stack_storage_span =                             \
-      iree_make_byte_span(__stack_storage, sizeof(__stack_storage));  \
-  iree_vm_stack_t* stack = NULL;                                      \
-  IREE_IGNORE_ERROR(iree_vm_stack_initialize(                         \
-      __stack_storage_span, (flags), (state_resolver), (allocator), &stack));
+#define IREE_VM_INLINE_STACK_INITIALIZE(stack, flags, context_id,            \
+                                        state_resolver, allocator)           \
+  uint8_t __stack_storage[IREE_VM_STACK_DEFAULT_SIZE];                       \
+  iree_byte_span_t __stack_storage_span =                                    \
+      iree_make_byte_span(__stack_storage, sizeof(__stack_storage));         \
+  iree_vm_stack_t* stack = NULL;                                             \
+  IREE_IGNORE_ERROR(iree_vm_stack_initialize(__stack_storage_span, (flags),  \
+                                             (context_id), (state_resolver), \
+                                             (allocator), &stack));
 
 // Initializes a statically-allocated stack in |storage|.
 // The contents of the |storage| can be anything upon initialization and the
@@ -160,8 +164,8 @@
 //  // stack_storage can now be reused/freed/etc
 IREE_API_EXPORT iree_status_t iree_vm_stack_initialize(
     iree_byte_span_t storage, iree_vm_invocation_flags_t flags,
-    iree_vm_state_resolver_t state_resolver, iree_allocator_t allocator,
-    iree_vm_stack_t** out_stack);
+    iree_vm_context_id_t context_id, iree_vm_state_resolver_t state_resolver,
+    iree_allocator_t allocator, iree_vm_stack_t** out_stack);
 
 // Deinitializes a statically-allocated |stack| previously initialized with
 // iree_vm_stack_initialize.
@@ -182,8 +186,9 @@
 //  ...
 //  iree_vm_stack_free(stack);
 IREE_API_EXPORT iree_status_t iree_vm_stack_allocate(
-    iree_vm_invocation_flags_t flags, iree_vm_state_resolver_t state_resolver,
-    iree_allocator_t allocator, iree_vm_stack_t** out_stack);
+    iree_vm_invocation_flags_t flags, iree_vm_context_id_t context_id,
+    iree_vm_state_resolver_t state_resolver, iree_allocator_t allocator,
+    iree_vm_stack_t** out_stack);
 
 // Frees a dynamically-allocated |stack| from iree_vm_stack_allocate.
 IREE_API_EXPORT void iree_vm_stack_free(iree_vm_stack_t* stack);
@@ -192,6 +197,10 @@
 IREE_API_EXPORT iree_vm_invocation_flags_t
 iree_vm_stack_invocation_flags(const iree_vm_stack_t* stack);
 
+// Returns the process-unique context ID.
+IREE_API_EXPORT iree_vm_context_id_t
+iree_vm_stack_context_id(const iree_vm_stack_t* stack);
+
 // Returns the current stack frame or nullptr if the stack is empty.
 IREE_API_EXPORT iree_vm_stack_frame_t* iree_vm_stack_current_frame(
     iree_vm_stack_t* stack);
diff --git a/runtime/src/iree/vm/stack_test.cc b/runtime/src/iree/vm/stack_test.cc
index 80303df..acf557d 100644
--- a/runtime/src/iree/vm/stack_test.cc
+++ b/runtime/src/iree/vm/stack_test.cc
@@ -37,7 +37,7 @@
 // Tests simple stack usage, mainly just for demonstration.
 TEST(VMStackTest, Usage) {
   iree_vm_state_resolver_t state_resolver = {nullptr, SentinelStateResolver};
-  IREE_VM_INLINE_STACK_INITIALIZE(stack, IREE_VM_INVOCATION_FLAG_NONE,
+  IREE_VM_INLINE_STACK_INITIALIZE(stack, IREE_VM_INVOCATION_FLAG_NONE, 0,
                                   state_resolver, iree_allocator_system());
 
   EXPECT_EQ(nullptr, iree_vm_stack_current_frame(stack));
@@ -74,7 +74,7 @@
 // Tests stack cleanup with unpopped frames (like during failure teardown).
 TEST(VMStackTest, DeinitWithRemainingFrames) {
   iree_vm_state_resolver_t state_resolver = {nullptr, SentinelStateResolver};
-  IREE_VM_INLINE_STACK_INITIALIZE(stack, IREE_VM_INVOCATION_FLAG_NONE,
+  IREE_VM_INLINE_STACK_INITIALIZE(stack, IREE_VM_INVOCATION_FLAG_NONE, 0,
                                   state_resolver, iree_allocator_system());
 
   iree_vm_function_t function_a = {MODULE_A_SENTINEL,
@@ -93,7 +93,7 @@
 // Tests stack overflow detection.
 TEST(VMStackTest, StackOverflow) {
   iree_vm_state_resolver_t state_resolver = {nullptr, SentinelStateResolver};
-  IREE_VM_INLINE_STACK_INITIALIZE(stack, IREE_VM_INVOCATION_FLAG_NONE,
+  IREE_VM_INLINE_STACK_INITIALIZE(stack, IREE_VM_INVOCATION_FLAG_NONE, 0,
                                   state_resolver, iree_allocator_system());
 
   EXPECT_EQ(nullptr, iree_vm_stack_current_frame(stack));
@@ -123,7 +123,7 @@
 // Tests unbalanced stack popping.
 TEST(VMStackTest, UnbalancedPop) {
   iree_vm_state_resolver_t state_resolver = {nullptr, SentinelStateResolver};
-  IREE_VM_INLINE_STACK_INITIALIZE(stack, IREE_VM_INVOCATION_FLAG_NONE,
+  IREE_VM_INLINE_STACK_INITIALIZE(stack, IREE_VM_INVOCATION_FLAG_NONE, 0,
                                   state_resolver, iree_allocator_system());
 
   iree_status_t status = iree_vm_stack_function_leave(stack);
@@ -136,7 +136,7 @@
 // Tests module state reuse and querying.
 TEST(VMStackTest, ModuleStateQueries) {
   iree_vm_state_resolver_t state_resolver = {nullptr, SentinelStateResolver};
-  IREE_VM_INLINE_STACK_INITIALIZE(stack, IREE_VM_INVOCATION_FLAG_NONE,
+  IREE_VM_INLINE_STACK_INITIALIZE(stack, IREE_VM_INVOCATION_FLAG_NONE, 0,
                                   state_resolver, iree_allocator_system());
 
   EXPECT_EQ(nullptr, iree_vm_stack_current_frame(stack));
@@ -185,7 +185,7 @@
         // NOTE: always failing.
         return iree_make_status(IREE_STATUS_INTERNAL);
       }};
-  IREE_VM_INLINE_STACK_INITIALIZE(stack, IREE_VM_INVOCATION_FLAG_NONE,
+  IREE_VM_INLINE_STACK_INITIALIZE(stack, IREE_VM_INVOCATION_FLAG_NONE, 0,
                                   state_resolver, iree_allocator_system());
 
   // Push should fail if we can't query state, status should propagate.
diff --git a/runtime/src/iree/vm/test/BUILD b/runtime/src/iree/vm/test/BUILD
index 44c0154..84617ad 100644
--- a/runtime/src/iree/vm/test/BUILD
+++ b/runtime/src/iree/vm/test/BUILD
@@ -271,3 +271,22 @@
         "--compile-mode=vm",
     ],
 )
+
+c_embed_data(
+    name = "async_bytecode_modules_c",
+    srcs = [
+        ":async_ops.vmfb",
+    ],
+    c_file_output = "async_bytecode_modules.c",
+    flatten = True,
+    h_file_output = "async_bytecode_modules.h",
+)
+
+iree_bytecode_module(
+    name = "async_ops",
+    src = "async_ops.mlir",
+    compile_tool = "//tools:iree-compile",
+    flags = [
+        "--compile-mode=vm",
+    ],
+)
diff --git a/runtime/src/iree/vm/test/CMakeLists.txt b/runtime/src/iree/vm/test/CMakeLists.txt
index 1c15017..1f99350 100644
--- a/runtime/src/iree/vm/test/CMakeLists.txt
+++ b/runtime/src/iree/vm/test/CMakeLists.txt
@@ -338,4 +338,29 @@
   PUBLIC
 )
 
+iree_c_embed_data(
+  NAME
+    async_bytecode_modules_c
+  GENERATED_SRCS
+    "async_ops.vmfb"
+  C_FILE_OUTPUT
+    "async_bytecode_modules.c"
+  H_FILE_OUTPUT
+    "async_bytecode_modules.h"
+  FLATTEN
+  PUBLIC
+)
+
+iree_bytecode_module(
+  NAME
+    async_ops
+  SRC
+    "async_ops.mlir"
+  COMPILE_TOOL
+    iree-compile
+  FLAGS
+    "--compile-mode=vm"
+  PUBLIC
+)
+
 ### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/runtime/src/iree/vm/test/async_ops.mlir b/runtime/src/iree/vm/test/async_ops.mlir
new file mode 100644
index 0000000..2b9fb86
--- /dev/null
+++ b/runtime/src/iree/vm/test/async_ops.mlir
@@ -0,0 +1,53 @@
+// Tested by iree/vm/bytecode_dispatch_async_test.cc.
+//
+// NOTE: we don't want to rely on vm.check.* and the main runner here for
+// testing as it makes it hard to test failure cases; a test that doesn't run
+// because we don't resume from the caller would look like a success. The test
+// runner has the other half of this code with the expectations.
+
+vm.module @async_ops {
+
+  //===--------------------------------------------------------------------===//
+  // vm.yield
+  //===--------------------------------------------------------------------===//
+
+  // Tests a simple straight-line yield sequence that requires 3 resumes.
+  //
+  // Expects a result of %arg0 + 3.
+  vm.export @yield_sequence
+  vm.func @yield_sequence(%arg0: i32) -> i32 {
+    %c1 = vm.const.i32 1
+    %y0 = vm.add.i32 %arg0, %c1 : i32
+    %y0_dno = util.do_not_optimize(%y0) : i32
+    vm.yield ^bb1
+  ^bb1:
+    %y1 = vm.add.i32 %y0_dno, %c1 : i32
+    %y1_dno = util.do_not_optimize(%y1) : i32
+    vm.yield ^bb2
+  ^bb2:
+    %y2 = vm.add.i32 %y1_dno, %c1 : i32
+    %y2_dno = util.do_not_optimize(%y2) : i32
+    vm.yield ^bb3
+  ^bb3:
+    vm.return %y2_dno : i32
+  }
+
+  // Tests a yield with data-dependent control, ensuring that we run the
+  // alternating branches and pass along branch args on resume.
+  //
+  // Expects a result of %arg0 ? %arg1 : %arg2.
+  vm.export @yield_divergent
+  vm.func @yield_divergent(%arg0: i32, %arg1: i32, %arg2: i32) -> i32 {
+    %cond = vm.cmp.nz.i32 %arg0 : i32
+    vm.cond_br %cond, ^true, ^false
+  ^true:
+    %arg1_dno = util.do_not_optimize(%arg1) : i32
+    vm.yield ^bb3(%arg1_dno : i32)
+  ^false:
+    %arg2_dno = util.do_not_optimize(%arg2) : i32
+    vm.yield ^bb3(%arg2_dno: i32)
+  ^bb3(%result : i32):
+    vm.return %result : i32
+  }
+
+}
diff --git a/tools/iree-benchmark-module-main.cc b/tools/iree-benchmark-module-main.cc
index a20020b..784d9a3 100644
--- a/tools/iree-benchmark-module-main.cc
+++ b/tools/iree-benchmark-module-main.cc
@@ -339,8 +339,8 @@
     IREE_TRACE_SCOPE0("IREEBenchmark::RegisterSpecificFunction");
 
     iree_vm_function_t function;
-    IREE_RETURN_IF_ERROR(input_module_->lookup_function(
-        input_module_->self, IREE_VM_FUNCTION_LINKAGE_EXPORT,
+    IREE_RETURN_IF_ERROR(iree_vm_module_lookup_function_by_name(
+        input_module_, IREE_VM_FUNCTION_LINKAGE_EXPORT,
         iree_string_view_t{function_name.data(), function_name.size()},
         &function));