Implementing basic `--iree-execution-model=async-external` support.
Currently only coarse fences are supported: when the flag is specified,
exported functions take a wait and signal fence pair. Upon return to the
caller, execution is not assumed to have completed and the caller can
either wait on the signal fence or chain further invocations with it.
Future invocation models will support specifying an arbitrary set of
fences that allow for up to per-I/O granularity (attention layers could
signal sooner than full decoders, etc) and specifying fences for in-place
buffers (wait until buffer is available to write in before filling).

This initial version is conservative and may include additional queue
barriers in order to signal the user-provided fence, but future
improvements to timepoint elision and IPO will make that better. Nearly
all models we work with today end up becoming async with the current
heuristics.

iree-run-module/mlir has been updated to support programs compiled with
the async-external mode. iree-benchmark-module now supports pipelined
and concurrent execution via the --batch_size= and --batch_concurrency=
flags: batch_size defines how many invocations there are and
batch_concurrency defines how many of those are able to run concurrently.
Examples:
--batch_size=1 --batch_concurrency=1: default single-shot invocation
--batch_size=4 --batch_concurrency=1: 4 sequential invocations
--batch_size=4 --batch_concurrency=4: 4 concurrent invocations
--batch_size=4 --batch_concurrency=2: 2 concurrent sequences of 2 invocations
diff --git a/compiler/src/iree/compiler/Bindings/Native/Transforms/Passes.cpp b/compiler/src/iree/compiler/Bindings/Native/Transforms/Passes.cpp
index 65dd8e4..91f917b 100644
--- a/compiler/src/iree/compiler/Bindings/Native/Transforms/Passes.cpp
+++ b/compiler/src/iree/compiler/Bindings/Native/Transforms/Passes.cpp
@@ -18,9 +18,11 @@
 namespace IREE {
 namespace ABI {
 
-void buildTransformPassPipeline(OpPassManager &passManager) {
+void buildTransformPassPipeline(OpPassManager &passManager,
+                                const InvocationOptions &invocationOptions) {
   // Wraps the entry points in an export function.
-  passManager.addPass(createWrapEntryPointsPass());
+  passManager.addPass(
+      createWrapEntryPointsPass(invocationOptions.invocationModel));
 
   // Cleanup the IR after manipulating it.
   passManager.addPass(createInlinerPass());
@@ -30,11 +32,12 @@
 }
 
 void registerTransformPassPipeline() {
-  PassPipelineRegistration<> transformPassPipeline(
+  PassPipelineRegistration<InvocationOptions> transformPassPipeline(
       "iree-abi-transformation-pipeline",
       "Runs the IREE native ABI bindings support pipeline",
-      [](OpPassManager &passManager) {
-        buildTransformPassPipeline(passManager);
+      [](OpPassManager &passManager,
+         const InvocationOptions &invocationOptions) {
+        buildTransformPassPipeline(passManager, invocationOptions);
       });
 }
 
diff --git a/compiler/src/iree/compiler/Bindings/Native/Transforms/Passes.h b/compiler/src/iree/compiler/Bindings/Native/Transforms/Passes.h
index 3bfedd3..a50ff44 100644
--- a/compiler/src/iree/compiler/Bindings/Native/Transforms/Passes.h
+++ b/compiler/src/iree/compiler/Bindings/Native/Transforms/Passes.h
@@ -18,12 +18,36 @@
 namespace ABI {
 
 //===----------------------------------------------------------------------===//
-// Helpers
+// Pipelines
 //===----------------------------------------------------------------------===//
 
+// Specifies the execution model used for invocations.
+enum class InvocationModel {
+  // Fully synchronous behavior with no fences.
+  Sync,
+  // Exposes one wait fence for all inputs and one signal fence for all outputs.
+  CoarseFences,
+};
+
+struct InvocationOptions : public PassPipelineOptions<InvocationOptions> {
+  Option<InvocationModel> invocationModel{
+      *this,
+      "invocation-model",
+      llvm::cl::desc("Specifies the execution model used for invocations."),
+      llvm::cl::init(IREE::ABI::InvocationModel::Sync),
+      llvm::cl::values(
+          clEnumValN(IREE::ABI::InvocationModel::Sync, "sync",
+                     "Fully synchronous behavior with no fences."),
+          clEnumValN(IREE::ABI::InvocationModel::CoarseFences, "coarse-fences",
+                     "Exposes one wait fence for all inputs and one signal "
+                     "fence for all outputs.")),
+  };
+};
+
 // Adds a set of passes to the given pass manager that setup a module for use
 // with bindings following the native IREE ABI.
-void buildTransformPassPipeline(OpPassManager &passManager);
+void buildTransformPassPipeline(OpPassManager &passManager,
+                                const InvocationOptions &invocationOptions);
 
 void registerTransformPassPipeline();
 
@@ -33,7 +57,8 @@
 
 // Wraps all entry points in a function that is compatible with the
 // expected invocation semantics of bindings following the native IREE ABI.
-std::unique_ptr<OperationPass<ModuleOp>> createWrapEntryPointsPass();
+std::unique_ptr<OperationPass<ModuleOp>> createWrapEntryPointsPass(
+    InvocationModel invocationModel = InvocationModel::Sync);
 
 //===----------------------------------------------------------------------===//
 // Register all Passes
diff --git a/compiler/src/iree/compiler/Bindings/Native/Transforms/WrapEntryPoints.cpp b/compiler/src/iree/compiler/Bindings/Native/Transforms/WrapEntryPoints.cpp
index a628a52..ac63750 100644
--- a/compiler/src/iree/compiler/Bindings/Native/Transforms/WrapEntryPoints.cpp
+++ b/compiler/src/iree/compiler/Bindings/Native/Transforms/WrapEntryPoints.cpp
@@ -4,6 +4,7 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+#include "iree/compiler/Bindings/Native/Transforms/Passes.h"
 #include "iree/compiler/Dialect/HAL/IR/HALDialect.h"
 #include "iree/compiler/Dialect/HAL/IR/HALOps.h"
 #include "llvm/ADT/STLExtras.h"
@@ -23,21 +24,6 @@
 namespace IREE {
 namespace ABI {
 
-// Populates attributes on |wrapperOp| to support runtime reflection.
-static void populateReflectionAttrs(func::FuncOp exportOp,
-                                    func::FuncOp wrapperOp) {
-  SmallVector<NamedAttribute, 4> attrs;
-  auto abiAttr = exportOp->getAttr("iree.abi");
-  if (abiAttr) {
-    attrs.emplace_back(StringAttr::get(exportOp.getContext(), "iree.abi"),
-                       abiAttr);
-  }
-  if (!attrs.empty()) {
-    auto reflectionAttr = DictionaryAttr::get(exportOp.getContext(), attrs);
-    wrapperOp->setAttr("iree.reflection", reflectionAttr);
-  }
-}
-
 // Maps a source type to the native ABI type.
 static Type mapToABIType(Type type) {
   if (type.isa<TensorType>()) {
@@ -108,7 +94,8 @@
 // Updates |importOp| to use the native ABI and creates a wrapper function that
 // preserves the original behavior. All callers will be updated to point at the
 // new wrapper function.
-static LogicalResult wrapImportFunc(mlir::ModuleOp moduleOp,
+static LogicalResult wrapImportFunc(IREE::ABI::InvocationModel invocationModel,
+                                    mlir::ModuleOp moduleOp,
                                     func::FuncOp importOp,
                                     SymbolTable &symbolTable) {
   // Replace all existing calls to the import to instead call the wrapper.
@@ -154,9 +141,46 @@
   return success();
 }
 
+// Populates the iree.reflection dictionary on |wrapperOp| from |exportOp|'s
+// iree.abi attribute and |invocationModel|. The attrs can be queried at
+// runtime with iree_vm_function_lookup_attr_by_name.
+static void populateReflectionAttrs(IREE::ABI::InvocationModel invocationModel,
+                                    func::FuncOp exportOp,
+                                    func::FuncOp wrapperOp) {
+  auto *context = exportOp.getContext();
+  SmallVector<NamedAttribute, 4> attrs;
+  // Forward the iree.abi attribute of the original function when present.
+  if (auto abiAttr = exportOp->getAttr("iree.abi")) {
+    attrs.emplace_back(StringAttr::get(context, "iree.abi"), abiAttr);
+  }
+  // Record the invocation model; Sync and unrecognized values add nothing.
+  switch (invocationModel) {
+    default:
+    case IREE::ABI::InvocationModel::Sync:
+      break;
+    case IREE::ABI::InvocationModel::CoarseFences:
+      attrs.emplace_back(StringAttr::get(context, "iree.abi.model"),
+                         StringAttr::get(context, "coarse-fences"));
+      break;
+  }
+  // Leave iree.reflection unset when there is nothing to report.
+  if (!attrs.empty()) {
+    auto reflectionAttr = DictionaryAttr::get(context, attrs);
+    wrapperOp->setAttr("iree.reflection", reflectionAttr);
+  }
+}
+
 // Creates the corresponding wrapper function for the given export function.
-static func::FuncOp createExportWrapperFunc(func::FuncOp exportOp,
-                                            StringRef publicName) {
+static func::FuncOp createExportWrapperFunc(
+    IREE::ABI::InvocationModel invocationModel, func::FuncOp exportOp,
+    StringRef publicName) {
+  // Copy arg/result attrs from the export op to the wrapper function.
+  // We may want to remove them from the export but would need to filter.
+  SmallVector<DictionaryAttr, 4> argAttrDict;
+  exportOp.getAllArgAttrs(argAttrDict);
+  SmallVector<DictionaryAttr, 4> resultAttrDict;
+  exportOp.getAllResultAttrs(resultAttrDict);
+
   // Convert argument types to those required by the binding ABI.
   //
   // NOTE: this is where we could change our signature to provide additional
@@ -164,9 +188,21 @@
   // async behavior or cancellation.
   auto oldExportType = exportOp.getFunctionType();
   SmallVector<Type> inputTypes;
+  auto fenceType = IREE::HAL::FenceType::get(exportOp.getContext());
   for (auto oldType : oldExportType.getInputs()) {
     inputTypes.push_back(mapToABIType(oldType));
   }
+  switch (invocationModel) {
+    default:
+    case IREE::ABI::InvocationModel::Sync:
+      break;
+    case IREE::ABI::InvocationModel::CoarseFences:
+      inputTypes.push_back(fenceType);  // wait
+      inputTypes.push_back(fenceType);  // signal
+      argAttrDict.push_back(nullptr);   // wait
+      argAttrDict.push_back(nullptr);   // signal
+      break;
+  }
   SmallVector<Type> resultTypes;
   for (auto oldType : oldExportType.getResults()) {
     resultTypes.push_back(mapToABIType(oldType));
@@ -180,18 +216,11 @@
       func::FuncOp::create(exportOp.getLoc(), publicName, newExportType);
   wrapperOp.setPublic();
   wrapperOp->setAttr("iree.abi.stub", UnitAttr::get(exportOp.getContext()));
-
-  // Copy arg/result attrs from the import op to the wrapper function.
-  // We may want to remove them from the import but would need to filter.
-  SmallVector<DictionaryAttr, 4> argAttrDict;
-  exportOp.getAllArgAttrs(argAttrDict);
   wrapperOp.setAllArgAttrs(argAttrDict);
-  SmallVector<DictionaryAttr, 4> resultAttrDict;
-  exportOp.getAllResultAttrs(resultAttrDict);
   wrapperOp.setAllResultAttrs(resultAttrDict);
 
   // Populate the reflection attrs based on the original types.
-  populateReflectionAttrs(exportOp, wrapperOp);
+  populateReflectionAttrs(invocationModel, exportOp, wrapperOp);
 
   auto *entryBlock = wrapperOp.addEntryBlock();
   auto entryBuilder = OpBuilder::atBlockBegin(entryBlock);
@@ -199,7 +228,7 @@
   // Build a map of result value to the argument that has its backing storage.
   SmallVector<Value> resultStorages;
   resultStorages.resize(resultTypes.size());
-  for (unsigned i = 0; i < inputTypes.size(); ++i) {
+  for (unsigned i = 0; i < exportOp.getNumArguments(); ++i) {
     auto outputAttr =
         exportOp.getArgAttrOfType<IntegerAttr>(i, "iree.abi.output");
     if (!outputAttr) continue;
@@ -215,14 +244,31 @@
     resultStorages[outputAttr.getInt()] = storageArg;
   }
 
+  // Build a map of each I/O argument to the fence that covers them.
+  // TODO(benvanik): actually support a map; for now we just handle the 1:M
+  // coarse mode where all inputs are covered by a single wait fence and all
+  // outputs are covered by a single signal fence.
+  Value waitFence;
+  Value signalFence;
+  switch (invocationModel) {
+    default:
+    case IREE::ABI::InvocationModel::Sync:
+      break;
+    case IREE::ABI::InvocationModel::CoarseFences:
+      waitFence = entryBlock->getArgument(entryBlock->getNumArguments() - 2);
+      signalFence = entryBlock->getArgument(entryBlock->getNumArguments() - 1);
+      break;
+  }
+
   // Marshal arguments.
   SmallVector<Value> arguments;
-  for (auto arg : llvm::enumerate(entryBlock->getArguments())) {
+  for (auto arg : llvm::enumerate(
+           entryBlock->getArguments().slice(0, oldExportType.getNumInputs()))) {
     auto oldType = oldExportType.getInput(arg.index());
     if (oldType.isa<TensorType>()) {
       auto argLoc = arg.value().getLoc();
       auto importOp = entryBuilder.create<IREE::HAL::TensorImportOp>(
-          argLoc, oldType, arg.value());
+          argLoc, oldType, arg.value(), waitFence);
       arguments.push_back(importOp.getTarget());
     } else {
       arguments.push_back(arg.value());
@@ -232,21 +278,41 @@
   // Make the call with the original types.
   auto callOp =
       entryBuilder.create<func::CallOp>(exportOp.getLoc(), exportOp, arguments);
+  auto asyncResults = llvm::to_vector(callOp.getResults());
+
+  // Insert a barrier if requested - all tensors will be calculated and the
+  // fence will be signaled. Note that even if there are no tensor results we
+  // need to signal the fence.
+  if (signalFence) {
+    SmallVector<Value> asyncTensors;
+    for (auto result : asyncResults) {
+      if (result.getType().isa<TensorType>()) asyncTensors.push_back(result);
+    }
+    if (asyncTensors.empty()) {
+      // TODO(benvanik): maybe use a global timeline? global stores may not
+      // have completed by now in cases where the user wants to loop back.
+      entryBuilder.create<IREE::HAL::FenceSignalOp>(exportOp.getLoc(),
+                                                    signalFence);
+    } else {
+      auto barrierOp = entryBuilder.create<IREE::HAL::TensorBarrierOp>(
+          exportOp.getLoc(), asyncTensors, signalFence);
+      asyncResults = llvm::to_vector(barrierOp.getResults());
+    }
+  }
 
   // Marshal results.
   SmallVector<Value> results;
-  for (auto result : llvm::enumerate(callOp.getResults())) {
-    auto oldType = oldExportType.getResult(result.index());
-    auto newType = newExportType.getResult(result.index());
+  for (auto [resultIndex, result] : llvm::enumerate(asyncResults)) {
+    auto oldType = oldExportType.getResult(resultIndex);
+    auto newType = newExportType.getResult(resultIndex);
     if (oldType.isa<TensorType>()) {
       auto dynamicDims = IREE::Util::buildDynamicDimsForValue(
-          exportOp.getLoc(), result.value(), entryBuilder);
+          result.getLoc(), result, entryBuilder);
       results.push_back(entryBuilder.create<IREE::HAL::TensorExportOp>(
-          exportOp.getLoc(), newType, result.value(),
-          TypeAttr::get(result.value().getType()), dynamicDims,
-          resultStorages[result.index()]));
+          result.getLoc(), newType, result, TypeAttr::get(result.getType()),
+          dynamicDims, resultStorages[resultIndex]));
     } else {
-      results.push_back(result.value());
+      results.push_back(result);
     }
   }
 
@@ -258,7 +324,8 @@
 // The original function will be made private and be renamed.
 // This allows us to support multiple binding schemes as transforms from other
 // bindings can also perform their own equivalent wrapping.
-static LogicalResult wrapExportFunc(mlir::ModuleOp moduleOp,
+static LogicalResult wrapExportFunc(IREE::ABI::InvocationModel invocationModel,
+                                    mlir::ModuleOp moduleOp,
                                     func::FuncOp exportOp,
                                     SymbolTable &symbolTable) {
   // Rename the original function so that our wrapper can use the original
@@ -277,7 +344,8 @@
 
   // Create the wrapper function that conforms to the IREE native ABI and
   // marshals arguments/results to the original function.
-  auto wrapperOp = createExportWrapperFunc(exportOp, publicName);
+  auto wrapperOp =
+      createExportWrapperFunc(invocationModel, exportOp, publicName);
   if (!wrapperOp) return failure();
   moduleOp.insert(Block::iterator(exportOp), wrapperOp);
 
@@ -290,6 +358,12 @@
 class WrapEntryPointsPass
     : public PassWrapper<WrapEntryPointsPass, OperationPass<ModuleOp>> {
  public:
+  WrapEntryPointsPass() = default;
+  WrapEntryPointsPass(const WrapEntryPointsPass &pass) {}
+  WrapEntryPointsPass(IREE::ABI::InvocationModel invocationModel) {
+    this->invocationModel = invocationModel;
+  }
+
   void getDependentDialects(DialectRegistry &registry) const override {
     registry.insert<func::FuncDialect, mlir::arith::ArithDialect,
                     mlir::tensor::TensorDialect, IREE::HAL::HALDialect>();
@@ -329,7 +403,8 @@
     // This will preserve the internal types (tensors/etc) but change the import
     // to taking the ABI types and rewrite calls.
     for (auto importOp : importOps) {
-      if (failed(wrapImportFunc(moduleOp, importOp, symbolTable))) {
+      if (failed(wrapImportFunc(invocationModel, moduleOp, importOp,
+                                symbolTable))) {
         return signalPassFailure();
       }
     }
@@ -338,15 +413,31 @@
     // This will change the export to taking the ABI types and preserve the
     // internal types.
     for (auto exportOp : exportOps) {
-      if (failed(wrapExportFunc(moduleOp, exportOp, symbolTable))) {
+      if (failed(wrapExportFunc(invocationModel, moduleOp, exportOp,
+                                symbolTable))) {
         return signalPassFailure();
       }
     }
   }
+
+ private:
+  Option<InvocationModel> invocationModel{
+      *this,
+      "invocation-model",
+      llvm::cl::desc("Specifies the execution model used for invocations."),
+      llvm::cl::init(IREE::ABI::InvocationModel::Sync),
+      llvm::cl::values(
+          clEnumValN(IREE::ABI::InvocationModel::Sync, "sync",
+                     "Fully synchronous behavior with no fences."),
+          clEnumValN(IREE::ABI::InvocationModel::CoarseFences, "coarse-fences",
+                     "Exposes one wait fence for all inputs and one signal "
+                     "fence for all outputs.")),
+  };
 };
 
-std::unique_ptr<OperationPass<ModuleOp>> createWrapEntryPointsPass() {
-  return std::make_unique<WrapEntryPointsPass>();
+std::unique_ptr<OperationPass<ModuleOp>> createWrapEntryPointsPass(
+    IREE::ABI::InvocationModel invocationModel) {
+  return std::make_unique<WrapEntryPointsPass>(invocationModel);
 }
 
 static PassRegistration<WrapEntryPointsPass> pass;
diff --git a/compiler/src/iree/compiler/Bindings/Native/Transforms/test/BUILD b/compiler/src/iree/compiler/Bindings/Native/Transforms/test/BUILD
index 022522e..832f393 100644
--- a/compiler/src/iree/compiler/Bindings/Native/Transforms/test/BUILD
+++ b/compiler/src/iree/compiler/Bindings/Native/Transforms/test/BUILD
@@ -17,6 +17,7 @@
     srcs = enforce_glob(
         [
             "wrap_entry_points.mlir",
+            "wrap_entry_points_coarse_fences.mlir",
         ],
         include = ["*.mlir"],
     ),
diff --git a/compiler/src/iree/compiler/Bindings/Native/Transforms/test/CMakeLists.txt b/compiler/src/iree/compiler/Bindings/Native/Transforms/test/CMakeLists.txt
index 206d72f..61f49b8 100644
--- a/compiler/src/iree/compiler/Bindings/Native/Transforms/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Bindings/Native/Transforms/test/CMakeLists.txt
@@ -15,6 +15,7 @@
     lit
   SRCS
     "wrap_entry_points.mlir"
+    "wrap_entry_points_coarse_fences.mlir"
   TOOLS
     FileCheck
     iree-opt
diff --git a/compiler/src/iree/compiler/Bindings/Native/Transforms/test/wrap_entry_points.mlir b/compiler/src/iree/compiler/Bindings/Native/Transforms/test/wrap_entry_points.mlir
index 4aa2cf9..06c496f 100644
--- a/compiler/src/iree/compiler/Bindings/Native/Transforms/test/wrap_entry_points.mlir
+++ b/compiler/src/iree/compiler/Bindings/Native/Transforms/test/wrap_entry_points.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --iree-abi-wrap-entry-points --split-input-file %s | FileCheck %s
+// RUN: iree-opt --pass-pipeline='iree-abi-wrap-entry-points{invocation-model=sync}' --split-input-file %s | FileCheck %s
 
 // CHECK-LABEL: func.func @dynamicEntry(
 //  CHECK-SAME:   %[[ARG0:.+]]: !hal.buffer_view, %[[ARG1:.+]]: !hal.buffer_view
diff --git a/compiler/src/iree/compiler/Bindings/Native/Transforms/test/wrap_entry_points_coarse_fences.mlir b/compiler/src/iree/compiler/Bindings/Native/Transforms/test/wrap_entry_points_coarse_fences.mlir
new file mode 100644
index 0000000..abe9b22
--- /dev/null
+++ b/compiler/src/iree/compiler/Bindings/Native/Transforms/test/wrap_entry_points_coarse_fences.mlir
@@ -0,0 +1,101 @@
+// RUN: iree-opt --pass-pipeline='iree-abi-wrap-entry-points{invocation-model=coarse-fences}' --split-input-file %s | FileCheck %s
+
+// CHECK-LABEL: func.func @asyncEntry(
+//  CHECK-SAME:   %[[ARG0:.+]]: !hal.buffer_view, %[[ARG1:.+]]: !hal.buffer_view, %[[WAIT:.+]]: !hal.fence, %[[SIGNAL:.+]]: !hal.fence
+//  CHECK-SAME: -> (
+//  CHECK-SAME:   !hal.buffer_view, !hal.buffer_view
+//  CHECK-SAME: ) attributes {
+//  CHECK-SAME:   iree.abi.stub
+//  CHECK-SAME:   iree.reflection = {iree.abi.model = "coarse-fences"}
+//  CHECK-SAME: } {
+//  CHECK-NEXT:   %[[ARG0_TENSOR:.+]] = hal.tensor.import wait(%[[WAIT]]) => %[[ARG0]] : !hal.buffer_view -> tensor<4xf32>
+//  CHECK-NEXT:   %[[ARG1_TENSOR:.+]] = hal.tensor.import wait(%[[WAIT]]) => %[[ARG1]] : !hal.buffer_view -> tensor<4xf32>
+//  CHECK-NEXT:   %[[RESULT_TENSORS:.+]]:2 = call @_asyncEntry(%[[ARG0_TENSOR]], %[[ARG1_TENSOR]])
+//  CHECK-NEXT:   %[[READY_TENSORS:.+]]:2 = hal.tensor.barrier join(%[[RESULT_TENSORS]]#0, %[[RESULT_TENSORS]]#1 : tensor<4xf32>, tensor<4xf32>) => %[[SIGNAL]] : !hal.fence
+//  CHECK-NEXT:   %[[RET0_VIEW:.+]] = hal.tensor.export %[[READY_TENSORS]]#0 : tensor<4xf32> -> !hal.buffer_view
+//  CHECK-NEXT:   %[[RET1_VIEW:.+]] = hal.tensor.export %[[READY_TENSORS]]#1 : tensor<4xf32> -> !hal.buffer_view
+//  CHECK-NEXT:   return %[[RET0_VIEW]], %[[RET1_VIEW]] : !hal.buffer_view, !hal.buffer_view
+//  CHECK-NEXT: }
+
+// CHECK-LABEL: func.func private @_asyncEntry(
+func.func @asyncEntry(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> (tensor<4xf32>, tensor<4xf32>) {
+  %0 = arith.addf %arg0, %arg1 : tensor<4xf32>
+  %1 = arith.addf %0, %arg0 : tensor<4xf32>
+  return %0, %1 : tensor<4xf32>, tensor<4xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func.func @bareFunc
+//  CHECK-SAME: (%[[WAIT:.+]]: !hal.fence, %[[SIGNAL:.+]]: !hal.fence)
+//       CHECK:   call @_bareFunc()
+//  CHECK-NEXT:   hal.fence.signal<%[[SIGNAL]] : !hal.fence>
+//  CHECK-NEXT:   return
+
+// CHECK-LABEL: func.func private @_bareFunc(
+func.func @bareFunc() {
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @primitiveArgOnly
+//  CHECK-SAME: (%[[ARG0:.+]]: i32, %[[WAIT:.+]]: !hal.fence, %[[SIGNAL:.+]]: !hal.fence)
+//  CHECK-NEXT:   call @_primitiveArgOnly(%[[ARG0]])
+//  CHECK-NEXT:   hal.fence.signal<%[[SIGNAL]] : !hal.fence>
+//  CHECK-NEXT:   return
+
+// CHECK-LABEL: func.func private @_primitiveArgOnly(
+func.func @primitiveArgOnly(%arg0: i32) {
+  %0 = arith.addi %arg0, %arg0 : i32
+  util.do_not_optimize(%0) : i32
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @tensorArgOnly
+//  CHECK-SAME: (%[[ARG0:.+]]: !hal.buffer_view, %[[WAIT:.+]]: !hal.fence, %[[SIGNAL:.+]]: !hal.fence)
+//       CHECK:   %[[ARG0_TENSOR:.+]] = hal.tensor.import wait(%[[WAIT]]) => %[[ARG0]] : !hal.buffer_view -> tensor<4xf32>
+//  CHECK-NEXT:   call @_tensorArgOnly(%[[ARG0_TENSOR]])
+//  CHECK-NEXT:   hal.fence.signal<%[[SIGNAL]] : !hal.fence>
+//  CHECK-NEXT:   return
+
+// CHECK-LABEL: func.func private @_tensorArgOnly(
+func.func @tensorArgOnly(%arg0: tensor<4xf32>) {
+  %0 = arith.addf %arg0, %arg0 : tensor<4xf32>
+  util.do_not_optimize(%0) : tensor<4xf32>
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @primitiveResultOnly
+//  CHECK-SAME: (%[[WAIT:.+]]: !hal.fence, %[[SIGNAL:.+]]: !hal.fence) -> i32
+//  CHECK-NEXT:   %[[RESULT:.+]] = call @_primitiveResultOnly()
+//  CHECK-NEXT:   hal.fence.signal<%[[SIGNAL]] : !hal.fence>
+//  CHECK-NEXT:   return %[[RESULT]]
+
+// CHECK-LABEL: func.func private @_primitiveResultOnly(
+func.func @primitiveResultOnly() -> i32 {
+  %0 = arith.constant 8 : i32
+  %1 = util.do_not_optimize(%0) : i32
+  return %1 : i32
+}
+
+// -----
+
+// CHECK-LABEL: func.func @tensorResultOnly
+//  CHECK-SAME: (%[[WAIT:.+]]: !hal.fence, %[[SIGNAL:.+]]: !hal.fence) -> !hal.buffer_view
+//  CHECK-NEXT:   %[[RESULT_TENSOR:.+]] = call @_tensorResultOnly()
+//  CHECK-NEXT:   %[[READY_TENSOR:.+]] = hal.tensor.barrier join(%[[RESULT_TENSOR]] : tensor<4xf32>) => %[[SIGNAL]] : !hal.fence
+//  CHECK-NEXT:   %[[RESULT_VIEW:.+]] = hal.tensor.export %[[READY_TENSOR]]
+//  CHECK-NEXT:   return %[[RESULT_VIEW]]
+
+// CHECK-LABEL: func.func private @_tensorResultOnly(
+func.func @tensorResultOnly() -> tensor<4xf32> {
+  %0 = arith.constant dense<[0.0, 1.0, 2.0, 3.0]> : tensor<4xf32>
+  %1 = util.do_not_optimize(%0) : tensor<4xf32>
+  return %1 : tensor<4xf32>
+}
+
diff --git a/compiler/src/iree/compiler/Bindings/TFLite/Transforms/WrapEntryPoints.cpp b/compiler/src/iree/compiler/Bindings/TFLite/Transforms/WrapEntryPoints.cpp
index f797cca..61f5bd1 100644
--- a/compiler/src/iree/compiler/Bindings/TFLite/Transforms/WrapEntryPoints.cpp
+++ b/compiler/src/iree/compiler/Bindings/TFLite/Transforms/WrapEntryPoints.cpp
@@ -228,7 +228,7 @@
       auto dynamicDims = inputDynamicDims.loadDynamicDims(recalculateBuilder);
       auto castOp = recalculateBuilder.create<IREE::HAL::TensorImportOp>(
           loc, inputValue.getType(), inputPlaceholder, inputValue.getType(),
-          dynamicDims);
+          dynamicDims, /*wait_fence=*/Value{});
       inputValue.replaceAllUsesWith(castOp.getTarget());
     }
     while (entryBlock.getNumArguments() > 0) {
@@ -525,7 +525,8 @@
       }
       callOperands.push_back(entryBuilder.create<IREE::HAL::TensorImportOp>(
           arg.getLoc(), inputDynamicDims.tensorType, arg,
-          TypeAttr::get(inputDynamicDims.tensorType), dynamicDims));
+          TypeAttr::get(inputDynamicDims.tensorType), dynamicDims,
+          /*wait_fence=*/Value{}));
     }
     auto callOp = entryBuilder.create<mlir::func::CallOp>(
         entryFuncOp.getLoc(), entryFuncOp, callOperands);
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/ConvertStreamToHAL.cpp b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/ConvertStreamToHAL.cpp
index e622eae..45c3d92 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/ConvertStreamToHAL.cpp
+++ b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/ConvertStreamToHAL.cpp
@@ -16,6 +16,7 @@
 #include "iree/compiler/Dialect/Util/IR/UtilOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/Dominance.h"
 #include "mlir/Transforms/DialectConversion.h"
 
 namespace mlir {
@@ -44,6 +45,62 @@
       loc, builder.getType<IREE::HAL::FenceType>());
 }
 
+// Finds a !hal.fence bound to |timepoint| via a chain op and returns it if it
+// is usable at the rewriter insertion point, or nullptr otherwise. The chain
+// op must be the only user of the timepoint and is erased on success.
+static Value consumeBoundFence(Value timepoint,
+                               ConversionPatternRewriter &rewriter) {
+  // Must only have one use. We can't consume a fence multiple times.
+  if (!timepoint.hasOneUse()) return nullptr;  // 0 or >1 uses
+
+  // The use must be an export to a fence.
+  auto chainOp = dyn_cast<IREE::Stream::TimepointChainExternalOp>(
+      *timepoint.getUsers().begin());
+  if (!chainOp) return nullptr;  // non-export use
+  assert(!chainOp.getExternalValues().empty());
+  auto fence = chainOp.getExternalValues().front();
+  if (!fence || !fence.getType().isa<IREE::HAL::FenceType>()) return nullptr;
+
+  // Try really hard to figure out if the fence can be used. A larger analysis
+  // pass running prior to conversion that did some code motion could help
+  // ensure the fence SSA value is usable in the places it is needed - for now
+  // we just do this local check that satisfies most common programs today. IPO
+  // would do something like add the fence as an argument to function calls so
+  // that the functions could consume it but inlining is pretty aggressive now.
+  if (!IREE::Util::isValueUsableForOp(fence, rewriter.getBlock(),
+                                      rewriter.getInsertionPoint())) {
+    return nullptr;  // unusable
+  }
+
+  // Consume the op by erasing it.
+  rewriter.eraseOp(chainOp);
+
+  return fence;  // usable
+}
+
+// Returns a new fence for |timepoint|, or the existing external fence bound
+// to it via a chain op when one can be consumed. Returns a util.null fence
+// if the timepoint has no uses.
+static Value getOrCreateSignalFence(Location loc, Value device, Value timepoint,
+                                    ConversionPatternRewriter &rewriter) {
+  // Check to see if anyone is consuming the timepoint - if not then we don't
+  // need to create a fence.
+  if (timepoint.use_empty()) {
+    return rewriter.create<IREE::Util::NullOp>(
+        loc, rewriter.getType<IREE::HAL::FenceType>());
+  }
+
+  // Check to see if the timepoint is associated with a fence. Along ABI
+  // boundaries we can usually find an association.
+  auto fence = consumeBoundFence(timepoint, rewriter);
+  if (fence) return fence;
+
+  // Create a new fence.
+  return rewriter.create<IREE::HAL::FenceCreateOp>(
+      loc, rewriter.getType<IREE::HAL::FenceType>(), device,
+      IREE::HAL::FenceFlagBitfield::None);
+}
+
 // Scans all of the stream.cmd.* ops in the region to derive a command category.
 static IREE::HAL::CommandCategoryBitfield deriveCommandCategories(
     Region &region) {
@@ -255,9 +312,8 @@
     // Gather wait/signal fence, which are optional.
     Value waitFence =
         getOrCreateWaitFence(loc, adaptor.getAwaitTimepoint(), rewriter);
-    Value signalFence = rewriter.create<IREE::HAL::FenceCreateOp>(
-        loc, rewriter.getType<IREE::HAL::FenceType>(), device,
-        IREE::HAL::FenceFlagBitfield::None);
+    Value signalFence = getOrCreateSignalFence(
+        loc, device, allocaOp.getResultTimepoint(), rewriter);
 
     // Queue allocation.
     auto queueAffinity = rewriter.create<arith::ConstantIntOp>(loc, -1, 64);
@@ -283,9 +339,8 @@
     // Gather wait/signal fence, which are optional.
     Value waitFence =
         getOrCreateWaitFence(loc, adaptor.getAwaitTimepoint(), rewriter);
-    Value signalFence = rewriter.create<IREE::HAL::FenceCreateOp>(
-        loc, rewriter.getType<IREE::HAL::FenceType>(), device,
-        IREE::HAL::FenceFlagBitfield::None);
+    Value signalFence = getOrCreateSignalFence(
+        loc, device, deallocaOp.getResultTimepoint(), rewriter);
 
     // Queue allocation.
     auto queueAffinity = rewriter.create<arith::ConstantIntOp>(loc, -1, 64);
@@ -859,9 +914,8 @@
     // Gather wait/signal fence, which are optional.
     Value waitFence =
         getOrCreateWaitFence(loc, adaptor.getAwaitTimepoint(), rewriter);
-    Value signalFence = rewriter.create<IREE::HAL::FenceCreateOp>(
-        loc, rewriter.getType<IREE::HAL::FenceType>(), device,
-        IREE::HAL::FenceFlagBitfield::None);
+    Value signalFence = getOrCreateSignalFence(
+        loc, device, executeOp.getResultTimepoint(), rewriter);
 
     // Queue execution.
     auto queueAffinity = rewriter.create<arith::ConstantIntOp>(loc, -1, 64);
@@ -957,6 +1011,30 @@
   }
 };
 
+struct TimepointChainExternalOpPattern
+    : public StreamConversionPattern<IREE::Stream::TimepointChainExternalOp> {
+  using StreamConversionPattern::StreamConversionPattern;
+  LogicalResult matchAndRewrite(
+      IREE::Stream::TimepointChainExternalOp exportOp, OpAdaptor adaptor,
+      ConversionPatternRewriter &rewriter) const override {
+    // Chaining is only lowered when the sole target is a !hal.fence.
+    auto targets = exportOp.getExternalValues();
+    if (targets.size() != 1 ||
+        !targets.front().getType().isa<IREE::HAL::FenceType>()) {
+      return rewriter.notifyMatchFailure(
+          exportOp, "only exports to HAL fences are supported");
+    }
+    Value signalFence = targets.front();
+    auto device = lookupDeviceFor(exportOp, rewriter);
+    auto queueAffinity =
+        rewriter.create<arith::ConstantIntOp>(exportOp.getLoc(), -1, 64);
+    rewriter.replaceOpWithNewOp<IREE::HAL::DeviceQueueExecuteOp>(
+        exportOp, device, queueAffinity, adaptor.getAwaitTimepoint(),
+        signalFence, /*command_buffers=*/ValueRange{});
+    return success();
+  }
+};
+
 struct TimepointJoinOpPattern
     : public StreamConversionPattern<IREE::Stream::TimepointJoinOp> {
   using StreamConversionPattern::StreamConversionPattern;
@@ -970,6 +1048,23 @@
   }
 };
 
+struct TimepointBarrierOpPattern
+    : public StreamConversionPattern<IREE::Stream::TimepointBarrierOp> {
+  using StreamConversionPattern::StreamConversionPattern;
+  LogicalResult matchAndRewrite(
+      IREE::Stream::TimepointBarrierOp barrierOp, OpAdaptor adaptor,
+      ConversionPatternRewriter &rewriter) const override {
+    // Passes the resource through with a null fence as its timepoint.
+    // NOTE: assumes a barrier still present here guards an available resource;
+    // if not, propagation should have swapped in the producing timepoint.
+    auto fenceType = rewriter.getType<IREE::HAL::FenceType>();
+    Value nullFence =
+        rewriter.create<IREE::Util::NullOp>(barrierOp.getLoc(), fenceType);
+    rewriter.replaceOp(barrierOp, {adaptor.getResource(), nullFence});
+    return success();
+  }
+};
+
 struct TimepointAwaitOpPattern
     : public StreamConversionPattern<IREE::Stream::TimepointAwaitOp> {
   using StreamConversionPattern::StreamConversionPattern;
@@ -1057,7 +1152,8 @@
               CmdExecuteOpPattern, CmdSerialOpPattern, CmdConcurrentOpPattern>(
           mapping, typeConverter, context);
   patterns.insert<TimepointImmediateOpPattern, TimepointImportOpPattern,
-                  TimepointExportOpPattern, TimepointJoinOpPattern,
+                  TimepointExportOpPattern, TimepointChainExternalOpPattern,
+                  TimepointJoinOpPattern, TimepointBarrierOpPattern,
                   TimepointAwaitOpPattern>(mapping, typeConverter, context);
   patterns.insert<ElideYieldOpPattern>(mapping, typeConverter, context);
 }
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/timepoint_ops.mlir b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/timepoint_ops.mlir
index d5683f6..7d71e1d 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/timepoint_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/timepoint_ops.mlir
@@ -42,6 +42,17 @@
 
 // -----
 
+// CHECK-LABEL: @timepointChainExternal
+//  CHECK-SAME: (%[[TIMEPOINT:.+]]: !hal.fence, %[[SIGNAL:.+]]: !hal.fence)
+func.func @timepointChainExternal(%timepoint: !stream.timepoint, %signal: !hal.fence) {
+  // CHECK: %[[DEVICE:.+]] = hal.ex.shared_device
+  // CHECK: hal.device.queue.execute<%[[DEVICE]] : !hal.device> affinity(%c-1_i64) wait(%[[TIMEPOINT]]) signal(%[[SIGNAL]])
+  stream.timepoint.chain_external %timepoint => (%signal : !hal.fence)
+  return
+}
+
+// -----
+
 // CHECK-LABEL: @timepointJoin
 func.func @timepointJoin(%arg0: !stream.timepoint, %arg1: !stream.timepoint) -> !stream.timepoint {
   // CHECK: %[[FENCE:.+]] = hal.fence.join at([%arg0, %arg1]) -> !hal.fence
@@ -52,6 +63,18 @@
 
 // -----
 
+// CHECK-LABEL: @timepointBarrier
+//  CHECK-SAME: (%[[R0:.+]]: !hal.buffer) -> (!hal.buffer, !hal.fence)
+func.func @timepointBarrier(%r0: !stream.resource<external>) -> (!stream.resource<external>, !stream.timepoint) {
+  %c128 = arith.constant 128 : index
+  // CHECK: %[[R1T:.+]] = util.null : !hal.fence
+  %r1, %r1t = stream.timepoint.barrier %r0 : !stream.resource<external>{%c128} => !stream.timepoint
+  // CHECK: return %[[R0]], %[[R1T]]
+  return %r1, %r1t : !stream.resource<external>, !stream.timepoint
+}
+
+// -----
+
 // CHECK-LABEL: @timepointAwait
 func.func @timepointAwait(%arg0: !stream.timepoint, %arg1: !stream.resource<staging>, %arg2: !stream.resource<*>) -> (!stream.resource<staging>, !stream.resource<*>) {
   %c100 = arith.constant 100 : index
diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/HALOpFolders.cpp b/compiler/src/iree/compiler/Dialect/HAL/IR/HALOpFolders.cpp
index 4a95e6d..93f0ac5 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/IR/HALOpFolders.cpp
+++ b/compiler/src/iree/compiler/Dialect/HAL/IR/HALOpFolders.cpp
@@ -52,6 +52,47 @@
 }
 
 //===----------------------------------------------------------------------===//
+// hal.tensor.barrier
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+/// Deduplicates hal.tensor.barrier operands. Results of repeated sources are
+/// remapped to the result produced for the first occurrence of that source.
+struct DeduplicateTensorBarrierSources
+    : public OpRewritePattern<TensorBarrierOp> {
+  using OpRewritePattern::OpRewritePattern;
+  LogicalResult matchAndRewrite(TensorBarrierOp op,
+                                PatternRewriter &rewriter) const override {
+    DenseMap<Value, unsigned> uniqueSources;  // source -> unique index
+    SmallVector<Value> orderedSources;        // unique, in first-seen order
+    SmallVector<unsigned> resultMapping;      // old -> new result index
+    for (auto source : op.getSources()) {
+      auto [it, inserted] =
+          uniqueSources.try_emplace(source, orderedSources.size());
+      if (inserted) orderedSources.push_back(source);
+      resultMapping.push_back(it->second);
+    }
+    if (orderedSources.size() == resultMapping.size()) return failure();
+    auto newOp = rewriter.create<TensorBarrierOp>(op.getLoc(), orderedSources,
+                                                  op.getSignalFence());
+    // Build one replacement per original result (not per new result).
+    auto newResults = llvm::to_vector(llvm::map_range(
+        resultMapping,
+        [&](unsigned newIndex) -> Value { return newOp.getResult(newIndex); }));
+    rewriter.replaceOp(op, newResults);
+    return success();
+  }
+};
+
+}  // namespace
+
+void TensorBarrierOp::getCanonicalizationPatterns(RewritePatternSet &results,
+                                                  MLIRContext *context) {
+  results.insert<DeduplicateTensorBarrierSources>(context);
+}
+
+//===----------------------------------------------------------------------===//
 // hal.buffer_view.*
 //===----------------------------------------------------------------------===//
 
diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.cpp b/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.cpp
index 1c0429f..88f4e1e 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.cpp
+++ b/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.cpp
@@ -164,6 +164,11 @@
 
 void TensorImportOp::build(OpBuilder &builder, OperationState &result,
                            Type resultType, Value source) {
+  build(builder, result, resultType, source, /*waitFence=*/Value{});
+}
+
+void TensorImportOp::build(OpBuilder &builder, OperationState &result,
+                           Type resultType, Value source, Value waitFence) {
   auto shapedType = resultType.cast<ShapedType>();
   assert((source.getType().isa<IREE::HAL::BufferViewType>() ||
           shapedType.hasStaticShape()) &&
@@ -177,7 +182,7 @@
         builder.getIndexAttr(i)));
   }
   build(builder, result, resultType, source, TypeAttr::get(shapedType),
-        dynamicDims);
+        dynamicDims, waitFence);
 }
 
 Value TensorImportOp::getTiedResult(unsigned resultIndex) {
@@ -282,6 +287,32 @@
 }
 
 //===----------------------------------------------------------------------===//
+// hal.tensor.barrier
+//===----------------------------------------------------------------------===//
+
+void TensorBarrierOp::build(OpBuilder &builder, OperationState &result,
+                            ValueRange sources, Value signalFence) {
+  // Each result mirrors the type of the source at the same position.
+  auto resultTypes = llvm::to_vector(sources.getTypes());
+  build(builder, result, resultTypes, sources, signalFence);
+}
+
+Value TensorBarrierOp::getTiedResult(unsigned resultIndex) {
+  Value tiedSource = getSources()[resultIndex];  // result i <-> sources[i]
+  return IREE::Util::TiedOpInterface::findTiedBaseValue(tiedSource);
+}
+
+::llvm::Optional<unsigned> TensorBarrierOp::getTiedResultOperandIndex(
+    unsigned resultIndex) {
+  return {resultIndex};  // result i is tied to operand i (sources[i])
+}
+
+SmallVector<int64_t, 4> TensorBarrierOp::getTiedResultOperandIndices() {
+  // Result i is tied to operand i, so the indices are simply [0, N).
+  return llvm::to_vector<4>(llvm::seq<int64_t>(0, getSources().size()));
+}
+
+//===----------------------------------------------------------------------===//
 // hal.allocator.allocate
 //===----------------------------------------------------------------------===//
 
diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.td b/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.td
index d4db747..ada64fd 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.td
+++ b/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.td
@@ -53,6 +53,7 @@
 //===----------------------------------------------------------------------===//
 
 def HAL_TensorImportOp : HAL_PureOp<"tensor.import", [
+  AttrSizedOperandSegments,
   DeclareOpInterfaceMethods<Util_TiedOpInterface, [
     "getTiedResult",
     "getTiedResultOperandIndex",
@@ -78,13 +79,15 @@
   let arguments = (ins
     AnyTypeOf<[HAL_Buffer, HAL_BufferView]>:$source,
     TypeAttr:$target_encoding,
-    HAL_ShapeDynamicDims:$target_dims
+    HAL_ShapeDynamicDims:$target_dims,
+    Optional<HAL_Fence>:$wait_fence
   );
   let results = (outs
     AnyTensor:$target
   );
 
   let assemblyFormat = [{
+    (`wait` `(` $wait_fence^ `)` `=` `` `>`)?
     $source `:` type($source)
     `->`
     custom<TypeAlias>($target_encoding, type($target)) (`{` $target_dims^ `}`)?
@@ -96,6 +99,11 @@
       "Type":$resultType,
       "Value":$source
     )>,
+    OpBuilder<(ins
+      "Type":$resultType,
+      "Value":$source,
+      "Value":$waitFence
+    )>,
   ];
 
   let extraClassDeclaration = [{
@@ -120,9 +128,6 @@
   let summary = [{exports a tensor to a HAL buffer view}];
   let description = [{
     Defines an export of an SSA-form tensor to an external HAL buffer view.
-    An optional semaphore timepoint can be specified indicating when the
-    buffer view is available for use. If no semaphore timepoint is requested it
-    is assumed execution blocks until the buffer view is available.
 
     The provided `source_encoding`, if different from the `source` type,
     indicates that the ABI-facing type may differ from the internal
@@ -174,6 +179,45 @@
   let hasFolder = 1;
 }
 
+def HAL_TensorBarrierOp : HAL_Op<"tensor.barrier", [
+  AllTypesMatch<["sources", "results"]>,
+  DeclareOpInterfaceMethods<Util_TiedOpInterface, [
+    "getTiedResult",
+    "getTiedResultOperandIndex",
+    "getTiedResultOperandIndices",
+  ]>,
+]> {
+  let summary = [{signals a fence when all tensors are available}];
+  let description = [{
+    Defines a barrier that is used to indicate availability of an entire set of
+    tensors by signaling a fence. The source tensors are returned for chaining.
+  }];
+
+  let arguments = (ins
+    Variadic<AnyTensor>:$sources,
+    HAL_Fence:$signal_fence
+  );
+  let results = (outs
+    Variadic<AnyTensor>:$results
+  );
+
+  let assemblyFormat = [{
+    `join` `` `(` $sources `:` type($sources) `)`
+    `=` `` `>`
+    $signal_fence `:` type($signal_fence)
+    attr-dict-with-keyword
+  }];
+
+  let builders = [
+    OpBuilder<(ins
+      "ValueRange":$sources,
+      "Value":$signalFence
+    )>,
+  ];
+
+  let hasCanonicalizer = 1;
+}
+
 //===----------------------------------------------------------------------===//
 // !hal.allocator / iree_hal_allocator_t
 //===----------------------------------------------------------------------===//
diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/test/tensor_op_folding.mlir b/compiler/src/iree/compiler/Dialect/HAL/IR/test/tensor_op_folding.mlir
index 4835459..f7f295d 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/IR/test/tensor_op_folding.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/IR/test/tensor_op_folding.mlir
@@ -37,3 +37,14 @@
   // CHECK: return %arg0 : tensor<5xi32>
   return %1 : tensor<5xi32>
 }
+
+// -----
+
+// CHECK-LABEL: @DeduplicateTensorBarrierSources
+// CHECK-SAME: (%[[ARG0:.+]]: tensor<5xi32>, %[[ARG1:.+]]: tensor<6xi32>, %[[FENCE:.+]]: !hal.fence)
+func.func @DeduplicateTensorBarrierSources(%arg0: tensor<5xi32>, %arg1: tensor<6xi32>, %fence: !hal.fence) -> (tensor<5xi32>, tensor<6xi32>, tensor<5xi32>) {
+  // CHECK: %[[RESULTS:.+]]:2 = hal.tensor.barrier join(%[[ARG0]], %[[ARG1]] : tensor<5xi32>, tensor<6xi32>) => %[[FENCE]] : !hal.fence
+  %0:3 = hal.tensor.barrier join(%arg0, %arg1, %arg0 : tensor<5xi32>, tensor<6xi32>, tensor<5xi32>) => %fence : !hal.fence
+  // CHECK: return %[[RESULTS]]#0, %[[RESULTS]]#1, %[[RESULTS]]#0 : tensor<5xi32>, tensor<6xi32>, tensor<5xi32>
+  return %0#0, %0#1, %0#2 : tensor<5xi32>, tensor<6xi32>, tensor<5xi32>
+}
diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/test/tensor_ops.mlir b/compiler/src/iree/compiler/Dialect/HAL/IR/test/tensor_ops.mlir
index d770e6c..a78ab65 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/IR/test/tensor_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/IR/test/tensor_ops.mlir
@@ -10,7 +10,7 @@
 // -----
 
 // CHECK-LABEL: @tensorImportDynamic
-func.func @tensorImportDynamic(%arg0: !hal.buffer_view, %arg1 : index) -> tensor<?x3xi32> {
+func.func @tensorImportDynamic(%arg0: !hal.buffer_view, %arg1: index) -> tensor<?x3xi32> {
   // CHECK: hal.tensor.import %arg0 : !hal.buffer_view -> tensor<?x3xf32> as tensor<?x3xi32>{%arg1}
   %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<?x3xf32> as tensor<?x3xi32>{%arg1}
   return %0 : tensor<?x3xi32>
@@ -18,8 +18,17 @@
 
 // -----
 
+// CHECK-LABEL: @tensorImportAsync
+func.func @tensorImportAsync(%arg0: !hal.buffer_view, %arg1: !hal.fence) -> tensor<5xi32> {
+  // CHECK: hal.tensor.import wait(%arg1) => %arg0 : !hal.buffer_view -> tensor<5xi32>
+  %0 = hal.tensor.import wait(%arg1) => %arg0 : !hal.buffer_view -> tensor<5xi32>
+  return %0 : tensor<5xi32>
+}
+
+// -----
+
 // CHECK-LABEL: @tensorExportDynamic
-func.func @tensorExportDynamic(%arg0: tensor<?x3xi32>, %arg1 : index) -> !hal.buffer_view {
+func.func @tensorExportDynamic(%arg0: tensor<?x3xi32>, %arg1: index) -> !hal.buffer_view {
   // CHECK: hal.tensor.export %arg0 : tensor<?x3xf32> as tensor<?x3xi32>{%arg1} -> !hal.buffer_view
   %0 = hal.tensor.export %arg0 : tensor<?x3xf32> as tensor<?x3xi32>{%arg1} -> !hal.buffer_view
   return %0 : !hal.buffer_view
@@ -28,8 +37,17 @@
 // -----
 
 // CHECK-LABEL: @tensorExportInPlace
-func.func @tensorExportInPlace(%arg0: tensor<?x3xi32>, %arg1 : index, %arg2: !hal.buffer) -> !hal.buffer_view {
+func.func @tensorExportInPlace(%arg0: tensor<?x3xi32>, %arg1: index, %arg2: !hal.buffer) -> !hal.buffer_view {
   // CHECK: hal.tensor.export %arg0 into %arg2 : tensor<?x3xf32> as tensor<?x3xi32>{%arg1} -> !hal.buffer_view
   %0 = hal.tensor.export %arg0 into %arg2 : tensor<?x3xf32> as tensor<?x3xi32>{%arg1} -> !hal.buffer_view
   return %0 : !hal.buffer_view
 }
+
+// -----
+
+// CHECK-LABEL: @tensorBarrier
+func.func @tensorBarrier(%arg0: tensor<3xf32>, %arg1: tensor<4xf32>, %arg2: !hal.fence) -> (tensor<3xf32>, tensor<4xf32>) {
+  // CHECK: :2 = hal.tensor.barrier join(%arg0, %arg1 : tensor<3xf32>, tensor<4xf32>) => %arg2 : !hal.fence
+  %0:2 = hal.tensor.barrier join(%arg0, %arg1 : tensor<3xf32>, tensor<4xf32>) => %arg2 : !hal.fence
+  return %0#0, %0#1 : tensor<3xf32>, tensor<4xf32>
+}
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Conversion/HALToStream/ConvertHALToStream.cpp b/compiler/src/iree/compiler/Dialect/Stream/Conversion/HALToStream/ConvertHALToStream.cpp
index bc38df1..952f202 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Conversion/HALToStream/ConvertHALToStream.cpp
+++ b/compiler/src/iree/compiler/Dialect/Stream/Conversion/HALToStream/ConvertHALToStream.cpp
@@ -60,14 +60,28 @@
         op.getLoc(), rewriter.getIndexType(),
         TypeAttr::get(op.getTarget().getType()), adaptor.getTargetDims(),
         /*affinity=*/nullptr);
-    auto newOp = rewriter.create<IREE::Stream::TensorImportOp>(
+    Value resource = rewriter.create<IREE::Stream::TensorImportOp>(
         op.getLoc(), resultType, adaptor.getSource(), TypeAttr::get(targetType),
         adaptor.getTargetDims(), resultSize,
         /*affinity=*/nullptr);
 
+    // Await the fence, if needed. When not specified the resource is assumed to
+    // be immediately available.
+    if (auto waitFence = op.getWaitFence()) {
+      Value waitTimepoint = rewriter.create<IREE::Stream::TimepointImportOp>(
+          op.getLoc(), rewriter.getType<IREE::Stream::TimepointType>(),
+          ValueRange{waitFence},
+          /*affinity=*/nullptr);
+      resource = rewriter
+                     .create<IREE::Stream::TimepointAwaitOp>(
+                         op.getLoc(), ValueRange{resource},
+                         ValueRange{resultSize}, waitTimepoint)
+                     .getResult(0);
+    }
+
     auto unknownType = rewriter.getType<IREE::Stream::ResourceType>();
     rewriter.replaceOpWithNewOp<IREE::Stream::AsyncTransferOp>(
-        op, unknownType, newOp.getResult(), resultSize, resultSize,
+        op, unknownType, resource, resultSize, resultSize,
         /*source_affinity=*/nullptr,
         /*result_affinity=*/nullptr);
     return success();
@@ -196,6 +210,43 @@
   }
 };
 
+// %r0b, %r1b = hal.tensor.barrier join(%r0a, %r1a : tensor<4xf32>,
+//                                      tensor<1xi32>) => %fence : !hal.fence
+// ->
+// %r0b, %t0 = stream.timepoint.barrier %r0a :
+//                 !stream.resource<*>{%size0} => !stream.timepoint
+// %r1b, %t1 = stream.timepoint.barrier %r1a :
+//                 !stream.resource<*>{%size1} => !stream.timepoint
+// %t01 = stream.timepoint.join max(%t0, %t1)
+// stream.timepoint.chain_external %t01 => (%fence : !hal.fence)
+struct ConvertTensorBarrierOp
+    : public OpConversionPattern<IREE::HAL::TensorBarrierOp> {
+  using OpConversionPattern::OpConversionPattern;
+  LogicalResult matchAndRewrite(
+      IREE::HAL::TensorBarrierOp op, OpAdaptor adaptor,
+      ConversionPatternRewriter &rewriter) const override {
+    auto timepointType = rewriter.getType<IREE::Stream::TimepointType>();
+    SmallVector<Value> signaledResources;
+    SmallVector<Value> signaledTimepoints;
+    for (auto sourceResource : adaptor.getSources()) {
+      auto source = consumeTensorOperand(op.getLoc(), sourceResource, rewriter);
+      auto barrierOp = rewriter.create<IREE::Stream::TimepointBarrierOp>(
+          sourceResource.getLoc(), source.resource.getType(), timepointType,
+          source.resource, source.resourceSize, /*affinity=*/nullptr);
+      signaledResources.push_back(barrierOp.getResult());
+      signaledTimepoints.push_back(barrierOp.getResultTimepoint());
+    }
+    Value joinedTimepoint =
+        rewriter.createOrFold<IREE::Stream::TimepointJoinOp>(
+            op.getLoc(), timepointType, signaledTimepoints);
+    rewriter.create<IREE::Stream::TimepointChainExternalOp>(
+        op.getLoc(), joinedTimepoint, ValueRange{adaptor.getSignalFence()},
+        /*affinity=*/nullptr);
+    rewriter.replaceOp(op, signaledResources);
+    return success();
+  }
+};
+
 }  // namespace
 
 void populateHALToStreamConversionPatterns(MLIRContext *context,
@@ -205,6 +256,7 @@
       [](IREE::HAL::BufferViewType type) { return type; });
   patterns.insert<ConvertTensorImportOp>(typeConverter, context);
   patterns.insert<ConvertTensorExportOp>(typeConverter, context);
+  patterns.insert<ConvertTensorBarrierOp>(typeConverter, context);
 }
 
 void populateHALToStreamConversionPatterns(MLIRContext *context,
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Conversion/HALToStream/test/abi_ops.mlir b/compiler/src/iree/compiler/Dialect/Stream/Conversion/HALToStream/test/abi_ops.mlir
index 934b2a6..2c1f4e2 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Conversion/HALToStream/test/abi_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Conversion/HALToStream/test/abi_ops.mlir
@@ -35,6 +35,25 @@
 
 // -----
 
+// CHECK-LABEL: @importBufferViewAsync
+// CHECK-SAME: (%[[VIEW:.+]]: !hal.buffer_view, %[[FENCE:.+]]: !hal.fence)
+// CHECK-SAME: -> (!stream.resource<*>, index)
+func.func @importBufferViewAsync(%view: !hal.buffer_view, %fence: !hal.fence) -> tensor<4xf32> {
+  //  CHECK-DAG: %[[SIZE:.+]] = stream.tensor.sizeof tensor<4xf32>
+  //      CHECK: %[[ASYNC_RESOURCE:.+]] = stream.tensor.import %[[VIEW]]
+  // CHECK-SAME:     : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%[[SIZE]]}
+  //      CHECK: %[[TIMEPOINT:.+]] = stream.timepoint.import %[[FENCE]]
+  //      CHECK: %[[SYNC_RESOURCE:.+]] = stream.timepoint.await %[[TIMEPOINT]] => %[[ASYNC_RESOURCE]]
+  // CHECK-SAME:     : !stream.resource<external>{%[[SIZE]]}
+  // CHECK-NEXT: %[[RESULT:.+]] = stream.async.transfer %[[SYNC_RESOURCE]]
+  // CHECK-SAME:     : !stream.resource<external>{%[[SIZE]]} -> !stream.resource<*>{%[[SIZE]]}
+  %0 = hal.tensor.import wait(%fence) => %view : !hal.buffer_view -> tensor<4xf32>
+  // CHECK: return %[[RESULT]], %[[SIZE]] : !stream.resource<*>, index
+  return %0 : tensor<4xf32>
+}
+
+// -----
+
 // CHECK-LABEL: @exportBufferView
 // CHECK-SAME: (%[[TENSOR:.+]]: !stream.resource<*>, %[[SIZE:.+]]: index, %[[DIM0:.+]]: index, %[[DIM1:.+]]: index)
 func.func @exportBufferView(%tensor: tensor<?x?x4xf32>, %dim0: index, %dim1: index) -> !hal.buffer_view {
diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOpFolders.cpp b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOpFolders.cpp
index 95567f1..847d35f 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOpFolders.cpp
+++ b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOpFolders.cpp
@@ -2341,6 +2341,75 @@
 }
 
 //===----------------------------------------------------------------------===//
+// stream.timepoint.barrier
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+// Follows tied operands upward from |resource| to the stream.timepoint.await
+// that produces it. Returns {nullptr, nullptr} if the chain ends without one.
+static std::pair<IREE::Stream::TimepointAwaitOp, Value> findSourceAwaitOp(
+    Value resource) {
+  Value current = resource;
+  for (;;) {
+    Operation *producer = current.getDefiningOp();
+    auto tiedOp = dyn_cast_or_null<IREE::Util::TiedOpInterface>(producer);
+    if (!tiedOp) break;
+    if (auto awaitOp = dyn_cast<IREE::Stream::TimepointAwaitOp>(producer)) {
+      return {awaitOp, current};
+    }
+    Value tiedOperand = tiedOp.getTiedResultOperand(current);
+    if (!tiedOperand) break;
+    current = tiedOperand;
+  }
+  return {nullptr, nullptr};
+}
+
+// Forwards the timepoint of an await that directly feeds a barrier so that
+// consumers of the barrier chain on the original timepoint rather than on a
+// new one produced by the barrier.
+//
+// Example:
+//  %r0a = stream.timepoint.await %t0 => %source
+//  %r0b, %t1 = stream.timepoint.barrier %r0a
+// ->
+//  %r0b = %source
+//  %t1 = %t0
+struct ChainTimepoints : public OpRewritePattern<TimepointBarrierOp> {
+  using OpRewritePattern::OpRewritePattern;
+  LogicalResult matchAndRewrite(TimepointBarrierOp barrierOp,
+                                PatternRewriter &rewriter) const override {
+    // Locate the await producing the barrier's resource; the search may pass
+    // through any number of tied ops on the way.
+    Value barrierResource = barrierOp.getResource();
+    auto [awaitOp, awaitedResource] = findSourceAwaitOp(barrierResource);
+    if (!awaitOp) return failure();
+
+    // TODO(benvanik): move this to a pass that can do IPO. Local analysis is
+    // insufficient for this. For now we conservatively ignore any case where
+    // the await does not feed directly into the signal.
+    if (awaitedResource != barrierResource) {
+      return rewriter.notifyMatchFailure(
+          barrierOp, "ops exist between await and signal, not yet matching");
+    }
+
+    // Consumers of the barrier now use the resource as it was before the
+    // await and wait on the timepoint that the await was waiting on.
+    Value sourceResource = awaitOp.getTiedResultOperand(awaitedResource);
+    Value sourceTimepoint = awaitOp.getAwaitTimepoint();
+    rewriter.replaceOp(barrierOp, {sourceResource, sourceTimepoint});
+    return success();
+  }
+};
+
+}  // namespace
+
+void TimepointBarrierOp::getCanonicalizationPatterns(RewritePatternSet &results,
+                                                     MLIRContext *context) {
+  results.insert<ChainTimepoints>(context);
+}
+
+//===----------------------------------------------------------------------===//
 // stream.timepoint.await
 //===----------------------------------------------------------------------===//
 
@@ -2461,6 +2530,16 @@
   }
 };
 
+// Returns true if every operand of |op| dominates |insertionPoint|, i.e. all
+// of the values |op| consumes are already available at that point.
+static bool areAllOperandsDefinedBy(Operation *op, Operation *insertionPoint,
+                                    DominanceInfo &dominanceInfo) {
+  // Short-circuits on the first operand that does not dominate.
+  return llvm::all_of(op->getOperands(), [&](Value operand) {
+    return dominanceInfo.dominates(operand, insertionPoint);
+  });
+}
+
 // Finds timepoint awaits on the same timepoint within the same domination
 // paths and groups them together.
 //
@@ -2478,17 +2557,14 @@
   using OpRewritePattern::OpRewritePattern;
   LogicalResult matchAndRewrite(TimepointAwaitOp op,
                                 PatternRewriter &rewriter) const override {
+    DominanceInfo dominanceInfo(op->getParentOp());
     SmallVector<TimepointAwaitOp> coveredOps;
     for (auto &use : op.getAwaitTimepoint().getUses()) {
       // TODO(benvanik): make this handle joins/ties; today we get blocked
       // there. We rely on other canonicalizers to sink things such that
       // (hopefully) we get them directly accessible here.
       if (use.getOwner() == op) continue;
-      if (use.getOwner()->getBlock() != op->getBlock() ||
-          use.getOwner()->isBeforeInBlock(op)) {
-        // TODO(benvanik): allow dominated blocks.
-        continue;
-      }
+      if (dominanceInfo.dominates(use.getOwner(), op)) continue;
       auto awaitOp = dyn_cast<TimepointAwaitOp>(use.getOwner());
       if (!awaitOp ||
           !AffinityAttr::areCompatible(
@@ -2499,6 +2575,11 @@
         // TODO(benvanik): remove affinity from stream.timepoint.await.
         continue;
       }
+      // Ensure all dependencies of the await op are available.
+      if (!areAllOperandsDefinedBy(awaitOp, op, dominanceInfo)) {
+        // One or more operands is defined after op so we can't merge.
+        continue;
+      }
       coveredOps.push_back(awaitOp);
     }
     if (coveredOps.empty()) return failure();
diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.cpp b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.cpp
index dc5a5d0..5500579 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.cpp
+++ b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.cpp
@@ -2096,6 +2096,36 @@
 }
 
 //===----------------------------------------------------------------------===//
+// stream.timepoint.barrier
+//===----------------------------------------------------------------------===//
+
+LogicalResult TimepointBarrierOp::verify() {
+  // The resource operand must be paired with its size operand.
+  TimepointBarrierOp op = *this;
+  Value resource = op.getResource();
+  Value resourceSize = op.getResourceSize();
+  return verifyOpValueSizes(op, resource, resourceSize);
+}
+
+Value TimepointBarrierOp::getTiedResult(unsigned resultIndex) {
+  return IREE::Util::TiedOpInterface::findTiedBaseValue(getResource());
+}
+
+::llvm::Optional<unsigned> TimepointBarrierOp::getTiedResultOperandIndex(
+    unsigned resultIndex) {
+  return {0};  // always operand 0 ($resource), whatever |resultIndex| is
+}
+
+SmallVector<int64_t, 4> TimepointBarrierOp::getTiedResultOperandIndices() {
+  return {0};  // single entry: operand 0 ($resource)
+}
+
+std::pair<unsigned, unsigned>
+TimepointBarrierOp::getTiedResultsIndexAndLength() {
+  return {0, 1};  // presumably {first index, count}: only $result is tied
+}
+
+//===----------------------------------------------------------------------===//
 // stream.timepoint.await
 //===----------------------------------------------------------------------===//
 
diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.td b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.td
index 59b1aa6..7708f41 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.td
+++ b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.td
@@ -2632,7 +2632,7 @@
 ]> {
   let summary = [{exports a timepoint to an external dialect type}];
   let description = [{
-    Defines a conversion to an external dialect type such as `hal.semaphore`
+    Defines a conversion to an external dialect type such as `hal.fence`
     that is resolved during lowering into the stream dialect. This can be used
     to interoperate between levels of the stack that require specifying stream
     types and those that prior to lowering do not handle them.
@@ -2657,6 +2657,33 @@
   let hasFolder = 1;
 }
 
+def Stream_TimepointChainExternalOp :
+    Stream_Op<"timepoint.chain_external", [
+      Stream_AffinityOp,
+    ]> {
+  let summary = [{signals external dialect values when a timepoint is reached}];
+  let description = [{
+    Chains a timepoint into externally-provided values of another dialect such
+    as a `hal.fence`: when `await_timepoint` is reached the external values are
+    signaled. The external values are operands supplied by the caller and the
+    op produces no results.
+  }];
+
+  let arguments = (ins
+    Stream_Timepoint:$await_timepoint,
+    Variadic<AnyType>:$external_values,
+    OptionalAttr<Stream_AffinityAttr>:$affinity
+  );
+
+  let assemblyFormat = [{
+    (`on` `(` $affinity^ `)`)?
+    $await_timepoint
+    `=` `` `>`
+    `(` $external_values `:` type($external_values) `)`
+    attr-dict-with-keyword
+  }];
+}
+
 def Stream_TimepointJoinOp : Stream_PureOp<"timepoint.join", [
   Stream_TimelineOp,
 ]> {
@@ -2684,6 +2711,63 @@
   let hasFolder = 1;
 }
 
+def Stream_TimepointBarrierOp : Stream_PureOp<"timepoint.barrier", [
+  AllTypesMatch<["resource", "result"]>,
+  Stream_AffinityOp,
+  Stream_TimelineOp,
+  Util_SizeAwareOp,
+  DeclareOpInterfaceMethods<Util_TiedOpInterface, [
+    "getTiedResult",
+    "getTiedResultOperandIndex",
+    "getTiedResultOperandIndices",
+    "getTiedResultsIndexAndLength",
+  ]>,
+]> {
+  let summary = [{returns a timepoint indicating when a resource is available}];
+  let description = [{
+    After asynchronous execution scheduling resources may exist in different
+    states at different points in the execution timeline. This op enables
+    identifying when the version of a resource after a particular point in the
+    timeline is available. As timepoints transitively chain the timepoint must
+    only cover the resource availability but not be limited to its original
+    production timepoint.
+  }];
+
+  let arguments = (ins
+    AnyTypeOf<[
+      Stream_AnyStreamResource,
+      Stream_StagingResource,
+    ]>:$resource,
+    Stream_Size:$resource_size,
+    OptionalAttr<Stream_AffinityAttr>:$affinity
+  );
+  let results = (outs
+    AnyTypeOf<[
+      Stream_AnyStreamResource,
+      Stream_StagingResource,
+    ]>:$result,
+    Stream_Timepoint:$result_timepoint
+  );
+
+  let assemblyFormat = [{
+    (`on` `(` $affinity^ `)`)?
+    $resource `:` type($resource) `` `{` $resource_size `}`
+    `=` `` `>`
+    type($result_timepoint)
+    attr-dict-with-keyword
+  }];
+
+  let extraClassDeclaration = [{
+    Value getOperandSize(unsigned idx) { return getResourceSize(); }
+    Value getResultSize(unsigned idx) { return getResourceSize(); }
+    SmallVector<Value> getAwaitTimepoints() { return {}; }
+  }];
+
+  let hasVerifier = 1;
+
+  let hasCanonicalizer = 1;
+}
+
 def Stream_TimepointAwaitOp : Stream_PureOp<"timepoint.await", [
   AttrSizedOperandSegments,
   Stream_AffinityOp,
diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/test/timepoint_folding.mlir b/compiler/src/iree/compiler/Dialect/Stream/IR/test/timepoint_folding.mlir
index f7d12c9..146ae41 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/IR/test/timepoint_folding.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/IR/test/timepoint_folding.mlir
@@ -77,8 +77,23 @@
 
 // -----
 
-// CHECK-LABEL: @ElideImmediateAwaits
-func.func @ElideImmediateAwaits(%arg0: !stream.resource<staging>) -> !stream.resource<staging> {
+// CHECK-LABEL: @ChainTimepoints
+// CHECK-SAME: (%[[FENCE:.+]]: !stream.timepoint, %[[SOURCE:.+]]: !stream.resource<external>)
+func.func @ChainTimepoints(%fence: !stream.timepoint, %source: !stream.resource<external>) -> (!stream.resource<external>, !stream.timepoint) {
+  %c0 = arith.constant 0 : index
+  %c128 = arith.constant 128 : index
+  // CHECK-NOT: stream.timepoint.await
+  %r0 = stream.timepoint.await %fence => %source : !stream.resource<external>{%c128}
+  // CHECK-NOT: stream.timepoint.barrier
+  %r1, %r1t = stream.timepoint.barrier %r0 : !stream.resource<external>{%c128} => !stream.timepoint
+  // CHECK: return %[[SOURCE]], %[[FENCE]]
+  return %r1, %r1t : !stream.resource<external>, !stream.timepoint
+}
+
+// -----
+
+// CHECK-LABEL: @ElideImmediateHostAwaits
+func.func @ElideImmediateHostAwaits(%arg0: !stream.resource<staging>) -> !stream.resource<staging> {
   %c100 = arith.constant 100 : index
   // CHECK-NOT: stream.timepoint.immediate
   %0 = stream.timepoint.immediate => !stream.timepoint
@@ -169,6 +184,32 @@
 
 // -----
 
+// Tests that awaits on the same timepoint are not grouped when one await's
+// operand depends on the result of an earlier await.
+
+func.func private @materializeResource0() -> !stream.resource<*>
+func.func private @materializeResource1(!stream.resource<*>) -> !stream.resource<*>
+
+// CHECK-LABEL: @GroupAwaitsByTimepointUnsafe
+func.func @GroupAwaitsByTimepointUnsafe(
+  %arg0: !stream.timepoint
+) -> (!stream.resource<*>, !stream.resource<*>) {
+  %c100 = arith.constant 100 : index
+  %c101 = arith.constant 101 : index
+  // CHECK: call @materializeResource0
+  %r0a = call @materializeResource0() : () -> !stream.resource<*>
+  // CHECK-NEXT: stream.timepoint.await
+  %r0b = stream.timepoint.await %arg0 => %r0a : !stream.resource<*>{%c100}
+  // CHECK-NEXT: call @materializeResource1
+  %r1a = call @materializeResource1(%r0b) : (!stream.resource<*>) -> !stream.resource<*>
+  // CHECK-NEXT: stream.timepoint.await
+  %r1b = stream.timepoint.await %arg0 => %r1a : !stream.resource<*>{%c101}
+  // CHECK-NEXT: return
+  return %r0b, %r1b : !stream.resource<*>, !stream.resource<*>
+}
+
+// -----
+
 // CHECK-LABEL: @FoldDuplicateAwaitResources
 func.func @FoldDuplicateAwaitResources(
   %arg0: !stream.timepoint,
diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/test/timepoint_ops.mlir b/compiler/src/iree/compiler/Dialect/Stream/IR/test/timepoint_ops.mlir
index 94c57a3..15c6d6b 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/IR/test/timepoint_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/IR/test/timepoint_ops.mlir
@@ -27,6 +27,15 @@
 
 // -----
 
+// CHECK-LABEL: @timepointChainExternal
+func.func @timepointChainExternal(%arg0: !stream.timepoint, %arg1: !hal.fence) {
+  // CHECK: stream.timepoint.chain_external %arg0 => (%arg1 : !hal.fence)
+  stream.timepoint.chain_external %arg0 => (%arg1 : !hal.fence)
+  return
+}
+
+// -----
+
 // CHECK-LABEL: @timepointJoin
 func.func @timepointJoin(%arg0: !stream.timepoint, %arg1: !stream.timepoint) -> !stream.timepoint {
   // CHECK: = stream.timepoint.join max(%arg0, %arg1) => !stream.timepoint
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/RefineUsage.cpp b/compiler/src/iree/compiler/Dialect/Stream/Transforms/RefineUsage.cpp
index 4660a98..b4719ef 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/RefineUsage.cpp
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/RefineUsage.cpp
@@ -338,7 +338,9 @@
   patterns.insert<ApplyInitializerOp, ApplyFuncOp>(context, analysis);
   patterns.insert<ApplyGenericOp<IREE::Util::DoNotOptimizeOp>,
                   ApplyGenericOp<mlir::arith::SelectOp>,
-                  ApplyGenericOp<mlir::func::CallOp>>(context, analysis);
+                  ApplyGenericOp<mlir::func::CallOp>,
+                  ApplyGenericOp<IREE::Stream::TimepointBarrierOp>>(context,
+                                                                    analysis);
   patterns.insert<ApplyStreamableOp<IREE::Stream::ResourceAllocOp>,
                   ApplyStreamableOp<IREE::Stream::ResourceAllocaOp>,
                   ApplyStreamableOp<IREE::Stream::TensorImportOp>,
diff --git a/compiler/src/iree/compiler/Dialect/Util/IR/UtilTypes.cpp b/compiler/src/iree/compiler/Dialect/Util/IR/UtilTypes.cpp
index 411d56d..e07f09c 100644
--- a/compiler/src/iree/compiler/Dialect/Util/IR/UtilTypes.cpp
+++ b/compiler/src/iree/compiler/Dialect/Util/IR/UtilTypes.cpp
@@ -401,8 +401,8 @@
 // IREE::Util::SizeAwareTypeInterface
 //===----------------------------------------------------------------------===//
 
-static bool isValueUsableForOp(Value value, Block *block,
-                               Block::iterator insertionPoint) {
+bool isValueUsableForOp(Value value, Block *block,
+                        Block::iterator insertionPoint) {
   if (block == nullptr) {
     // Op is not in a block; can't analyze (maybe?).
     return false;
@@ -428,6 +428,10 @@
   return false;
 }
 
+bool isValueUsableForOp(Value value, Operation *op) {  // |op| is dereferenced.
+  return isValueUsableForOp(value, op->getBlock(), op->getIterator());
+}
+
 // static
 Value SizeAwareTypeInterface::findSizeValue(Value resourceValue, Block *block,
                                             Block::iterator insertionPoint) {
diff --git a/compiler/src/iree/compiler/Dialect/Util/IR/UtilTypes.h b/compiler/src/iree/compiler/Dialect/Util/IR/UtilTypes.h
index 6a955f3..82c1b6f 100644
--- a/compiler/src/iree/compiler/Dialect/Util/IR/UtilTypes.h
+++ b/compiler/src/iree/compiler/Dialect/Util/IR/UtilTypes.h
@@ -121,6 +121,12 @@
     ArrayRef<unsigned> excludedResultIndices,
     SmallVector<int64_t, 4> &tiedOperandIndices);
 
+// Returns true if |value| can be used by the operation at the insertion point.
+bool isValueUsableForOp(Value value, Block *block,
+                        Block::iterator insertionPoint);
+// Returns true if |value| can be used by |op|.
+bool isValueUsableForOp(Value value, Operation *op);
+
 //===----------------------------------------------------------------------===//
 // Shape-aware interface utilities
 //===----------------------------------------------------------------------===//
diff --git a/compiler/src/iree/compiler/InputConversion/Common/IREEImportPublic.cpp b/compiler/src/iree/compiler/InputConversion/Common/IREEImportPublic.cpp
index 5363de9..9565dc9 100644
--- a/compiler/src/iree/compiler/InputConversion/Common/IREEImportPublic.cpp
+++ b/compiler/src/iree/compiler/InputConversion/Common/IREEImportPublic.cpp
@@ -105,7 +105,7 @@
       // will get it).
       rewriter.replaceOpWithNewOp<IREE::HAL::TensorImportOp>(
           srcOp, resultType, adaptor.getSource(), TypeAttr::get(resultType),
-          adaptor.getTargetDims());
+          adaptor.getTargetDims(), /*wait_fence=*/Value{});
     }
     return success();
   }
diff --git a/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/StreamToHALInline/ConvertStreamToHALInline.cpp b/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/StreamToHALInline/ConvertStreamToHALInline.cpp
index 64f48fb..e2c630d 100644
--- a/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/StreamToHALInline/ConvertStreamToHALInline.cpp
+++ b/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/StreamToHALInline/ConvertStreamToHALInline.cpp
@@ -541,6 +541,18 @@
   }
 };
 
+struct TimepointChainExternalOpPattern
+    : public OpConversionPattern<IREE::Stream::TimepointChainExternalOp> {
+  using OpConversionPattern::OpConversionPattern;
+  LogicalResult matchAndRewrite(
+      IREE::Stream::TimepointChainExternalOp chainOp, OpAdaptor adaptor,
+      ConversionPatternRewriter &rewriter) const override {
+    const char *reason =
+        "timepoints are not supported across the ABI with inline execution";
+    return rewriter.notifyMatchFailure(chainOp, reason);  // Never rewrites.
+  }
+};
+
 struct TimepointJoinOpPattern
     : public OpConversionPattern<IREE::Stream::TimepointJoinOp> {
   using OpConversionPattern::OpConversionPattern;
@@ -552,6 +564,21 @@
   }
 };
 
+struct TimepointBarrierOpPattern
+    : public OpConversionPattern<IREE::Stream::TimepointBarrierOp> {
+  using OpConversionPattern::OpConversionPattern;
+  LogicalResult matchAndRewrite(
+      IREE::Stream::TimepointBarrierOp barrierOp, OpAdaptor adaptor,
+      ConversionPatternRewriter &rewriter) const override {
+    // Forward the resource unchanged and substitute an i64 zero constant for
+    // the timepoint result.
+    Value timepoint =
+        rewriter.create<arith::ConstantIntOp>(barrierOp.getLoc(), 0, 64);
+    rewriter.replaceOp(barrierOp, {adaptor.getResource(), timepoint});
+    return success();
+  }
+};
+
 struct TimepointAwaitOpPattern
     : public OpConversionPattern<IREE::Stream::TimepointAwaitOp> {
   using OpConversionPattern::OpConversionPattern;
@@ -620,7 +647,8 @@
 
   patterns.insert<GlobalTimepointConversionPattern>(typeConverter, context);
   patterns.insert<TimepointImmediateOpPattern, TimepointImportOpPattern,
-                  TimepointExportOpPattern, TimepointJoinOpPattern,
+                  TimepointExportOpPattern, TimepointChainExternalOpPattern,
+                  TimepointJoinOpPattern, TimepointBarrierOpPattern,
                   TimepointAwaitOpPattern>(typeConverter, context);
 
   patterns.insert<ElideYieldOpPattern>(typeConverter, context);
diff --git a/compiler/src/iree/compiler/Pipelines/Pipelines.cpp b/compiler/src/iree/compiler/Pipelines/Pipelines.cpp
index 82ed296..2272556 100644
--- a/compiler/src/iree/compiler/Pipelines/Pipelines.cpp
+++ b/compiler/src/iree/compiler/Pipelines/Pipelines.cpp
@@ -69,10 +69,14 @@
   buildCommonInputConversionPassPipeline(passManager);
 
   // Now that inputs are legalized, generate wrapper for entry functions.
+  IREE::ABI::InvocationOptions invocationOptions;
+  invocationOptions.invocationModel =
+      schedulingOptions.executionModel ==
+              SchedulingOptions::ExecutionModel::AsyncExternal
+          ? IREE::ABI::InvocationModel::CoarseFences
+          : IREE::ABI::InvocationModel::Sync;
   if (bindingOptions.native) {
-    // TODO(benvanik): pass down execution model to the ABI pipeline so that
-    // it can change default function signature behavior
-    IREE::ABI::buildTransformPassPipeline(passManager);
+    IREE::ABI::buildTransformPassPipeline(passManager, invocationOptions);
   }
   if (bindingOptions.tflite) {
     IREE::TFLite::buildTransformPassPipeline(passManager);
diff --git a/runtime/src/iree/hal/fence.c b/runtime/src/iree/hal/fence.c
index 3d4597b..b5b7254 100644
--- a/runtime/src/iree/hal/fence.c
+++ b/runtime/src/iree/hal/fence.c
@@ -46,6 +46,26 @@
   return iree_ok_status();
 }
 
+IREE_API_EXPORT iree_status_t iree_hal_fence_create_at(
+    iree_hal_semaphore_t* semaphore, uint64_t value,
+    iree_allocator_t host_allocator, iree_hal_fence_t** out_fence) {
+  IREE_ASSERT_ARGUMENT(semaphore);
+  IREE_ASSERT_ARGUMENT(out_fence);
+  *out_fence = NULL;
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_hal_fence_t* new_fence = NULL;  // Created below with capacity 1.
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_hal_fence_create(1, host_allocator, &new_fence));
+  iree_status_t status = iree_hal_fence_insert(new_fence, semaphore, value);
+  if (!iree_status_is_ok(status)) {
+    iree_hal_fence_release(new_fence);
+    new_fence = NULL;
+  }
+  *out_fence = new_fence;  // Remains NULL unless the insert succeeded.
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
 // TODO(benvanik): actually join efficiently. Today we just create a fence that
 // can hold the worst-case sum of all fence timepoints and then insert but it
 // could be made much better. In most cases the joined fences have a near
diff --git a/runtime/src/iree/hal/fence.h b/runtime/src/iree/hal/fence.h
index 97174cf..41e7e92 100644
--- a/runtime/src/iree/hal/fence.h
+++ b/runtime/src/iree/hal/fence.h
@@ -51,6 +51,11 @@
     iree_host_size_t capacity, iree_allocator_t host_allocator,
     iree_hal_fence_t** out_fence);
 
+// Creates a new fence with a single timepoint.
+IREE_API_EXPORT iree_status_t iree_hal_fence_create_at(
+    iree_hal_semaphore_t* semaphore, uint64_t value,
+    iree_allocator_t host_allocator, iree_hal_fence_t** out_fence);
+
 // Creates a new fence joining all |fences| as a wait-all operation.
 IREE_API_EXPORT iree_status_t iree_hal_fence_join(
     iree_host_size_t fence_count, iree_hal_fence_t** fences,
diff --git a/runtime/src/iree/tooling/vm_util.c b/runtime/src/iree/tooling/vm_util.c
index e71382f..7aa0cb7 100644
--- a/runtime/src/iree/tooling/vm_util.c
+++ b/runtime/src/iree/tooling/vm_util.c
@@ -167,7 +167,7 @@
   return status;
 }
 
-iree_status_t iree_create_and_parse_to_variant_list(
+iree_status_t iree_tooling_parse_to_variant_list(
     iree_hal_allocator_t* device_allocator, iree_string_view_t* input_strings,
     iree_host_size_t input_strings_count, iree_allocator_t host_allocator,
     iree_vm_list_t** out_list) {
@@ -277,6 +277,50 @@
   return status;
 }
 
+iree_status_t iree_tooling_append_async_fence_inputs(
+    iree_vm_list_t* list, const iree_vm_function_t* function,
+    iree_hal_device_t* device, iree_hal_fence_t* wait_fence,
+    iree_hal_fence_t** out_signal_fence) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  if (out_signal_fence) *out_signal_fence = NULL;  // Defined on every path.
+  iree_string_view_t model =
+      iree_vm_function_lookup_attr_by_name(function, IREE_SV("iree.abi.model"));
+  if (!iree_string_view_equal(model, IREE_SV("coarse-fences"))) {
+    // Ignore unknown models - the user may have provided their own fences.
+    IREE_TRACE_ZONE_END(z0);
+    return iree_ok_status();
+  }
+
+  // Create the signal fence as a 0->1 transition. The caller will wait on that.
+  iree_hal_semaphore_t* semaphore = NULL;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_hal_semaphore_create(device, 0ull, &semaphore));
+  iree_hal_fence_t* signal_fence = NULL;
+  iree_status_t status = iree_hal_fence_create_at(
+      semaphore, 1ull, iree_hal_device_host_allocator(device), &signal_fence);
+  iree_hal_semaphore_release(semaphore);  // Drop this function's reference.
+
+  // Append (wait, signal) fences, in that order, as retained references.
+  if (iree_status_is_ok(status)) {
+    iree_vm_ref_t wait_fence_ref = iree_hal_fence_retain_ref(wait_fence);
+    status = iree_vm_list_push_ref_move(list, &wait_fence_ref);
+    iree_vm_ref_release(&wait_fence_ref);
+  }
+  if (iree_status_is_ok(status)) {
+    iree_vm_ref_t signal_fence_ref = iree_hal_fence_retain_ref(signal_fence);
+    status = iree_vm_list_push_ref_move(list, &signal_fence_ref);
+    iree_vm_ref_release(&signal_fence_ref);
+  }
+  // Hand the signal fence to the caller on success; otherwise release it.
+  if (iree_status_is_ok(status)) {
+    *out_signal_fence = signal_fence;
+  } else {
+    iree_hal_fence_release(signal_fence);
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
 #define IREE_PRINTVARIANT_CASE_I(SIZE, B, V)  \
   case IREE_VM_VALUE_TYPE_I##SIZE:            \
     return iree_string_builder_append_format( \
@@ -325,9 +369,9 @@
   return iree_ok_status();
 }
 
-iree_status_t iree_append_variant_list(iree_vm_list_t* variant_list,
-                                       size_t max_element_count,
-                                       iree_string_builder_t* builder) {
+iree_status_t iree_tooling_append_variant_list_lines(
+    iree_vm_list_t* variant_list, size_t max_element_count,
+    iree_string_builder_t* builder) {
   IREE_TRACE_ZONE_BEGIN(z0);
   for (iree_host_size_t i = 0; i < iree_vm_list_size(variant_list); ++i) {
     iree_vm_variant_t variant = iree_vm_variant_empty();
@@ -342,12 +386,13 @@
   return iree_ok_status();
 }
 
-iree_status_t iree_print_variant_list(iree_vm_list_t* variant_list,
-                                      size_t max_element_count, FILE* file) {
+iree_status_t iree_tooling_variant_list_fprint(iree_vm_list_t* variant_list,
+                                               size_t max_element_count,
+                                               FILE* file) {
   iree_string_builder_t builder;
   iree_string_builder_initialize(iree_allocator_system(), &builder);
-  iree_status_t status =
-      iree_append_variant_list(variant_list, max_element_count, &builder);
+  iree_status_t status = iree_tooling_append_variant_list_lines(
+      variant_list, max_element_count, &builder);
   if (iree_status_is_ok(status)) {
     size_t written = fwrite(iree_string_builder_buffer(&builder), 1,
                             iree_string_builder_size(&builder), file);
diff --git a/runtime/src/iree/tooling/vm_util.h b/runtime/src/iree/tooling/vm_util.h
index 728c113..8264254 100644
--- a/runtime/src/iree/tooling/vm_util.h
+++ b/runtime/src/iree/tooling/vm_util.h
@@ -26,11 +26,20 @@
 // described in iree/hal/api.h
 // Uses |device_allocator| to allocate the buffers.
 // The returned variant list must be freed by the caller.
-iree_status_t iree_create_and_parse_to_variant_list(
+iree_status_t iree_tooling_parse_to_variant_list(
     iree_hal_allocator_t* device_allocator, iree_string_view_t* input_strings,
     iree_host_size_t input_strings_count, iree_allocator_t host_allocator,
     iree_vm_list_t** out_list);
 
+// Appends fences to |list| if the invocation model of |function| requires them.
+// If no |wait_fence| is provided then the invocation will begin immediately.
+// The caller must wait on the returned |out_signal_fence| before accessing the
+// contents of any buffers returned from the invocation.
+iree_status_t iree_tooling_append_async_fence_inputs(
+    iree_vm_list_t* list, const iree_vm_function_t* function,
+    iree_hal_device_t* device, iree_hal_fence_t* wait_fence,
+    iree_hal_fence_t** out_signal_fence);
+
 // Appends a variant list of VM scalars and buffers to |builder|.
 // Prints scalars in the format:
 //   value
@@ -38,13 +47,14 @@
 //   [shape]xtype=[value]
 // described in
 // https://github.com/iree-org/iree/tree/main/iree/hal/api.h
-iree_status_t iree_append_variant_list(iree_vm_list_t* variant_list,
-                                       size_t max_element_count,
-                                       iree_string_builder_t* builder);
+iree_status_t iree_tooling_append_variant_list_lines(
+    iree_vm_list_t* variant_list, size_t max_element_count,
+    iree_string_builder_t* builder);
 
 // Prints a variant list to a file.
-iree_status_t iree_print_variant_list(iree_vm_list_t* variant_list,
-                                      size_t max_element_count, FILE* file);
+iree_status_t iree_tooling_variant_list_fprint(iree_vm_list_t* variant_list,
+                                               size_t max_element_count,
+                                               FILE* file);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/runtime/src/iree/tooling/vm_util_cc.cc b/runtime/src/iree/tooling/vm_util_cc.cc
index f338b5b..7f61c97 100644
--- a/runtime/src/iree/tooling/vm_util_cc.cc
+++ b/runtime/src/iree/tooling/vm_util_cc.cc
@@ -21,7 +21,7 @@
     input_string_views[i].data = input_strings[i].data();
     input_string_views[i].size = input_strings[i].size();
   }
-  return iree_create_and_parse_to_variant_list(
+  return iree_tooling_parse_to_variant_list(
       device_allocator, input_string_views.data(), input_string_views.size(),
       host_allocator, out_list);
 }
@@ -30,8 +30,8 @@
                         std::string* out_string) {
   iree_string_builder_t builder;
   iree_string_builder_initialize(iree_allocator_system(), &builder);
-  IREE_RETURN_IF_ERROR(
-      iree_append_variant_list(variant_list, max_element_count, &builder));
+  IREE_RETURN_IF_ERROR(iree_tooling_append_variant_list_lines(
+      variant_list, max_element_count, &builder));
   out_string->assign(iree_string_builder_buffer(&builder),
                      iree_string_builder_size(&builder));
   iree_string_builder_deinitialize(&builder);
diff --git a/runtime/src/iree/tooling/vm_util_cc.h b/runtime/src/iree/tooling/vm_util_cc.h
index d39b2c3..9af168c 100644
--- a/runtime/src/iree/tooling/vm_util_cc.h
+++ b/runtime/src/iree/tooling/vm_util_cc.h
@@ -46,7 +46,8 @@
 // Prints a variant list to stdout.
 inline Status PrintVariantList(iree_vm_list_t* variant_list,
                                size_t max_element_count = 1024) {
-  return iree_print_variant_list(variant_list, max_element_count, stdout);
+  return iree_tooling_variant_list_fprint(variant_list, max_element_count,
+                                          stdout);
 }
 
 }  // namespace iree
diff --git a/runtime/src/iree/vm/list.c b/runtime/src/iree/vm/list.c
index f8b5d55..689b228 100644
--- a/runtime/src/iree/vm/list.c
+++ b/runtime/src/iree/vm/list.c
@@ -66,6 +66,32 @@
 
 IREE_VM_DEFINE_TYPE_ADAPTERS(iree_vm_list, iree_vm_list_t);
 
+static void iree_vm_list_retain_range(iree_vm_list_t* list,
+                                      iree_host_size_t offset,
+                                      iree_host_size_t length) {
+  const iree_host_size_t end = offset + length;  // Exclusive upper bound.
+  switch (list->storage_mode) {
+    case IREE_VM_LIST_STORAGE_MODE_VALUE:
+      break;  // Plain values hold no references to retain.
+    case IREE_VM_LIST_STORAGE_MODE_REF: {
+      iree_vm_ref_t* refs = (iree_vm_ref_t*)list->storage;
+      for (iree_host_size_t i = offset; i < end; ++i) {
+        iree_vm_ref_retain_inplace(&refs[i]);
+      }
+      break;
+    }
+    case IREE_VM_LIST_STORAGE_MODE_VARIANT: {
+      iree_vm_variant_t* variants = (iree_vm_variant_t*)list->storage;
+      for (iree_host_size_t i = offset; i < end; ++i) {
+        if (iree_vm_type_def_is_ref(&variants[i].type)) {  // Skip value slots.
+          iree_vm_ref_retain_inplace(&variants[i].ref);
+        }
+      }
+      break;
+    }
+  }
+}
+
 static void iree_vm_list_reset_range(iree_vm_list_t* list,
                                      iree_host_size_t offset,
                                      iree_host_size_t length) {
@@ -220,6 +246,31 @@
   IREE_TRACE_ZONE_END(z0);
 }
 
+IREE_API_EXPORT iree_status_t
+iree_vm_list_clone(iree_vm_list_t* source, iree_allocator_t host_allocator,
+                   iree_vm_list_t** out_target) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  iree_host_size_t count = iree_vm_list_size(source);
+  iree_vm_type_def_t element_type = iree_vm_list_element_type(source);
+  iree_vm_list_t* target = NULL;  // Created with capacity == source size.
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_vm_list_create(&element_type, count, host_allocator, &target));
+  iree_status_t status = iree_vm_list_resize(target, count);
+  if (iree_status_is_ok(status) && count > 0) {
+    // Shallow copy + retain of refs; skipped if empty as storage may be NULL.
+    memcpy(target->storage, source->storage,
+           target->count * target->element_size);
+    iree_vm_list_retain_range(target, 0, count);
+  }
+  if (iree_status_is_ok(status)) {
+    *out_target = target;
+  } else {
+    iree_vm_list_release(target);
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
 IREE_API_EXPORT void iree_vm_list_retain(iree_vm_list_t* list) {
   iree_vm_ref_object_retain(list, &iree_vm_list_descriptor);
 }
@@ -228,10 +279,9 @@
   iree_vm_ref_object_release(list, &iree_vm_list_descriptor);
 }
 
-IREE_API_EXPORT iree_status_t iree_vm_list_element_type(
-    const iree_vm_list_t* list, iree_vm_type_def_t* out_element_type) {
-  *out_element_type = list->element_type;
-  return iree_ok_status();
+IREE_API_EXPORT iree_vm_type_def_t
+iree_vm_list_element_type(const iree_vm_list_t* list) {
+  return list->element_type;
 }
 
 IREE_API_EXPORT iree_host_size_t
@@ -275,6 +325,14 @@
   return iree_ok_status();
 }
 
+IREE_API_EXPORT void iree_vm_list_clear(iree_vm_list_t* list) {
+  const iree_host_size_t live_count = list->count;
+  if (live_count > 0) {  // Reset every live element before emptying.
+    iree_vm_list_reset_range(list, 0, live_count);
+  }
+  list->count = 0;
+}
+
 static void iree_vm_list_convert_value_type(
     const iree_vm_value_t* source_value, iree_vm_value_type_t target_value_type,
     iree_vm_value_t* out_value) {
diff --git a/runtime/src/iree/vm/list.h b/runtime/src/iree/vm/list.h
index bded73b..361b2a8 100644
--- a/runtime/src/iree/vm/list.h
+++ b/runtime/src/iree/vm/list.h
@@ -66,6 +66,12 @@
     const iree_vm_type_def_t* element_type, iree_host_size_t initial_capacity,
     iree_allocator_t allocator, iree_vm_list_t** out_list);
 
+// Shallowly clones |source| into |out_target|.
+// The resulting list will have its capacity set to the |source| size.
+IREE_API_EXPORT iree_status_t
+iree_vm_list_clone(iree_vm_list_t* source, iree_allocator_t host_allocator,
+                   iree_vm_list_t** out_target);
+
 // Retains the given |list| for the caller.
 IREE_API_EXPORT void iree_vm_list_retain(iree_vm_list_t* list);
 
@@ -73,8 +79,8 @@
 IREE_API_EXPORT void iree_vm_list_release(iree_vm_list_t* list);
 
 // Returns the element type stored in the list.
-IREE_API_EXPORT iree_status_t iree_vm_list_element_type(
-    const iree_vm_list_t* list, iree_vm_type_def_t* out_element_type);
+IREE_API_EXPORT iree_vm_type_def_t
+iree_vm_list_element_type(const iree_vm_list_t* list);
 
 // Returns the capacity of the list in elements.
 IREE_API_EXPORT iree_host_size_t
@@ -95,6 +101,9 @@
 IREE_API_EXPORT iree_status_t iree_vm_list_resize(iree_vm_list_t* list,
                                                   iree_host_size_t new_size);
 
+// Clears the list contents. Equivalent to resizing to 0.
+IREE_API_EXPORT void iree_vm_list_clear(iree_vm_list_t* list);
+
 // Returns the value of the element at the given index.
 // Note that the value type may vary from element to element in variant lists
 // and callers should check the |out_value| type.
diff --git a/runtime/src/iree/vm/list_test.cc b/runtime/src/iree/vm/list_test.cc
index 979ac02..38ef339 100644
--- a/runtime/src/iree/vm/list_test.cc
+++ b/runtime/src/iree/vm/list_test.cc
@@ -89,8 +89,7 @@
   IREE_ASSERT_OK(iree_vm_list_create(&element_type, initial_capacity,
                                      iree_allocator_system(), &list));
 
-  iree_vm_type_def_t queried_element_type;
-  IREE_ASSERT_OK(iree_vm_list_element_type(list, &queried_element_type));
+  iree_vm_type_def_t queried_element_type = iree_vm_list_element_type(list);
   EXPECT_TRUE(iree_vm_type_def_is_value(&queried_element_type));
   EXPECT_EQ(0,
             memcmp(&element_type, &queried_element_type, sizeof(element_type)));
@@ -126,8 +125,7 @@
   IREE_ASSERT_OK(iree_vm_list_create(&element_type, initial_capacity,
                                      iree_allocator_system(), &list));
 
-  iree_vm_type_def_t queried_element_type;
-  IREE_ASSERT_OK(iree_vm_list_element_type(list, &queried_element_type));
+  iree_vm_type_def_t queried_element_type = iree_vm_list_element_type(list);
   EXPECT_TRUE(iree_vm_type_def_is_ref(&queried_element_type));
   EXPECT_EQ(0,
             memcmp(&element_type, &queried_element_type, sizeof(element_type)));
@@ -163,8 +161,7 @@
   IREE_ASSERT_OK(iree_vm_list_create(&element_type, initial_capacity,
                                      iree_allocator_system(), &list));
 
-  iree_vm_type_def_t queried_element_type;
-  IREE_ASSERT_OK(iree_vm_list_element_type(list, &queried_element_type));
+  iree_vm_type_def_t queried_element_type = iree_vm_list_element_type(list);
   EXPECT_TRUE(iree_vm_type_def_is_variant(&queried_element_type));
   EXPECT_LE(initial_capacity, iree_vm_list_capacity(list));
   EXPECT_EQ(0, iree_vm_list_size(list));
@@ -200,6 +197,202 @@
   iree_vm_list_release(list);
 }
 
+// Tests cloning lists of value types.
+TEST_F(VMListTest, CloneValuesEmpty) {
+  // Source: an i32 list with reserved capacity but no elements.
+  iree_vm_type_def_t element_type =
+      iree_vm_type_def_make_value_type(IREE_VM_VALUE_TYPE_I32);
+  iree_host_size_t initial_capacity = 123;
+  iree_vm_list_t* source_list = nullptr;
+  IREE_ASSERT_OK(iree_vm_list_create(&element_type, initial_capacity,
+                                     iree_allocator_system(), &source_list));
+
+  // Clone the (empty) source.
+  iree_vm_list_t* clone_list = nullptr;
+  IREE_ASSERT_OK(
+      iree_vm_list_clone(source_list, iree_allocator_system(), &clone_list));
+
+  // The clone keeps the element type and size and never gains capacity.
+  iree_vm_type_def_t clone_element_type =
+      iree_vm_list_element_type(clone_list);
+  EXPECT_TRUE(iree_vm_type_def_is_value(&clone_element_type));
+  EXPECT_EQ(0,
+            memcmp(&element_type, &clone_element_type, sizeof(element_type)));
+  EXPECT_LE(iree_vm_list_capacity(clone_list),
+            iree_vm_list_capacity(source_list));
+  EXPECT_EQ(iree_vm_list_size(clone_list), iree_vm_list_size(source_list));
+
+  iree_vm_list_release(clone_list);
+  iree_vm_list_release(source_list);
+}
+TEST_F(VMListTest, CloneValues) {
+  // Create a source list of five i32 values: 0, 1, 2, 3, 4.
+  iree_vm_type_def_t element_type =
+      iree_vm_type_def_make_value_type(IREE_VM_VALUE_TYPE_I32);
+  iree_host_size_t initial_capacity = 123;
+  iree_vm_list_t* source_list = nullptr;
+  IREE_ASSERT_OK(iree_vm_list_create(&element_type, initial_capacity,
+                                     iree_allocator_system(), &source_list));
+  IREE_ASSERT_OK(iree_vm_list_resize(source_list, 5));
+  EXPECT_EQ(5, iree_vm_list_size(source_list));
+  for (iree_host_size_t i = 0; i < 5; ++i) {
+    iree_vm_value_t value = iree_vm_value_make_i32((int32_t)i);
+    IREE_ASSERT_OK(iree_vm_list_set_value(source_list, i, &value));
+  }
+
+  // Clone list.
+  iree_vm_list_t* target_list = nullptr;
+  IREE_ASSERT_OK(
+      iree_vm_list_clone(source_list, iree_allocator_system(), &target_list));
+
+  // Verify the contents match, comparing as int32_t just as they were written.
+  EXPECT_EQ(iree_vm_list_size(target_list), iree_vm_list_size(source_list));
+  for (iree_host_size_t i = 0; i < 5; ++i) {
+    iree_vm_value_t value;
+    IREE_ASSERT_OK(iree_vm_list_get_value_as(target_list, i,
+                                             IREE_VM_VALUE_TYPE_I32, &value));
+    EXPECT_EQ(IREE_VM_VALUE_TYPE_I32, value.type);
+    EXPECT_EQ((int32_t)i, value.i32);
+  }
+
+  iree_vm_list_release(source_list);
+  iree_vm_list_release(target_list);
+}
+
+// Tests cloning lists of ref types.
+TEST_F(VMListTest, CloneRefsEmpty) {
+  iree_vm_type_def_t element_type =
+      iree_vm_type_def_make_ref_type(test_a_type_id());
+  iree_vm_list_t* source_list = nullptr;
+  IREE_ASSERT_OK(iree_vm_list_create(&element_type, 8, iree_allocator_system(),
+                                     &source_list));
+
+  // Clone the (empty) source.
+  iree_vm_list_t* clone_list = nullptr;
+  IREE_ASSERT_OK(
+      iree_vm_list_clone(source_list, iree_allocator_system(), &clone_list));
+
+  // The clone keeps the element type and size and never gains capacity.
+  iree_vm_type_def_t clone_element_type =
+      iree_vm_list_element_type(clone_list);
+  EXPECT_TRUE(iree_vm_type_def_is_ref(&clone_element_type));
+  EXPECT_EQ(0,
+            memcmp(&element_type, &clone_element_type, sizeof(element_type)));
+  EXPECT_LE(iree_vm_list_capacity(clone_list),
+            iree_vm_list_capacity(source_list));
+  EXPECT_EQ(iree_vm_list_size(clone_list), iree_vm_list_size(source_list));
+
+  iree_vm_list_release(clone_list);
+  iree_vm_list_release(source_list);
+}
+TEST_F(VMListTest, CloneRefs) {
+  iree_vm_type_def_t element_type =
+      iree_vm_type_def_make_ref_type(test_a_type_id());
+  iree_vm_list_t* source_list = nullptr;
+  IREE_ASSERT_OK(iree_vm_list_create(&element_type, 8, iree_allocator_system(),
+                                     &source_list));
+  IREE_ASSERT_OK(iree_vm_list_resize(source_list, 5));
+  EXPECT_EQ(5, iree_vm_list_size(source_list));
+  for (iree_host_size_t i = 0; i < 5; ++i) {
+    iree_vm_ref_t ref_a = MakeRef<A>((float)i);
+    IREE_ASSERT_OK(iree_vm_list_set_ref_move(source_list, i, &ref_a));
+  }
+
+  // Clone the populated source.
+  iree_vm_list_t* clone_list = nullptr;
+  IREE_ASSERT_OK(
+      iree_vm_list_clone(source_list, iree_allocator_system(), &clone_list));
+
+  // A shallow clone shares its objects with the source, so each slot in both
+  // lists must dereference to the very same pointer.
+  EXPECT_EQ(iree_vm_list_size(clone_list), iree_vm_list_size(source_list));
+  for (iree_host_size_t i = 0; i < 5; ++i) {
+    iree_vm_ref_t source_ref{0};
+    IREE_ASSERT_OK(iree_vm_list_get_ref_assign(source_list, i, &source_ref));
+    EXPECT_TRUE(test_a_isa(source_ref));
+    auto* source_object = test_a_deref(source_ref);
+    iree_vm_ref_t clone_ref{0};
+    IREE_ASSERT_OK(iree_vm_list_get_ref_assign(clone_list, i, &clone_ref));
+    EXPECT_TRUE(test_a_isa(clone_ref));
+    auto* clone_object = test_a_deref(clone_ref);
+    EXPECT_EQ(source_object, clone_object);
+  }
+
+  iree_vm_list_release(source_list);
+  iree_vm_list_release(clone_list);
+}
+
+// Tests cloning lists of variant types.
+TEST_F(VMListTest, CloneVariantsEmpty) {
+  iree_vm_type_def_t element_type = iree_vm_type_def_make_variant_type();
+  iree_vm_list_t* source_list = nullptr;
+  IREE_ASSERT_OK(iree_vm_list_create(&element_type, 10, iree_allocator_system(),
+                                     &source_list));
+
+  // Clone the (empty) source.
+  iree_vm_list_t* clone_list = nullptr;
+  IREE_ASSERT_OK(
+      iree_vm_list_clone(source_list, iree_allocator_system(), &clone_list));
+
+  // The clone keeps the element type and size and never gains capacity.
+  iree_vm_type_def_t clone_element_type =
+      iree_vm_list_element_type(clone_list);
+  EXPECT_TRUE(iree_vm_type_def_is_variant(&clone_element_type));
+  EXPECT_EQ(0,
+            memcmp(&element_type, &clone_element_type, sizeof(element_type)));
+  EXPECT_LE(iree_vm_list_capacity(clone_list),
+            iree_vm_list_capacity(source_list));
+  EXPECT_EQ(iree_vm_list_size(clone_list), iree_vm_list_size(source_list));
+
+  iree_vm_list_release(clone_list);
+  iree_vm_list_release(source_list);
+}
+TEST_F(VMListTest, CloneVariants) {
+  iree_vm_type_def_t element_type = iree_vm_type_def_make_variant_type();
+  iree_vm_list_t* source_list = nullptr;
+  IREE_ASSERT_OK(iree_vm_list_create(&element_type, 10, iree_allocator_system(),
+                                     &source_list));
+  IREE_ASSERT_OK(iree_vm_list_resize(source_list, 10));
+  EXPECT_EQ(10, iree_vm_list_size(source_list));
+  for (iree_host_size_t i = 0; i < 5; ++i) {
+    iree_vm_value_t value = iree_vm_value_make_i32((int32_t)i);
+    IREE_ASSERT_OK(iree_vm_list_set_value(source_list, i, &value));
+  }
+  for (iree_host_size_t i = 5; i < 10; ++i) {
+    iree_vm_ref_t ref_a = MakeRef<A>(static_cast<float>(i));
+    IREE_ASSERT_OK(iree_vm_list_set_ref_move(source_list, i, &ref_a));
+  }
+
+  // Clone list.
+  iree_vm_list_t* target_list = nullptr;
+  IREE_ASSERT_OK(
+      iree_vm_list_clone(source_list, iree_allocator_system(), &target_list));
+
+  // Verify the contents match: values by equality, refs by pointer identity.
+  EXPECT_EQ(iree_vm_list_size(target_list), iree_vm_list_size(source_list));
+  for (iree_host_size_t i = 0; i < 5; ++i) {
+    iree_vm_value_t value;
+    IREE_ASSERT_OK(iree_vm_list_get_value_as(target_list, i,
+                                             IREE_VM_VALUE_TYPE_I32, &value));
+    EXPECT_EQ(IREE_VM_VALUE_TYPE_I32, value.type);
+    EXPECT_EQ((int32_t)i, value.i32);
+  }
+  for (iree_host_size_t i = 5; i < 10; ++i) {
+    iree_vm_ref_t source_ref_a{0};
+    IREE_ASSERT_OK(iree_vm_list_get_ref_assign(source_list, i, &source_ref_a));
+    EXPECT_TRUE(test_a_isa(source_ref_a));
+    auto* source_a = test_a_deref(source_ref_a);
+    iree_vm_ref_t target_ref_a{0};
+    IREE_ASSERT_OK(iree_vm_list_get_ref_assign(target_list, i, &target_ref_a));
+    EXPECT_TRUE(test_a_isa(target_ref_a));
+    auto* target_a = test_a_deref(target_ref_a);
+    EXPECT_EQ(source_a, target_a);
+  }
+
+  iree_vm_list_release(source_list);
+  iree_vm_list_release(target_list);
+}
+
 // Tests capacity reservation.
 TEST_F(VMListTest, Reserve) {
   // Allocate with 0 initial capacity (which may get rounded up).
diff --git a/runtime/src/iree/vm/ref.c b/runtime/src/iree/vm/ref.c
index 7c2a966..77759e0 100644
--- a/runtime/src/iree/vm/ref.c
+++ b/runtime/src/iree/vm/ref.c
@@ -156,6 +156,15 @@
   return iree_ok_status();
 }
 
+IREE_API_EXPORT void iree_vm_ref_retain_inplace(iree_vm_ref_t* ref) {
+  // Nothing to retain when |ref| holds no pointer.
+  if (!ref->ptr) return;
+  volatile iree_atomic_ref_count_t* ref_counter =
+      iree_vm_get_ref_counter_ptr(ref);
+  iree_atomic_ref_count_inc(ref_counter);
+  iree_vm_ref_trace("RETAIN", ref);
+}
+
 IREE_API_EXPORT void iree_vm_ref_retain(iree_vm_ref_t* ref,
                                         iree_vm_ref_t* out_ref) {
   // NOTE: ref and out_ref may alias or be nested so we retain before we
diff --git a/runtime/src/iree/vm/ref.h b/runtime/src/iree/vm/ref.h
index 1fff332..4351da9 100644
--- a/runtime/src/iree/vm/ref.h
+++ b/runtime/src/iree/vm/ref.h
@@ -185,6 +185,9 @@
 }
 
 // Retains the reference-counted pointer |ref|.
+IREE_API_EXPORT void iree_vm_ref_retain_inplace(iree_vm_ref_t* ref);
+
+// Retains the reference-counted pointer |ref| into |out_ref|.
 // |out_ref| will be released if it already contains a reference.
 IREE_API_EXPORT void iree_vm_ref_retain(iree_vm_ref_t* ref,
                                         iree_vm_ref_t* out_ref);
diff --git a/tools/BUILD b/tools/BUILD
index aaf20b6..b94d385 100644
--- a/tools/BUILD
+++ b/tools/BUILD
@@ -31,6 +31,7 @@
         "//runtime/src/iree/base:tracing",
         "//runtime/src/iree/base/internal:flags",
         "//runtime/src/iree/hal",
+        "//runtime/src/iree/modules/hal:types",
         "//runtime/src/iree/tooling:context_util",
         "//runtime/src/iree/tooling:device_util",
         "//runtime/src/iree/tooling:vm_util_cc",
@@ -133,6 +134,7 @@
         "//runtime/src/iree/base:tracing",
         "//runtime/src/iree/base/internal:flags",
         "//runtime/src/iree/hal",
+        "//runtime/src/iree/modules/hal:types",
         "//runtime/src/iree/tooling:context_util",
         "//runtime/src/iree/tooling:device_util",
         "//runtime/src/iree/tooling:vm_util_cc",
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
index 3f8282e..4189a65 100644
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt
@@ -51,6 +51,7 @@
     iree::base::internal::flags
     iree::base::tracing
     iree::hal
+    iree::modules::hal::types
     iree::tooling::context_util
     iree::tooling::device_util
     iree::tooling::vm_util_cc
@@ -257,6 +258,7 @@
       iree::compiler::Tools::init_passes_and_dialects
       iree::compiler::Tools::init_targets
       iree::hal
+      iree::modules::hal::types
       iree::tooling::context_util
       iree::tooling::device_util
       iree::tooling::vm_util_cc
diff --git a/tools/iree-benchmark-module-main.cc b/tools/iree-benchmark-module-main.cc
index 0ff459a..296b7a6 100644
--- a/tools/iree-benchmark-module-main.cc
+++ b/tools/iree-benchmark-module-main.cc
@@ -66,6 +66,7 @@
 #include "iree/base/status_cc.h"
 #include "iree/base/tracing.h"
 #include "iree/hal/api.h"
+#include "iree/modules/hal/types.h"
 #include "iree/tooling/context_util.h"
 #include "iree/tooling/vm_util_cc.h"
 #include "iree/vm/api.h"
@@ -77,10 +78,12 @@
 
 // TODO(hanchung): Extract the batch size using
 // iree_vm_function_lookup_attr_by_name.
-IREE_FLAG(
-    int32_t, batch_size, 1,
-    "The number of batch size, which is expected to match "
-    "iree-hal-benchmark-dispatch-repeat-count when translating the module");
+IREE_FLAG(int32_t, batch_size, 1,
+          "Number of invocations per iteration, which for dispatch benchmarks "
+          "must match the --iree-hal-benchmark-dispatch-repeat-count value "
+          "used during compilation.");
+IREE_FLAG(int32_t, batch_concurrency, 1,
+          "Number of invocations within a batch that should run concurrently.");
 
 IREE_FLAG(string, entry_function, "",
           "Name of a function contained in the module specified by module_file "
@@ -178,7 +181,8 @@
 namespace {
 
 static void BenchmarkGenericFunction(const std::string& benchmark_name,
-                                     int batch_size, iree_vm_context_t* context,
+                                     int32_t batch_size,
+                                     iree_vm_context_t* context,
                                      iree_vm_function_t function,
                                      iree_vm_list_t* inputs,
                                      benchmark::State& state) {
@@ -206,10 +210,9 @@
                               iree_vm_function_t function,
                               iree_vm_list_t* inputs) {
   auto benchmark_name = "BM_" + function_name;
-  int batch_size = FLAG_batch_size;
+  int32_t batch_size = FLAG_batch_size;
   benchmark::RegisterBenchmark(benchmark_name.c_str(),
-                               [benchmark_name, batch_size, context, function,
-                                inputs](benchmark::State& state) -> void {
+                               [=](benchmark::State& state) -> void {
                                  BenchmarkGenericFunction(
                                      benchmark_name, batch_size, context,
                                      function, inputs, state);
@@ -225,6 +228,155 @@
                                   : benchmark::kMillisecond);
 }
 
+// Runs |batch_size| invocations (rounded up to a multiple of the concurrency)
+// as |batch_concurrency| concurrent chains of pipelined invocations. Example:
+//   batch_size=1, concurrency=1:
+//     [invocation 0]
+//   batch_size=2, concurrency=1:
+//     [invocation 0] -> [invocation 1]
+//   batch_size=2, concurrency=2:
+//     [invocation 0]
+//     [invocation 1]
+//   batch_size=4, concurrency=2:
+//     [invocation 0] -> [invocation 2]
+//     [invocation 1] -> [invocation 3]
+static void BenchmarkAsyncFunction(
+    const std::string& benchmark_name, int32_t batch_size,
+    int32_t batch_concurrency, iree_hal_device_t* device,
+    iree_vm_context_t* context, iree_vm_function_t function,
+    iree_vm_list_t* common_inputs, benchmark::State& state) {
+  IREE_TRACE_SCOPE_DYNAMIC(benchmark_name.c_str());
+  IREE_TRACE_FRAME_MARK();
+  iree_allocator_t host_allocator = iree_allocator_system();
+
+  // Clamp flag values to >= 1 (avoids divide-by-zero) and round the batch size
+  // up to a multiple of the concurrency, which need not be a power of two.
+  if (batch_concurrency < 1) batch_concurrency = 1;
+  if (batch_size < 1) batch_size = 1;
+  const int32_t minibatch_count =
+      (batch_size + batch_concurrency - 1) / batch_concurrency;
+  batch_size = minibatch_count * batch_concurrency;
+
+  // Benchmarking loop.
+  while (state.KeepRunningBatch(batch_size)) {
+    IREE_TRACE_SCOPE0("BenchmarkIteration");
+    IREE_TRACE_FRAME_MARK_NAMED("Iteration");
+    state.PauseTiming();
+    IREE_TRACE_ZONE_BEGIN_NAMED(z_begin, "PrepareBatch");
+
+    // Each concurrent track of execution gets its own semaphore.
+    std::vector<vm::ref<iree_hal_semaphore_t>> timeline_semaphores;
+    for (int32_t i = 0; i < batch_concurrency; ++i) {
+      vm::ref<iree_hal_semaphore_t> semaphore;
+      IREE_CHECK_OK(iree_hal_semaphore_create(device, 0ull, &semaphore));
+      timeline_semaphores.push_back(std::move(semaphore));
+    }
+
+    // Preallocate fences and I/O for each invocation.
+    // The same inputs are used for each but we need a unique list to hold the
+    // unique fences. Each fence represents when the invocation has completed.
+    std::vector<vm::ref<iree_hal_fence_t>> invocation_fences;
+    std::vector<vm::ref<iree_vm_list_t>> invocation_inputs;
+    std::vector<vm::ref<iree_vm_list_t>> invocation_outputs;
+    vm::ref<iree_hal_fence_t> completion_fence;
+    IREE_CHECK_OK(iree_hal_fence_create(batch_concurrency, host_allocator,
+                                        &completion_fence));
+    for (int32_t i = 0; i < minibatch_count; ++i) {
+      for (int32_t j = 0; j < batch_concurrency; ++j) {
+        // Chain each concurrent minibatch to the previous. Note that to start
+        // we wait on nothing and begin executing immediately.
+        vm::ref<iree_hal_fence_t> wait_fence;
+        if (i > 0) {
+          wait_fence = vm::retain_ref(
+              invocation_fences[(i - 1) * batch_concurrency + j]);
+        }
+        uint64_t signal_value = i + 1;
+        vm::ref<iree_hal_fence_t> signal_fence;
+        IREE_CHECK_OK(iree_hal_fence_create_at(timeline_semaphores[j].get(),
+                                               signal_value, host_allocator,
+                                               &signal_fence));
+        invocation_fences.push_back(vm::retain_ref(signal_fence));
+
+        // Join the final minibatch on the completion fence.
+        if (i == minibatch_count - 1) {
+          IREE_CHECK_OK(iree_hal_fence_insert(completion_fence.get(),
+                                              timeline_semaphores[j].get(),
+                                              signal_value));
+        }
+
+        // Clone the common inputs, or start from an empty list when none were
+        // provided (null), then add the invocation-specific fences.
+        vm::ref<iree_vm_list_t> inputs;
+        IREE_CHECK_OK(
+            common_inputs
+                ? iree_vm_list_clone(common_inputs, host_allocator, &inputs)
+                : iree_vm_list_create(/*element_type=*/nullptr, 2,
+                                      host_allocator, &inputs));
+        IREE_CHECK_OK(iree_vm_list_push_ref_move(inputs.get(), wait_fence));
+        IREE_CHECK_OK(iree_vm_list_push_ref_move(inputs.get(), signal_fence));
+        invocation_inputs.push_back(std::move(inputs));
+
+        // Setup empty outputs.
+        vm::ref<iree_vm_list_t> outputs;
+        IREE_CHECK_OK(iree_vm_list_create(/*element_type=*/nullptr, 16,
+                                          host_allocator, &outputs));
+        invocation_outputs.push_back(std::move(outputs));
+      }
+    }
+    IREE_TRACE_ZONE_END(z_begin);
+    state.ResumeTiming();
+
+    // TODO(benvanik): replace with async invocations. Today if the invocation
+    // performs any waits this will block on the initial invoke instead of
+    // actually overlapping things.
+    for (int32_t i = 0; i < batch_size; ++i) {
+      IREE_CHECK_OK(
+          iree_vm_invoke(context, function, IREE_VM_INVOCATION_FLAG_NONE,
+                         /*policy=*/nullptr, invocation_inputs[i].get(),
+                         invocation_outputs[i].get(), host_allocator));
+    }
+    IREE_CHECK_OK(
+        iree_hal_fence_wait(completion_fence.get(), iree_infinite_timeout()));
+
+    state.PauseTiming();
+    IREE_TRACE_ZONE_BEGIN_NAMED(z_end, "CleanupBatch");
+    for (auto& outputs : invocation_outputs) iree_vm_list_clear(outputs.get());
+    invocation_fences.clear();
+    invocation_inputs.clear();
+    invocation_outputs.clear();
+    completion_fence.reset();
+    timeline_semaphores.clear();
+    IREE_TRACE_ZONE_END(z_end);
+    state.ResumeTiming();
+  }
+  state.SetItemsProcessed(state.iterations());
+}
+
+void RegisterAsyncBenchmark(const std::string& function_name,
+                            iree_hal_device_t* device,
+                            iree_vm_context_t* context,
+                            iree_vm_function_t function,
+                            iree_vm_list_t* inputs) {
+  // The closure copies the name and flag snapshots taken here.
+  std::string benchmark_name = "BM_" + function_name;
+  const int32_t batch_size = FLAG_batch_size;
+  const int32_t batch_concurrency = FLAG_batch_concurrency;
+  auto* registration = benchmark::RegisterBenchmark(
+      benchmark_name.c_str(), [=](benchmark::State& state) -> void {
+        BenchmarkAsyncFunction(benchmark_name, batch_size, batch_concurrency,
+                               device, context, function, inputs, state);
+      });
+  // Only the main thread is counted in CPU time by default; count every
+  // thread in the process instead.
+  registration->MeasureProcessCPUTime();
+  // Wall time decides the iteration count so that single- and multi-threaded
+  // benchmarks stay comparable. See
+  // https://github.com/google/benchmark#cpu-timers
+  registration->UseRealTime();
+  registration->Unit(FLAG_time_unit.first ? FLAG_time_unit.second
+                                          : benchmark::kMillisecond);
+}
+
 static void BenchmarkDispatchFunction(const std::string& benchmark_name,
                                       iree_vm_context_t* context,
                                       iree_vm_function_t function,
@@ -350,7 +502,18 @@
         iree::span<const std::string>{FLAG_function_inputs.data(),
                                       FLAG_function_inputs.size()},
         iree_vm_instance_allocator(instance_), &inputs_));
-    RegisterGenericBenchmark(function_name, context_, function, inputs_.get());
+
+    iree_string_view_t invocation_model = iree_vm_function_lookup_attr_by_name(
+        &function, IREE_SV("iree.abi.model"));
+    if (iree_string_view_equal(invocation_model, IREE_SV("coarse-fences"))) {
+      // Asynchronous invocation.
+      iree::RegisterAsyncBenchmark(function_name, device_, context_, function,
+                                   inputs_.get());
+    } else {
+      // Synchronous invocation.
+      iree::RegisterGenericBenchmark(function_name, context_, function,
+                                     inputs_.get());
+    }
     return iree_ok_status();
   }
 
@@ -387,22 +550,37 @@
           continue;
         }
 
+        // Query function information to determine how to run it.
         iree_vm_function_signature_t signature =
             iree_vm_function_signature(&function);
         iree_host_size_t argument_count = 0;
         iree_host_size_t result_count = 0;
         IREE_RETURN_IF_ERROR(iree_vm_function_call_count_arguments_and_results(
             &signature, &argument_count, &result_count));
-        if (argument_count) {
-          // Only functions with no inputs are run (because we can't pass
-          // anything).
-          continue;
+        iree_string_view_t invocation_model =
+            iree_vm_function_lookup_attr_by_name(&function,
+                                                 IREE_SV("iree.abi.model"));
+        if (iree_string_view_equal(invocation_model,
+                                   IREE_SV("coarse-fences"))) {
+          // Asynchronous invocation with coarse fences. Expect just those.
+          if (argument_count == 2) {
+            // Only functions taking a (wait, signal) fence pair are run.
+            iree::RegisterAsyncBenchmark(
+                std::string(function_name.data, function_name.size), device_,
+                context_, function,
+                /*inputs=*/nullptr);
+          }
+        } else {
+          // Basic synchronous invocation.
+          if (argument_count == 0) {
+            // Only functions with no inputs are run (because we can't pass
+            // anything).
+            iree::RegisterGenericBenchmark(
+                std::string(function_name.data, function_name.size), context_,
+                function,
+                /*inputs=*/nullptr);
+          }
         }
-
-        iree::RegisterGenericBenchmark(
-            std::string(function_name.data, function_name.size), context_,
-            function,
-            /*inputs=*/nullptr);
       }
     }
     return iree_ok_status();
diff --git a/tools/iree-e2e-matmul-test.c b/tools/iree-e2e-matmul-test.c
index 75f09b8..8f282e6 100644
--- a/tools/iree-e2e-matmul-test.c
+++ b/tools/iree-e2e-matmul-test.c
@@ -172,8 +172,7 @@
 static iree_status_t copy_device_buffer_views_to_host(
     iree_hal_device_t* device, iree_hal_allocator_t* hal_allocator,
     iree_vm_list_t* src, iree_vm_list_t** dst) {
-  iree_vm_type_def_t elem_type;
-  IREE_RETURN_IF_ERROR(iree_vm_list_element_type(src, &elem_type));
+  iree_vm_type_def_t elem_type = iree_vm_list_element_type(src);
   iree_host_size_t size = iree_vm_list_size(src);
   iree_allocator_t allocator = iree_hal_allocator_host_allocator(hal_allocator);
   IREE_RETURN_IF_ERROR(iree_vm_list_create(&elem_type, size, allocator, dst));
@@ -816,8 +815,7 @@
 static iree_status_t mask_and_copy_device_buffer_views_to_device(
     iree_hal_device_t* device, iree_hal_allocator_t* hal_allocator,
     iree_vm_list_t* src_list, matrix_mask_t* mask, iree_vm_list_t** dst_list) {
-  iree_vm_type_def_t elem_type;
-  IREE_RETURN_IF_ERROR(iree_vm_list_element_type(src_list, &elem_type));
+  iree_vm_type_def_t elem_type = iree_vm_list_element_type(src_list);
   iree_host_size_t size = iree_vm_list_size(src_list);
   iree_allocator_t allocator = iree_hal_allocator_host_allocator(hal_allocator);
   IREE_RETURN_IF_ERROR(
diff --git a/tools/iree-run-mlir-main.cc b/tools/iree-run-mlir-main.cc
index 3f2d9ca..0fd1518 100644
--- a/tools/iree-run-mlir-main.cc
+++ b/tools/iree-run-mlir-main.cc
@@ -49,6 +49,7 @@
 #include "iree/compiler/Tools/init_dialects.h"
 #include "iree/compiler/Tools/init_targets.h"
 #include "iree/hal/api.h"
+#include "iree/modules/hal/types.h"
 #include "iree/tooling/context_util.h"
 #include "iree/tooling/device_util.h"
 #include "iree/tooling/vm_util_cc.h"
@@ -314,11 +315,12 @@
 }
 
 // Evaluates a single function in its own fiber, printing the results to stdout.
-Status EvaluateFunction(iree_vm_context_t* context,
+Status EvaluateFunction(iree_vm_context_t* context, iree_hal_device_t* device,
                         iree_hal_allocator_t* device_allocator,
                         iree_vm_function_t function,
                         iree_string_view_t function_name) {
   IREE_TRACE_SCOPE();
+  iree_allocator_t host_allocator = iree_allocator_system();
 
   printf("EXEC @%.*s\n", (int)function_name.size, function_name.data);
 
@@ -328,18 +330,28 @@
       device_allocator,
       iree::span<const std::string>{FLAG_function_inputs.data(),
                                     FLAG_function_inputs.size()},
-      iree_allocator_system(), &inputs));
+      host_allocator, &inputs));
+
+  // If the function is async add fences so we can invoke it synchronously.
+  vm::ref<iree_hal_fence_t> finish_fence;
+  IREE_RETURN_IF_ERROR(iree_tooling_append_async_fence_inputs(
+      inputs.get(), &function, device, /*wait_fence=*/NULL, &finish_fence));
 
   // Prepare outputs list to accept the results from the invocation.
   vm::ref<iree_vm_list_t> outputs;
   IREE_RETURN_IF_ERROR(iree_vm_list_create(/*element_type=*/nullptr, 16,
-                                           iree_allocator_system(), &outputs));
+                                           host_allocator, &outputs));
 
   // Synchronously invoke the function.
-  IREE_RETURN_IF_ERROR(iree_vm_invoke(context, function,
-                                      IREE_VM_INVOCATION_FLAG_NONE,
-                                      /*policy=*/nullptr, inputs.get(),
-                                      outputs.get(), iree_allocator_system()));
+  IREE_RETURN_IF_ERROR(iree_vm_invoke(
+      context, function, IREE_VM_INVOCATION_FLAG_NONE,
+      /*policy=*/nullptr, inputs.get(), outputs.get(), host_allocator));
+
+  // If the function is async we need to wait for it to complete.
+  if (!!finish_fence) {
+    IREE_RETURN_IF_ERROR(
+        iree_hal_fence_wait(finish_fence.get(), iree_infinite_timeout()));
+  }
 
   // Print outputs.
   IREE_RETURN_IF_ERROR(PrintVariantList(outputs.get()));
@@ -398,9 +410,9 @@
         iree_allocator_system(), &context, &device, &device_allocator));
 
     // Invoke the function and print results.
-    IREE_RETURN_IF_ERROR(
-        EvaluateFunction(context, device_allocator, function, function_name),
-        "evaluating export function %d", ordinal);
+    IREE_RETURN_IF_ERROR(EvaluateFunction(context, device, device_allocator,
+                                          function, function_name),
+                         "evaluating export function %d", ordinal);
 
     iree_vm_context_release(context);
     iree_hal_allocator_release(device_allocator);
diff --git a/tools/iree-run-module-main.cc b/tools/iree-run-module-main.cc
index 08d0687..00dc927 100644
--- a/tools/iree-run-module-main.cc
+++ b/tools/iree-run-module-main.cc
@@ -127,6 +127,11 @@
                                     FLAG_function_inputs.size()},
       host_allocator, &inputs));
 
+  // If the function is async add fences so we can invoke it synchronously.
+  vm::ref<iree_hal_fence_t> finish_fence;
+  IREE_RETURN_IF_ERROR(iree_tooling_append_async_fence_inputs(
+      inputs.get(), &function, device, /*wait_fence=*/NULL, &finish_fence));
+
   vm::ref<iree_vm_list_t> outputs;
   IREE_RETURN_IF_ERROR(iree_vm_list_create(/*element_type=*/nullptr, 16,
                                            host_allocator, &outputs));
@@ -138,6 +143,12 @@
                      host_allocator),
       "invoking function '%s'", function_name.c_str());
 
+  // If the function is async we need to wait for it to complete.
+  if (!!finish_fence) {
+    IREE_RETURN_IF_ERROR(
+        iree_hal_fence_wait(finish_fence.get(), iree_infinite_timeout()));
+  }
+
   if (FLAG_expected_outputs.empty()) {
     IREE_RETURN_IF_ERROR(
         PrintVariantList(outputs.get(), (size_t)FLAG_print_max_element_count),