Adding hal.dispatch.extern op. (#15193)

This is an inline dispatch op on tensors that can be inserted in source
programs to fully specify an externally derived dispatch region.
Improvements were made for external executables in flow/stream/hal.

There's a decent amount of cruft on these paths that'll need to be
cleaned up; particularly MaterializeInterfaces has long been in need of
a reworking (likely decomposing with the assistance of a new op or two)
and it's not great that flow now has handling of a HAL pseudo-op. The
stream convert HAL->stream which handles hal.tensor.import/export would
be a better place but executable deduplication happens in flow today and
we need inlined dispatches like this to deduplicate. I toyed with also
having a DeduplicateExecutables in stream but decided against it for
today.
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/ConvertToSPIRVPass.cpp b/compiler/src/iree/compiler/Codegen/SPIRV/ConvertToSPIRVPass.cpp
index 8cd88bb..83de13f 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/ConvertToSPIRVPass.cpp
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/ConvertToSPIRVPass.cpp
@@ -367,6 +367,9 @@
   MLIRContext *context = &getContext();
   ModuleOp moduleOp = getOperation();
 
+  if (moduleOp.getBody()->empty())
+    return;
+
   llvm::StringMap<IREE::HAL::ExecutableExportOp> exportOps =
       getAllEntryPoints(moduleOp);
   for (auto funcOp : moduleOp.getOps<func::FuncOp>()) {
diff --git a/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.cpp b/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.cpp
index cd2876e..0fc10cf 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.cpp
+++ b/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.cpp
@@ -63,14 +63,19 @@
   // the workload here matches what is expected.
   if (!exportOp.getWorkgroupCount().empty()) {
     auto &workgroupCount = exportOp.getWorkgroupCount();
-    if (workgroupCount.getNumArguments() != workload.size()) {
+    auto explicitArgs = llvm::make_filter_range(
+        workgroupCount.getArgumentTypes(), [](Type type) {
+          return !type.hasTrait<
+              mlir::OpTrait::IREE::Util::ImplicitlyCaptured>();
+        });
+    if (llvm::range_size(explicitArgs) != workload.size()) {
       return op->emitOpError()
              << "workload mismatch; entry point expects "
-             << workgroupCount.getNumArguments()
+             << llvm::range_size(explicitArgs)
              << " arguments but dispatch provides " << workload.size();
     }
-    for (auto [index, expectedType, actualType] : llvm::enumerate(
-             workgroupCount.getArgumentTypes(), workload.getTypes())) {
+    for (auto [index, expectedType, actualType] :
+         llvm::enumerate(explicitArgs, workload.getTypes())) {
       if (expectedType != actualType) {
         return op->emitOpError()
                << "workload operand " << index << " type mismatch; expected "
diff --git a/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.td b/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.td
index f9e76a8..c15da61 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.td
+++ b/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.td
@@ -660,7 +660,7 @@
     // TODO(benvanik): add compatibility and versioning attributes.
   );
 
-  let regions = (region SizedRegion<1>:$body);
+  let regions = (region AnyRegion:$body);
 
   let assemblyFormat = [{
     custom<SymbolVisibility>($sym_visibility)
@@ -676,9 +676,10 @@
 
   let extraClassDeclaration = [{
     Block& getBlock() { return getBody().front(); }
-
     ::mlir::ModuleOp getInnerModule() {
-      return *getBlock().getOps<::mlir::ModuleOp>().begin();
+      auto it = getBlock().getOps<::mlir::ModuleOp>();
+      if (it.empty()) return {};
+      return *it.begin();
     }
   }];
 
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/AnnotateDispatches.cpp b/compiler/src/iree/compiler/Dialect/Flow/Transforms/AnnotateDispatches.cpp
index 1d5d4e6..30e32aa 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/AnnotateDispatches.cpp
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/AnnotateDispatches.cpp
@@ -326,16 +326,17 @@
     DenseMap<Attribute, SymbolRefAttr> entryPointRefReplacements;
     for (auto executableOp :
          getOperation().getBody()->getOps<IREE::Flow::ExecutableOp>()) {
-      // Rename each export op.
+      auto innerModuleOp = executableOp.getInnerModule();
+      if (!innerModuleOp)
+        continue;
       for (auto exportOp :
            executableOp.getBlock().getOps<ExecutableExportOp>()) {
         auto oldSymbolRefAttr = SymbolRefAttr::get(
             &getContext(), executableOp.getName(),
             {SymbolRefAttr::get(&getContext(), exportOp.getSymName())});
 
-        auto funcOp =
-            executableOp.getInnerModule().lookupSymbol<FunctionOpInterface>(
-                exportOp.getFunctionRef());
+        auto funcOp = innerModuleOp.lookupSymbol<FunctionOpInterface>(
+            exportOp.getFunctionRef());
         if (!funcOp)
           continue; // extern module, maybe
         std::string summary = summarizeDispatchRegion(funcOp.getFunctionBody());
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/DumpDispatchGraph.cpp b/compiler/src/iree/compiler/Dialect/Flow/Transforms/DumpDispatchGraph.cpp
index b741e57..1603d15 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/DumpDispatchGraph.cpp
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/DumpDispatchGraph.cpp
@@ -385,6 +385,8 @@
 
     auto calleeNameAttr = entryPoint.getLeafReference();
     auto innerModule = executableOp.getInnerModule();
+    if (!innerModule)
+      return;
     auto funcOps = innerModule.getOps<func::FuncOp>();
     auto funcIt = llvm::find_if(funcOps, [&](func::FuncOp op) {
       return op.getNameAttr() == calleeNameAttr;
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/OutlineDispatchRegions.cpp b/compiler/src/iree/compiler/Dialect/Flow/Transforms/OutlineDispatchRegions.cpp
index f90df6a..c8d097a 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/OutlineDispatchRegions.cpp
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/OutlineDispatchRegions.cpp
@@ -9,6 +9,7 @@
 #include "iree/compiler/Dialect/Flow/IR/FlowOps.h"
 #include "iree/compiler/Dialect/Flow/Transforms/PassDetail.h"
 #include "iree/compiler/Dialect/Flow/Transforms/Passes.h"
+#include "iree/compiler/Dialect/HAL/IR/HALOps.h"
 #include "iree/compiler/Dialect/Util/IR/UtilOps.h"
 #include "llvm/Support/Debug.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
@@ -25,11 +26,15 @@
 namespace Flow {
 namespace {
 
+//===----------------------------------------------------------------------===//
+// flow.dispatch.workgroups
+//===----------------------------------------------------------------------===//
+
 // Creates a flow.executable out of a set of functions, pulling in all other
 // functions reachable by the provided functions.
 static ExecutableOp createExecutable(Location loc, StringRef executableName,
                                      ArrayRef<mlir::func::FuncOp> funcOps,
-                                     ModuleOp parentModuleOp) {
+                                     mlir::ModuleOp parentModuleOp) {
   assert(!funcOps.empty() && "must have at least one entry function");
 
   // Create the executable that will contain the outlined region.
@@ -58,30 +63,30 @@
 }
 
 // Converts a dispatch region op into a dispatch op to the outlined region.
-static LogicalResult convertToDispatchOp(DispatchWorkgroupsOp regionOp,
-                                         ExecutableOp executableOp,
-                                         ExecutableExportOp exportOp) {
+static LogicalResult convertDispatchWorkgroupsToDispatchOp(
+    IREE::Flow::DispatchWorkgroupsOp dispatchWorkgroupsOp,
+    IREE::Flow::ExecutableOp executableOp,
+    IREE::Flow::ExecutableExportOp exportOp) {
   // Insert at the same place as the original region.
-  OpBuilder builder(regionOp);
+  OpBuilder builder(dispatchWorkgroupsOp);
 
   // Create the dispatch op to the executable function.
   // Note that we copy the tied operand indices from the workgroups op - it
   // lines up 1:1 with the dispatch once we've outlined things.
-  auto dispatchOp = builder.create<DispatchOp>(
-      regionOp.getLoc(), exportOp, regionOp.getWorkload(),
-      regionOp.getResultTypes(), regionOp.getResultDims(),
-      regionOp.getArguments(), regionOp.getArgumentDims(),
-      regionOp.getTiedOperandsAttr());
-  dispatchOp->setDialectAttrs(regionOp->getDialectAttrs());
+  auto dispatchOp = builder.create<IREE::Flow::DispatchOp>(
+      dispatchWorkgroupsOp.getLoc(), exportOp,
+      dispatchWorkgroupsOp.getWorkload(), dispatchWorkgroupsOp.getResultTypes(),
+      dispatchWorkgroupsOp.getResultDims(), dispatchWorkgroupsOp.getArguments(),
+      dispatchWorkgroupsOp.getArgumentDims(),
+      dispatchWorkgroupsOp.getTiedOperandsAttr());
+  dispatchOp->setDialectAttrs(dispatchWorkgroupsOp->getDialectAttrs());
 
   // Replace uses of the existing results with the new results.
-  for (int i = 0; i < regionOp.getNumResults(); ++i) {
-    regionOp.getResult(i).replaceAllUsesWith(dispatchOp.getResult(i));
+  for (int i = 0; i < dispatchWorkgroupsOp.getNumResults(); ++i) {
+    dispatchWorkgroupsOp.getResult(i).replaceAllUsesWith(
+        dispatchOp.getResult(i));
   }
 
-  // Erase original region.
-  regionOp.erase();
-
   return success();
 }
 
@@ -113,39 +118,110 @@
 
 // Outlines a dispatch region into a flow.executable and replaces the region op
 // with a dispatch to that outlined executable.
-static LogicalResult
-outlineDispatchWorkgroupsOp(std::string executableOpName,
-                            std::string exportOpName,
-                            DispatchWorkgroupsOp regionOp) {
+static LogicalResult outlineDispatchWorkgroupsOp(
+    std::string name, IREE::Flow::DispatchWorkgroupsOp dispatchWorkgroupsOp) {
   // Convert the region to a free-floating function.
-  auto workgroupFuncOp = createWorkgroupFunc(regionOp.getLoc(), exportOpName,
-                                             regionOp.getWorkgroupBody());
+  auto workgroupFuncOp =
+      createWorkgroupFunc(dispatchWorkgroupsOp.getLoc(), name,
+                          dispatchWorkgroupsOp.getWorkgroupBody());
   if (!workgroupFuncOp) {
     return failure();
   }
 
   // Create the executable with the region cloned into it.
-  auto parentFuncOp = regionOp->getParentOfType<FunctionOpInterface>();
+  auto parentFuncOp =
+      dispatchWorkgroupsOp->getParentOfType<FunctionOpInterface>();
   auto executableOp =
-      createExecutable(regionOp.getLoc(), executableOpName, {workgroupFuncOp},
+      createExecutable(dispatchWorkgroupsOp.getLoc(), name, {workgroupFuncOp},
                        parentFuncOp->getParentOfType<mlir::ModuleOp>());
   executableOp.getOperation()->moveBefore(parentFuncOp);
   executableOp.setPrivate();
 
   // Add an export pointing at the entry point function.
   OpBuilder builder(executableOp.getBody());
-  auto exportOp = builder.create<ExecutableExportOp>(
-      regionOp.getLoc(), workgroupFuncOp.getName(),
+  auto exportOp = builder.create<IREE::Flow::ExecutableExportOp>(
+      dispatchWorkgroupsOp.getLoc(), workgroupFuncOp.getName(),
       SymbolRefAttr::get(workgroupFuncOp));
+  exportOp->setDialectAttrs(dispatchWorkgroupsOp->getDialectAttrs());
 
   // Move over the workgroup count region, if present.
-  if (!regionOp.getWorkgroupCount().empty()) {
-    exportOp.getWorkgroupCount().takeBody(regionOp.getWorkgroupCount());
+  if (!dispatchWorkgroupsOp.getWorkgroupCount().empty()) {
+    exportOp.getWorkgroupCount().takeBody(
+        dispatchWorkgroupsOp.getWorkgroupCount());
   }
-  exportOp->setDialectAttrs(regionOp->getDialectAttrs());
 
   // Finally convert the dispatch region into a dispatch to the outlined func.
-  return convertToDispatchOp(regionOp, executableOp, exportOp);
+  return convertDispatchWorkgroupsToDispatchOp(dispatchWorkgroupsOp,
+                                               executableOp, exportOp);
+}
+
+//===----------------------------------------------------------------------===//
+// hal.dispatch.extern
+//===----------------------------------------------------------------------===//
+
+// Converts a dispatch region op into a dispatch op to the outlined region.
+static LogicalResult
+convertDispatchExternToDispatchOp(IREE::HAL::DispatchExternOp dispatchExternOp,
+                                  IREE::Flow::ExecutableOp executableOp,
+                                  IREE::Flow::ExecutableExportOp exportOp) {
+  // Insert at the same place as the original region.
+  OpBuilder builder(dispatchExternOp);
+
+  // Create the dispatch op to the executable function.
+  // Note that we copy the tied operand indices from the workgroups op - it
+  // lines up 1:1 with the dispatch once we've outlined things.
+  auto dispatchOp = builder.create<IREE::Flow::DispatchOp>(
+      dispatchExternOp.getLoc(), exportOp, dispatchExternOp.getWorkload(),
+      dispatchExternOp.getResultTypes(), dispatchExternOp.getResultDims(),
+      dispatchExternOp.getArguments(), dispatchExternOp.getArgumentDims(),
+      dispatchExternOp.getTiedOperandsAttr());
+  dispatchOp->setDialectAttrs(dispatchExternOp->getDialectAttrs());
+  if (auto bindingsAttr = dispatchExternOp.getBindingsAttr()) {
+    dispatchOp->setAttr("hal.interface.bindings", bindingsAttr);
+  }
+
+  // Replace uses of the existing results with the new results.
+  for (int i = 0; i < dispatchExternOp.getNumResults(); ++i) {
+    dispatchExternOp.getResult(i).replaceAllUsesWith(dispatchOp.getResult(i));
+  }
+
+  return success();
+}
+
+// Outlines a dispatch region into a flow.executable and replaces the region op
+// with a dispatch to that outlined executable.
+static LogicalResult
+outlineDispatchExternOp(std::string name,
+                        IREE::HAL::DispatchExternOp dispatchExternOp) {
+  // Create the executable that will contain the outlined region.
+  // NOTE: this will get uniquified if we have multiple in the same block.
+  auto parentFuncOp = dispatchExternOp->getParentOfType<FunctionOpInterface>();
+  auto parentModuleOp = parentFuncOp->getParentOfType<mlir::ModuleOp>();
+  OpBuilder parentModuleBuilder(&parentModuleOp.getBody()->back());
+  auto executableOp = parentModuleBuilder.create<IREE::Flow::ExecutableOp>(
+      dispatchExternOp.getLoc(), name);
+  executableOp.getOperation()->moveBefore(parentFuncOp);
+  executableOp.setPrivate();
+  executableOp->setAttr("hal.executable.objects",
+                        dispatchExternOp.getObjectsAttr());
+
+  // Add an export pointing at the entry point function.
+  OpBuilder builder(executableOp.getBody());
+  auto exportOp = builder.create<IREE::Flow::ExecutableExportOp>(
+      dispatchExternOp.getLoc(), dispatchExternOp.getExport(),
+      FlatSymbolRefAttr::get(builder.getContext(),
+                             dispatchExternOp.getExport()));
+  exportOp->setDialectAttrs(dispatchExternOp->getDialectAttrs());
+  exportOp->setAttr("hal.interface.layout", dispatchExternOp.getLayoutAttr());
+
+  // Move over the workgroup count region, if present.
+  if (!dispatchExternOp.getWorkgroupCount().empty()) {
+    exportOp.getWorkgroupCount().takeBody(dispatchExternOp.getWorkgroupCount());
+  }
+
+  // Finally convert the dispatch region into a dispatch to the outlined func.
+  return convertDispatchExternToDispatchOp(dispatchExternOp, executableOp,
+                                           exportOp);
 }
 
 } // namespace
@@ -154,41 +230,61 @@
     : public OutlineDispatchRegionsBase<OutlineDispatchRegionsPass> {
 public:
   OutlineDispatchRegionsPass() = default;
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<IREE::Flow::FlowDialect>();
+  }
 
   void runOnOperation() override {
     // Convert each dispatch region into a flow.executable + dispatch op.
     int initializerCount = 0;
-    for (auto it :
-         llvm::enumerate(getOperation().getOps<FunctionOpInterface>())) {
-      FunctionOpInterface op = it.value();
-      Operation *operation = op;
-
-      // Generate a nice name if possible.
+    int funcLikeCount = 0;
+    for (auto funcOp : getOperation().getOps<FunctionOpInterface>()) {
+      // Generate a nice name if possible. All ops we outline in the same scope
+      // will have the same root name.
       std::string namePrefix;
-      if (auto funcOp = llvm::dyn_cast<mlir::func::FuncOp>(operation)) {
-        namePrefix = funcOp.getName().str();
-      } else if (llvm::isa<IREE::Util::InitializerOp>(operation)) {
+      if (isa<IREE::Util::InitializerOp>(funcOp)) {
         namePrefix =
             std::string("_initializer_") + std::to_string(initializerCount++);
       } else {
-        namePrefix =
-            std::string("_function_like_") + std::to_string(it.index());
-      }
-
-      llvm::SmallVector<DispatchWorkgroupsOp> dispatchWorkgroupsOps;
-      // Outline all of the dispatch regions ops in this function.
-      op.walk([&](DispatchWorkgroupsOp op) {
-        dispatchWorkgroupsOps.push_back(op);
-      });
-      for (int i = 0; i < dispatchWorkgroupsOps.size(); ++i) {
-        std::string executableOpName =
-            (namePrefix + "_dispatch_" + llvm::Twine(i)).str();
-        if (failed(outlineDispatchWorkgroupsOp(executableOpName,
-                                               executableOpName,
-                                               dispatchWorkgroupsOps[i]))) {
-          return signalPassFailure();
+        namePrefix = funcOp.getName().str();
+        if (namePrefix.empty()) {
+          namePrefix =
+              std::string("_func_like_") + std::to_string(funcLikeCount++);
         }
       }
+
+      // Outline all of the dispatch regions ops in this function.
+      SmallVector<Operation *> deadOps;
+      auto outlineOps = [&](Operation *op) {
+        return TypeSwitch<Operation *, WalkResult>(op)
+            .Case<IREE::Flow::DispatchWorkgroupsOp>(
+                [&](auto dispatchWorkgroupsOp) {
+                  if (failed(outlineDispatchWorkgroupsOp(
+                          (namePrefix + "_dispatch_" +
+                           llvm::Twine(deadOps.size()))
+                              .str(),
+                          dispatchWorkgroupsOp))) {
+                    return WalkResult::interrupt();
+                  }
+                  deadOps.push_back(op);
+                  return WalkResult::advance();
+                })
+            .Case<IREE::HAL::DispatchExternOp>([&](auto dispatchExternOp) {
+              if (failed(outlineDispatchExternOp(
+                      (namePrefix + "_dispatch_" + llvm::Twine(deadOps.size()))
+                          .str(),
+                      dispatchExternOp))) {
+                return WalkResult::interrupt();
+              }
+              deadOps.push_back(op);
+              return WalkResult::advance();
+            })
+            .Default(WalkResult::advance());
+      };
+      if (funcOp.walk(outlineOps).wasInterrupted())
+        return signalPassFailure();
+      for (auto *deadOp : deadOps)
+        deadOp->erase();
     }
   }
 };
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.cpp b/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.cpp
index 2500b43..803e369 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.cpp
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.cpp
@@ -195,8 +195,8 @@
         return createInitializeEmptyTensorsPass(clZeroFillEmptyTensors);
       });
 
-  // Module pass to outline the dispatch regions into their own functions
-  // wrapped in executables.
+  // Module pass to outline dispatch regions (and similar ops) into their own
+  // functions wrapped in executables.
   passManager.addPass(IREE::Flow::createOutlineDispatchRegionsPass());
 
   // Annotate executables based on their contents.
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/annotate_dispatches.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/annotate_dispatches.mlir
index 669271f..ec6e497 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/annotate_dispatches.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/annotate_dispatches.mlir
@@ -164,3 +164,12 @@
     }
   }
 }
+
+// -----
+
+// Executables with no contents are ignored.
+
+flow.executable private @ex {
+  // CHECK: flow.executable.export public @dispatch
+  flow.executable.export public @dispatch
+}
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/outline_dispatch_regions.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/outline_dispatch_regions.mlir
index 4462685..56f05a5 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/outline_dispatch_regions.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/outline_dispatch_regions.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --allow-unregistered-dialect --split-input-file --iree-flow-outline-dispatch-regions %s | FileCheck %s
+// RUN: iree-opt --allow-unregistered-dialect --split-input-file --iree-flow-outline-dispatch-regions --mlir-print-local-scope %s | FileCheck %s
 
 //      CHECK: flow.executable private @staticShapeDispatch_dispatch_0
 // CHECK-NEXT:   flow.executable.export public @staticShapeDispatch_dispatch_0
@@ -176,3 +176,58 @@
   }
   return %0 : tensor<4xi32>
 }
+
+// -----
+
+//      CHECK: flow.executable private @dispatchExtern_dispatch_0
+// CHECK-NEXT:   flow.executable.export public @main
+// CHECK-SAME:     workgroups(%arg0: !hal.device, %arg1: index, %arg2: index) -> (index, index, index) {
+// CHECK-NEXT:       %ok, %value = hal.device.query<%arg0 : !hal.device> key("some" :: "value") : i1, i32
+// CHECK-NEXT:       %0 = arith.index_cast %value : i32 to index
+// CHECK-NEXT:       hal.return %arg1, %arg2, %0 : index, index, index
+// CHECK-NEXT:     } attributes {
+// CHECK-SAME:       hal.interface.layout = #hal.pipeline.layout<push_constants = 1, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
+// CHECK-SAME:     }
+
+// Demonstrates the full functionality of an extern dispatch op.
+// Note that some fields are optional.
+
+// CHECK-LABEL: func.func @dispatchExtern
+func.func @dispatchExtern(%arg0: tensor<4xi32>, %arg1: tensor<8xi32>, %arg2: i32) -> tensor<8xi32> {
+  %x = arith.constant 100 : index
+  %y = arith.constant 50 : index
+  // Dispatch workgroups to the externally defined function "main" in the
+  // referenced object files.
+  // CHECK: %[[RESULT:.+]] = flow.dispatch @dispatchExtern_dispatch_0::@main[%c100, %c50](%arg0, %arg1, %arg2) {
+  // CHECK-SAME: hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]
+  // CHECK-SAME: } : (tensor<4xi32>, tensor<8xi32>, i32) -> %arg1
+  %result = hal.dispatch.extern "main"[%x, %y](%arg0, %arg1, %arg2) : (tensor<4xi32>, tensor<8xi32>, i32) -> %arg1
+    // Must match the external definition.
+    layout(#hal.pipeline.layout<push_constants = 1, sets = [
+      <0, bindings = [
+          <0, storage_buffer, ReadOnly>,
+          <1, storage_buffer>
+      ]>
+    ]>)
+    // Optional, automatically inferred if omitted.
+    bindings([
+      #hal.interface.binding<0, 0>,
+      #hal.interface.binding<0, 1>
+    ])
+    // Can have object references for multiple targets or configurations.
+    objects(#hal.executable.objects<{
+      #hal.executable.target<"llvm-cpu", "a"> = [#hal.executable.object<{path = "a.o"}>],
+      #hal.executable.target<"llvm-cpu", "b"> = [#hal.executable.object<{path = "b.o"}>]
+    }>)
+    // Translates the workload (%x and %y captured above) into an XYZ workgroup
+    // count, optionally using device information.
+    count(%device: !hal.device, %x_capture: index, %y_capture: index) -> (index, index, index) {
+      // Shows how device queries can be used when computing the workgroup count.
+      // The device is the one used at runtime.
+      %ok, %z_i32 = hal.device.query<%device : !hal.device> key("some" :: "value") : i1, i32
+      %z = arith.index_cast %z_i32 : i32 to index
+      hal.return %x_capture, %y_capture, %z : index, index, index
+    }
+  // CHECK: return %[[RESULT]]
+  return %result : tensor<8xi32>
+}
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Analysis/BindingLayout.cpp b/compiler/src/iree/compiler/Dialect/HAL/Analysis/BindingLayout.cpp
index 7ee3242..f906675 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Analysis/BindingLayout.cpp
+++ b/compiler/src/iree/compiler/Dialect/HAL/Analysis/BindingLayout.cpp
@@ -54,10 +54,60 @@
   return dispatchMap;
 }
 
+// Assumes an explicit layout as specified on an export.
+static PipelineLayout
+assumeExportLayout(IREE::Stream::ExecutableExportOp exportOp,
+                   IREE::HAL::PipelineLayoutAttr layoutAttr) {
+  PipelineLayout pipelineLayout;
+  pipelineLayout.pushConstantCount = layoutAttr.getPushConstants();
+
+  auto setLayoutAttrs = layoutAttr.getSetLayouts();
+  int64_t bindingCount = 0;
+  for (auto setLayoutAttr : setLayoutAttrs) {
+    bindingCount += setLayoutAttr.getBindings().size();
+  }
+
+  pipelineLayout.setLayouts.resize(setLayoutAttrs.size());
+  pipelineLayout.resourceMap.resize(bindingCount);
+  for (auto setLayoutAttr : setLayoutAttrs) {
+    DescriptorSetLayout setLayout;
+    setLayout.ordinal = setLayoutAttr.getOrdinal();
+    setLayout.flags = setLayoutAttr.getFlags().value_or(
+        IREE::HAL::DescriptorSetLayoutFlags::None);
+    auto bindingAttrs = setLayoutAttr.getBindings();
+    setLayout.bindings.resize(bindingAttrs.size());
+    for (auto bindingAttr : bindingAttrs) {
+      DescriptorSetLayoutBinding setBinding;
+      setBinding.ordinal = bindingAttr.getOrdinal();
+      setBinding.type = bindingAttr.getType();
+      setBinding.flags =
+          bindingAttr.getFlags().value_or(IREE::HAL::DescriptorFlags::None);
+      setLayout.bindings[setBinding.ordinal] = setBinding;
+      pipelineLayout.resourceMap.emplace_back(setLayout.ordinal,
+                                              setBinding.ordinal);
+    }
+    pipelineLayout.setLayouts[setLayout.ordinal] = setLayout;
+  }
+
+  LLVM_DEBUG({
+    auto executableOp = exportOp->getParentOfType<IREE::Stream::ExecutableOp>();
+    llvm::dbgs() << "assumeExportLayout(@" << executableOp.getSymName() << "::@"
+                 << exportOp.getSymName() << "):\n";
+    pipelineLayout.print(llvm::dbgs());
+  });
+
+  return pipelineLayout;
+}
+
 // Derives an pipeline layout from all of the dispatches to |exportOp|.
 static PipelineLayout
 deriveExportLayout(IREE::Stream::ExecutableExportOp exportOp,
                    SmallVector<IREE::Stream::CmdDispatchOp> &dispatchOps) {
+  if (auto layoutAttr = exportOp->getAttrOfType<IREE::HAL::PipelineLayoutAttr>(
+          "hal.interface.layout")) {
+    return assumeExportLayout(exportOp, layoutAttr);
+  }
+
   auto funcOp = exportOp.lookupFunctionRef();
   assert(funcOp && "export target not found");
 
diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/HALBase.td b/compiler/src/iree/compiler/Dialect/HAL/IR/HALBase.td
index 986356f..5f5a878 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/IR/HALBase.td
+++ b/compiler/src/iree/compiler/Dialect/HAL/IR/HALBase.td
@@ -669,6 +669,10 @@
   }];
 }
 
+def HAL_InterfaceBindingArrayAttr :
+    TypedArrayAttrBase<HAL_InterfaceBindingAttr,
+                       "HAL binding array attribute">;
+
 //===----------------------------------------------------------------------===//
 // Device and executable target specification
 //===----------------------------------------------------------------------===//
diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.cpp b/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.cpp
index 11cbbae..2a07681 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.cpp
+++ b/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.cpp
@@ -114,6 +114,68 @@
 }
 
 //===----------------------------------------------------------------------===//
+// custom<WorkgroupCountRegion>($body)
+//===----------------------------------------------------------------------===//
+
+static ParseResult parseWorkgroupCountRegion(OpAsmParser &parser,
+                                             Region &body) {
+  SmallVector<OpAsmParser::Argument> args;
+  if (failed(parser.parseArgumentList(args, AsmParser::Delimiter::Paren,
+                                      /*allowType=*/true,
+                                      /*allowAttrs=*/true))) {
+    return failure();
+  }
+
+  // Return types must be 3 dimensions (workgroup count XYZ).
+  SmallVector<Type> returnTypes;
+  if (failed(parser.parseArrowTypeList(returnTypes))) {
+    return failure();
+  }
+  if (returnTypes.size() != 3 ||
+      !llvm::all_of(returnTypes, [](Type type) { return type.isIndex(); })) {
+    return parser.emitError(parser.getCurrentLocation())
+           << "workgroup count region must return the XYZ dimension counts";
+  }
+
+  // Parse region contents.
+  if (failed(parser.parseRegion(body, args, /*enableNameShadowing=*/false))) {
+    return failure();
+  }
+
+  // Verify the return types match.
+  for (auto returnOp : body.getOps<IREE::HAL::ReturnOp>()) {
+    for (auto [resultType, returnType] :
+         llvm::zip_equal(returnTypes, returnOp.getOperandTypes())) {
+      if (resultType != returnType) {
+        return returnOp.emitOpError()
+               << "operands do not match expected region return types";
+      }
+    }
+  }
+
+  return success();
+}
+
+static void printWorkgroupCountRegion(OpAsmPrinter &p, Operation *op,
+                                      Region &body) {
+  if (body.empty())
+    return;
+  p << "(";
+  auto args = body.getArguments();
+  for (unsigned i = 0; i < args.size(); ++i) {
+    if (i > 0)
+      p << ", ";
+    p.printRegionArgument(args[i]);
+  }
+  p << ")";
+  Type indexType = IndexType::get(body.getContext());
+  p.printArrowTypeList(TypeRange{indexType, indexType, indexType});
+  p << " ";
+  p.printRegion(body, /*printEntryBlockArgs=*/false,
+                /*printBlockTerminators=*/true);
+}
+
+//===----------------------------------------------------------------------===//
 // hal.ex.*
 //===----------------------------------------------------------------------===//
 
@@ -320,6 +382,136 @@
 }
 
 //===----------------------------------------------------------------------===//
+// hal.dispatch.extern
+//===----------------------------------------------------------------------===//
+
+void DispatchExternOp::build(OpBuilder &builder, OperationState &state,
+                             ValueRange workload, TypeRange resultTypes,
+                             ValueRange resultDims, ValueRange arguments,
+                             ValueRange argumentDims,
+                             ArrayRef<int64_t> tiedOperands,
+                             ArrayRef<NamedAttribute> attributes) {
+  state.addTypes(resultTypes);
+  state.addOperands(workload);
+  state.addOperands(arguments);
+  state.addOperands(argumentDims);
+  state.addOperands(resultDims);
+  state.addAttributes(attributes);
+  state.attributes.erase(IREE::Util::TiedOpInterface::getStorageAttrName());
+  state.addAttribute(IREE::Util::TiedOpInterface::getStorageAttrName(),
+                     builder.getIndexArrayAttr(tiedOperands));
+  state.attributes.erase(getOperandSegmentSizeAttr());
+  state.addAttribute(getOperandSegmentSizeAttr(),
+                     builder.getDenseI32ArrayAttr({
+                         static_cast<int32_t>(workload.size()),
+                         static_cast<int32_t>(arguments.size()),
+                         static_cast<int32_t>(argumentDims.size()),
+                         static_cast<int32_t>(resultDims.size()),
+                     }));
+
+  llvm::BitVector operandAliases(llvm::size(arguments), false);
+  llvm::BitVector resultAliases(llvm::size(resultTypes), false);
+  for (unsigned resultIndex = 0; resultIndex < tiedOperands.size();
+       ++resultIndex) {
+    int64_t tiedOperandIndex = tiedOperands[resultIndex];
+    if (tiedOperandIndex != IREE::Util::TiedOpInterface::kUntiedIndex) {
+      operandAliases[tiedOperandIndex] = true;
+      resultAliases[resultIndex] = true;
+    }
+  }
+
+  // NOTE: workgroup count region is empty; callers are expected to populate it.
+  state.addRegion();
+}
+
+// Verifies that |dynamicDims| contains the appropriate number of dims for all
+// of the dynamic dimensions in |values|.
+static LogicalResult verifyOpDynamicDims(Operation *op, ValueRange values,
+                                         ValueRange dynamicDims) {
+  unsigned requiredCount = 0;
+  for (auto value : values) {
+    if (auto shapedType = llvm::dyn_cast<ShapedType>(value.getType())) {
+      requiredCount += shapedType.getNumDynamicDims();
+    }
+  }
+  if (dynamicDims.size() != requiredCount) {
+    return op->emitOpError()
+           << "value set has " << requiredCount
+           << " dynamic dimensions but only " << dynamicDims.size()
+           << " dimension values are attached";
+  }
+  return success();
+}
+
+static LogicalResult
+verifyWorkgroupCountRegion(Operation *op, ValueRange workload, Region &region) {
+  // Verify the workload operands match the expected capture args.
+  auto regionArguments =
+      llvm::make_filter_range(region.getArgumentTypes(), [](Type type) {
+        return !type.isa<IREE::HAL::DeviceType>();
+      });
+  if (workload.size() != llvm::range_size(regionArguments)) {
+    return op->emitOpError()
+           << "workload operands and workgroup count args mismatch ("
+           << workload.size() << " vs " << llvm::range_size(regionArguments)
+           << ")";
+  }
+  for (auto [index, values] :
+       llvm::enumerate(llvm::zip_equal(workload, regionArguments))) {
+    auto [workloadValue, capturedType] = values;
+    if (workloadValue.getType() != capturedType) {
+      return op->emitOpError()
+             << "workload value " << index << " type mismatch; operand is "
+             << workloadValue.getType() << " but region captures "
+             << capturedType;
+    }
+  }
+  return success();
+}
+
+LogicalResult DispatchExternOp::verify() {
+  Operation *op = getOperation();
+
+  if (failed(verifyOpDynamicDims(getOperation(), getArguments(),
+                                 getArgumentDims())) ||
+      failed(
+          verifyOpDynamicDims(getOperation(), getResults(), getResultDims()))) {
+    return failure();
+  }
+
+  auto verifyIOType = [&](Type type) -> LogicalResult {
+    if (auto shapedType = llvm::dyn_cast<ShapedType>(type)) {
+      if (shapedType.getElementType().isIndex()) {
+        return op->emitOpError() << "I/O type " << type
+                                 << " is invalid: index types must not cross "
+                                    "the dispatch boundary";
+      }
+    }
+    return success();
+  };
+  for (auto type : getOperandTypes()) {
+    if (failed(verifyIOType(type)))
+      return failure();
+  }
+  for (auto type : getResultTypes()) {
+    if (failed(verifyIOType(type)))
+      return failure();
+  }
+
+  if (failed(
+          verifyWorkgroupCountRegion(op, getWorkload(), getWorkgroupCount()))) {
+    return failure();
+  }
+
+  return success();
+}
+
+std::pair<unsigned, unsigned>
+DispatchExternOp::getTiedOperandsIndexAndLength() {
+  return getODSOperandIndexAndLength(1);
+}
+
+//===----------------------------------------------------------------------===//
 // hal.allocator.allocate
 //===----------------------------------------------------------------------===//
 
diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.td b/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.td
index 8e849dd..bc58f6f 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.td
+++ b/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.td
@@ -289,6 +289,151 @@
   let hasCanonicalizer = 1;
 }
 
+def HAL_DispatchExternOp : HAL_PureOp<"dispatch.extern", [
+  IsolatedFromAbove,
+  AttrSizedOperandSegments,
+  DeclareOpInterfaceMethods<Util_TiedOpInterface, [
+    "getTiedOperandsIndexAndLength",
+  ]>,
+  Util_ShapeAwareOp,
+]> {
+  let summary = [{a dispatch of workgroups across a 3-dimensional grid}];
+  let description = [{
+    Dispatches some number of workgroups across a 3-dimensional grid using a
+    function defined externally in one or more referenced objects. Objects are
+    declared per executable target and selected automatically during linking
+    based on where the dispatch is used. Semantically this is equivalent to
+    a `flow.dispatch.workgroups` but with the workgroup region invisible to the
+    compiler. See `hal.executable` for more information about object linkage.
+
+    Note that since this happens at tensor level the dispatch operation has
+    value semantics: some tensors (and optionally other primitive types) are
+    consumed and one or more new result tensors are produced. Inside each
+    workgroup, however, the input and output tensors are available for arbitrary
+    loads and stores. In many cases each workgroup will load some particular
+    tile(s) from the input tensors and store some particular tile(s) to the
+    output tensors unique to that workgroup. Though it's possible for multiple
+    workgroups to load the same regions of the input tensors behavior is
+    undefined if multiple workgroups store to the same regions of the output
+    tensors. Codegen guarantees this behavior but when sourcing externally
+    authored dispatch functions it's critical that this behavior is observed.
+
+    Though the representation is similar to the GPU-style grid dispatch model
+    here we still have not yet allocated buffers, determined the target device
+    for execution, or even completed fully resolving shapes/types/etc. Because
+    of this it's important that the workgroup body use the platform-dependent
+    primitives for accessing workgroup ID, size, and count intrinsics instead
+    of hardcoding them to a particular set of values. Assume that any workgroup
+    dispatch may end up being specialized for several different target devices
+    and even several different variants for a particular target device
+    (differing workgroup sizes, etc). To aid deduplication code producing these
+    external dispatches should try not to specialize early for particular shapes
+    and instead emit the most generic code possible as having 500 slightly
+    different `hal.dispatch.extern` ops pointing at the same object file is
+    likely to require 500 copies of the object instead of 500 calls to the same
+    object.
+
+    Because at this point in the layering devices have not yet been selected the
+    workgroup count cannot be fully evaluated. Instead workload parameters are
+    captured that are then passed to a function that when later evaluated
+    computes the actual workgroup count based on target information. The
+    workload is not limited to the 3D XYZ grid dispatch of the workgroup count
+    and can contain any number of parameters used to compute it. If workgroup
+    size or distribution varies based on the target device a `!hal.device`
+    argument can be used by the workgroup count calculation region to factor in
+    device parameters. See `hal.device.query` for more information on how to
+    query information.
+
+    ```mlir
+    %r = hal.dispatch.extern "some_function"[%c5, %c5](%0, %1)
+        : (tensor<5x5xf32>, tensor<5xf32>) -> tensor<5x5xf32>
+      ...
+    ```
+
+    The number of results of the operation is equal to the number of results
+    in the type signature (`(tensor<5x5xf32>, tensor<5xf32>) -> tensor<5x5xf32>`).
+    Each tensor argument and result in the type signature has a corresponding
+    pipeline layout slot and must be declared. If multiple arguments or results
+    share the same layout slot they can be aliased using the `bindings`
+    attribute and otherwise each is assumed unique.
+
+    There are no `arguments` operands for results, but a result can be tied an
+    argument by writing the argument operand's SSA value instead of its type:
+    E.g., in the above example, `-> %0` would tie the first argument to the
+    result. In that case, there would be no separate block argument for the
+    result.
+  }];
+
+  let arguments = (ins
+    StrAttr:$export,
+    Variadic<Index>:$workload,
+    Variadic<AnyType>:$arguments,
+    HAL_ShapeDynamicDims:$argument_dims,
+    HAL_ShapeDynamicDims:$result_dims,
+    HAL_PipelineLayoutAttr:$layout,
+    HAL_ExecutableObjectsAttr:$objects,
+    OptionalAttr<HAL_WorkgroupSizeAttr>:$workgroup_size,
+    OptionalAttr<HAL_SubgroupSizeAttr>:$subgroup_size,
+    OptionalAttr<IndexAttr>:$workgroup_local_memory,
+    OptionalAttr<HAL_InterfaceBindingArrayAttr>:$bindings,
+    OptionalAttr<Util_TiedOpStorageAttr>:$tied_operands
+  );
+  let results = (outs
+    Variadic<AnyType>:$results
+  );
+
+  let regions = (region
+    AnyRegion:$workgroup_count
+  );
+
+  let assemblyFormat = [{
+    $export
+    (`[` $workload^ `]`)? ``
+    `(` $arguments `)` `:`
+    custom<ShapedFunctionType>(ref($arguments),
+                               type($arguments), $argument_dims,
+                               type($results), $result_dims,
+                               $tied_operands)
+    `layout` `(` $layout `)`
+    (`bindings` `(` $bindings^ `)`)?
+    `objects` `(` $objects `)`
+    `count` `` custom<WorkgroupCountRegion>($workgroup_count)
+    attr-dict-with-keyword
+  }];
+
+  let skipDefaultBuilders = 1;
+  let builders = [
+    OpBuilder<(ins
+      "ValueRange":$workload,
+      "TypeRange":$resultTypes, "ValueRange":$resultDims,
+      "ValueRange":$arguments, "ValueRange":$argumentDims,
+      "ArrayRef<int64_t>":$tiedOperands,
+      CArg<"ArrayRef<NamedAttribute>", "{}">:$attributes)>,
+  ];
+
+  let extraClassDeclaration = [{
+    FunctionType getDispatchType() {
+      return FunctionType::get(
+          getContext(), llvm::map_to_vector(getArguments(), [](Value value) {
+            return value.getType();
+          }),
+          getResultTypes());
+    }
+
+    /// Returns the index of the args() operand in the Operation operands list.
+    unsigned mapArgOperandToOpOperand(unsigned i) { return i + getWorkload().size(); };
+
+    ValueRange getOperandDynamicDims(unsigned idx) {
+      return IREE::Util::findVariadicDynamicDims(idx - getWorkload().size(), getArguments(), getArgumentDims());
+    }
+    ValueRange getResultDynamicDims(unsigned idx) {
+      return IREE::Util::findVariadicDynamicDims(idx, getResults(), getResultDims());
+    }
+  }];
+
+  let hasVerifier = 1;
+}
+
 } // OpGroupPseudoOps
 
 //===----------------------------------------------------------------------===//
@@ -1827,9 +1972,9 @@
     }
 
     ::mlir::ModuleOp getInnerModule() {
-      auto moduleOps = getBlock().getOps<::mlir::ModuleOp>();
-      assert(!moduleOps.empty() && "source ops need inner modules");
-      return *moduleOps.begin();
+      auto it = getBlock().getOps<::mlir::ModuleOp>();
+      if (it.empty()) return {};
+      return *it.begin();
     }
   }];
 }
@@ -1902,11 +2047,10 @@
     An entry point exported by the executable with statically-available
     information describing the IO interface it uses and other dispatch metadata.
 
-    The `calculate_workgroup_count` region represents the computation that
+    The `workgroup_count` region represents the computation that
     returns the number of workgroups to use in the 3D grid dispatch.
     The arguments to the region represents the workload as captured by each
-    dispatch.
-    It returns the number of workgroups along x, y, and z.
+    dispatch. It returns the number of workgroups along x, y, and z.
   }];
 
   let arguments = (ins
@@ -2002,9 +2146,9 @@
     }
 
     ::mlir::ModuleOp getInnerModule() {
-      auto moduleOps = getBlock().getOps<::mlir::ModuleOp>();
-      assert(!moduleOps.empty() && "source ops need inner modules");
-      return *moduleOps.begin();
+      auto it = getBlock().getOps<::mlir::ModuleOp>();
+      if (it.empty()) return {};
+      return *it.begin();
     }
 
     // Returns a map of constant key attributes to ordinals across all constant
diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/HALTypes.h b/compiler/src/iree/compiler/Dialect/HAL/IR/HALTypes.h
index 04e0c2d..665625e 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/IR/HALTypes.h
+++ b/compiler/src/iree/compiler/Dialect/HAL/IR/HALTypes.h
@@ -11,6 +11,7 @@
 #include <optional>
 
 #include "iree/compiler/Dialect/Stream/IR/StreamTypes.h"
+#include "iree/compiler/Dialect/Util/IR/UtilTraits.h"
 #include "iree/compiler/Dialect/Util/IR/UtilTypes.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DenseMapInfo.h"
@@ -109,7 +110,9 @@
   using Base::Base;
 };
 
-struct DeviceType : public Type::TypeBase<DeviceType, Type, TypeStorage> {
+struct DeviceType
+    : public Type::TypeBase<DeviceType, Type, TypeStorage,
+                            mlir::OpTrait::IREE::Util::ImplicitlyCaptured> {
   using Base::Base;
 };
 
diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/test/tensor_ops.mlir b/compiler/src/iree/compiler/Dialect/HAL/IR/test/tensor_ops.mlir
index 13d238c..86b81c9 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/IR/test/tensor_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/IR/test/tensor_ops.mlir
@@ -51,3 +51,44 @@
   %0:2 = hal.tensor.barrier join(%arg0, %arg1 : tensor<3xf32>, tensor<4xf32>) => %arg2 : !hal.fence
   return %0#0, %0#1 : tensor<3xf32>, tensor<4xf32>
 }
+
+// -----
+
+// Demonstrates the full functionality of an extern dispatch op.
+// Note that some fields are optional.
+
+// CHECK-LABEL: func.func @dispatchExtern
+func.func @dispatchExtern(%arg0: tensor<4xi32>, %arg1: tensor<8xi32>, %arg2: i32) -> tensor<8xi32> {
+  %x = arith.constant 100 : index
+  %y = arith.constant 50 : index
+  // Dispatch workgroups to the externally defined function "main" in the
+  // referenced object files.
+  %0 = hal.dispatch.extern "main"[%x, %y](%arg0, %arg1, %arg2) : (tensor<4xi32>, tensor<8xi32>, i32) -> %arg1
+    // Must match the external definition.
+    layout(#hal.pipeline.layout<push_constants = 1, sets = [
+      <0, bindings = [
+          <0, storage_buffer, ReadOnly>,
+          <1, storage_buffer>
+      ]>
+    ]>)
+    // Optional, automatically inferred if omitted.
+    bindings([
+      #hal.interface.binding<0, 0>,
+      #hal.interface.binding<0, 1>
+    ])
+    // Can have object references for multiple targets or configurations.
+    objects(#hal.executable.objects<{
+      #hal.executable.target<"llvm-cpu", "a"> = [#hal.executable.object<{path = "a.o"}>],
+      #hal.executable.target<"llvm-cpu", "b"> = [#hal.executable.object<{path = "b.o"}>]
+    }>)
+    // Translates the workload (%x and %y captured above) into an XYZ workgroup
+    // count, optionally using device information.
+    count(%device: !hal.device, %x_capture: index, %y_capture: index) -> (index, index, index) {
+      // Shows how device queries can be used when computing the workgroup count.
+      // The device is the one used at runtime.
+      %ok, %z_i32 = hal.device.query<%device : !hal.device> key("some" :: "value") : i1, i32
+      %z = arith.index_cast %z_i32 : i32 to index
+      hal.return %x_capture, %y_capture, %z : index, index, index
+    }
+  return %0 : tensor<8xi32>
+}
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/LLVMCPUTarget.cpp b/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/LLVMCPUTarget.cpp
index 90a6f45..7b2e481 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/LLVMCPUTarget.cpp
+++ b/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMCPU/LLVMCPUTarget.cpp
@@ -350,6 +350,8 @@
     for (auto exportOp : variantOp.getBlock().getOps<ExecutableExportOp>()) {
       // Find the matching function in the LLVM module.
       auto *llvmFunc = llvmModule->getFunction(exportOp.getName());
+      if (!llvmFunc)
+        continue;
       llvmFunc->setLinkage(llvm::GlobalValue::LinkageTypes::InternalLinkage);
       llvmFunc->setDSOLocal(true);
 
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Transforms/MaterializeInterfaces.cpp b/compiler/src/iree/compiler/Dialect/HAL/Transforms/MaterializeInterfaces.cpp
index 97c63a4..63cf4b7 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Transforms/MaterializeInterfaces.cpp
+++ b/compiler/src/iree/compiler/Dialect/HAL/Transforms/MaterializeInterfaces.cpp
@@ -234,7 +234,7 @@
 // and use the HAL interface access primitives.
 static mlir::func::FuncOp
 cloneFuncWithInterface(mlir::func::FuncOp sourceFuncOp,
-                       const PipelineLayout &pipelineLayout,
+                       const PipelineResourceMap &resourceMap,
                        IREE::HAL::PipelineLayoutAttr layoutAttr) {
   // Clone so that we can do a bunch of unsafe in-place updates.
   auto clonedFuncOp = sourceFuncOp.clone();
@@ -258,7 +258,7 @@
   for (auto arg : entryBlock->getArguments()) {
     if (!llvm::isa<IREE::Stream::BindingType>(arg.getType()))
       continue;
-    auto setBinding = pipelineLayout.resourceMap[resourceIdx++];
+    auto setBinding = resourceMap[resourceIdx++];
     auto setLayoutAttr = layoutAttr.getSetLayouts()[setBinding.first];
     auto bindingAttr = setLayoutAttr.getBindings()[setBinding.second];
     convertBindingUsage(sourceFuncOp, arg, setLayoutAttr, bindingAttr);
@@ -294,6 +294,9 @@
 // TODO(benvanik): have a HAL op with structured information instead.
 static void annotateDispatchSite(IREE::Stream::CmdDispatchOp dispatchOp,
                                  const PipelineResourceMap &resourceMap) {
+  // Ignore if bindings already defined.
+  if (dispatchOp->hasAttr("hal.interface.bindings"))
+    return;
   SmallVector<Attribute> bindingAttrs;
   for (auto setBinding : resourceMap) {
     bindingAttrs.push_back(IREE::HAL::InterfaceBindingAttr::get(
@@ -311,7 +314,6 @@
                      IREE::HAL::ExecutableOp targetExecutableOp,
                      const BindingLayoutAnalysis &layoutAnalysis,
                      EntryPointExpansions &entryPointExpansions) {
-  auto sourceModuleOp = sourceExecutableOp.getInnerModule();
   auto variantOps =
       targetExecutableOp.getBlock().getOps<IREE::HAL::ExecutableVariantOp>();
   OpBuilder executableBuilder(&targetExecutableOp.getBlock().front());
@@ -323,27 +325,37 @@
   int nextOrdinal = 0;
   for (auto exportOp : sourceExecutableOp.getBody()
                            .getOps<IREE::Stream::ExecutableExportOp>()) {
-    auto sourceFuncOp = sourceModuleOp.lookupSymbol<mlir::func::FuncOp>(
-        exportOp.getFunctionRef());
-    if (failed(verifyEntryPointTypes(sourceFuncOp)))
-      return failure();
+    func::FuncOp sourceFuncOp; // optional, may be extern
+    if (auto sourceModuleOp = sourceExecutableOp.getInnerModule()) {
+      sourceFuncOp = sourceModuleOp.lookupSymbol<mlir::func::FuncOp>(
+          exportOp.getFunctionRef());
+      if (failed(verifyEntryPointTypes(sourceFuncOp)))
+        return failure();
+    }
 
-    // Create the interface for this entry point based on the analysis of its
-    // usage within the program.
+    // Lookup to see if a layout was specified already. If not we'll perform
+    // some basic analysis to come up with our own layout.
+    auto forcedLayoutAttr =
+        exportOp->getAttrOfType<IREE::HAL::PipelineLayoutAttr>(
+            "hal.interface.layout");
     const auto &pipelineLayout = layoutAnalysis.getPipelineLayout(exportOp);
+    const PipelineResourceMap &resourceMap = pipelineLayout.resourceMap;
 
     // Update all dispatch sites with the binding information required for
     // conversion into the HAL dialect. By doing this here we ensure that the
     // dialect conversion needs only local information on the ops and that it's
     // not possible for the dispatches and their targets to get out of sync.
     for (auto dispatchOp : layoutAnalysis.getExportDispatches(exportOp)) {
-      annotateDispatchSite(dispatchOp, pipelineLayout.resourceMap);
+      annotateDispatchSite(dispatchOp, resourceMap);
     }
 
     // Clone the updated function declaration into each variant.
     int ordinal = nextOrdinal++;
     for (auto variantOp : variantOps) {
-      OpBuilder targetBuilder(variantOp.getInnerModule());
+      auto targetBuilder = OpBuilder::atBlockBegin(&variantOp.getBlock());
+
+      // TODO(ravishankarm): use hal.interface.workgroup_size instead of codegen
+      // attributes.
       // Check if workgroup size is set externally.
       ArrayAttr workgroupSize;
       for (auto attr : exportOp->getAttrs()) {
@@ -362,12 +374,15 @@
       }
 
       // Declare the entry point on the target.
-      auto layoutAttr = makePipelineLayoutAttr(
-          pipelineLayout, variantOp.getTargetAttr(), targetBuilder);
+      auto variantLayoutAttr =
+          forcedLayoutAttr ? forcedLayoutAttr
+                           : makePipelineLayoutAttr(pipelineLayout,
+                                                    variantOp.getTargetAttr(),
+                                                    targetBuilder);
       auto newExportOp = targetBuilder.create<IREE::HAL::ExecutableExportOp>(
           exportOp.getLoc(),
           targetBuilder.getStringAttr(exportOp.getFunctionRef()),
-          targetBuilder.getIndexAttr(ordinal), layoutAttr, workgroupSize,
+          targetBuilder.getIndexAttr(ordinal), variantLayoutAttr, workgroupSize,
           /*subgroup_size=*/IntegerAttr{},
           /*workgroup_local_memory=*/IntegerAttr{});
 
@@ -383,16 +398,21 @@
         mlir::IRMapping mapper;
         exportOp.getWorkgroupCount().cloneInto(&newExportOp.getWorkgroupCount(),
                                                mapper);
-        // Insert the !hal.device argument.
+        // Insert the !hal.device argument if it doesn't already exist.
         Type deviceType = targetBuilder.getType<IREE::HAL::DeviceType>();
-        newExportOp.getWorkgroupCount().insertArgument(0u, deviceType,
-                                                       newExportOp.getLoc());
+        if (!llvm::is_contained(exportOp.getWorkgroupCount().getArgumentTypes(),
+                                deviceType)) {
+          newExportOp.getWorkgroupCount().insertArgument(0u, deviceType,
+                                                         newExportOp.getLoc());
+        }
       }
 
       // Clone the source function and update it to use the new interface.
-      auto variantFuncOp =
-          cloneFuncWithInterface(sourceFuncOp, pipelineLayout, layoutAttr);
-      targetFuncOps[sourceFuncOp][variantOp] = variantFuncOp;
+      if (sourceFuncOp) {
+        auto variantFuncOp = cloneFuncWithInterface(sourceFuncOp, resourceMap,
+                                                    variantLayoutAttr);
+        targetFuncOps[sourceFuncOp][variantOp] = variantFuncOp;
+      }
     }
   }
 
@@ -403,18 +423,20 @@
   // functions and multiple exports (with an N:M mapping) and in this way we
   // perform the variant construction in a single pass with deterministic
   // ordering that preserves the unmodified ops.
-  for (auto variantOp : variantOps) {
-    auto targetBuilder = OpBuilder::atBlockBegin(
-        &variantOp.getInnerModule().getBodyRegion().front());
-    for (auto &op : sourceModuleOp.getOps()) {
-      auto targetVariantFuncOps = targetFuncOps.find(&op);
-      if (targetVariantFuncOps != targetFuncOps.end()) {
-        // Move the updated function into place.
-        auto variantFuncOp = targetVariantFuncOps->second[variantOp];
-        targetBuilder.insert(variantFuncOp);
-      } else {
-        // Regular op (globals, external function declarations, etc).
-        targetBuilder.clone(op);
+  if (auto sourceModuleOp = sourceExecutableOp.getInnerModule()) {
+    for (auto variantOp : variantOps) {
+      auto targetBuilder = OpBuilder::atBlockBegin(
+          &variantOp.getInnerModule().getBodyRegion().front());
+      for (auto &op : sourceModuleOp.getOps()) {
+        auto targetVariantFuncOps = targetFuncOps.find(&op);
+        if (targetVariantFuncOps != targetFuncOps.end()) {
+          // Move the updated function into place.
+          auto variantFuncOp = targetVariantFuncOps->second[variantOp];
+          targetBuilder.insert(variantFuncOp);
+        } else {
+          // Regular op (globals, external function declarations, etc).
+          targetBuilder.clone(op);
+        }
       }
     }
   }
@@ -565,8 +587,10 @@
                 targetAttr);
         setApplicableObjects(sourceOp, targetContainerOp);
         targetSymbolTable.insert(targetContainerOp);
-        OpBuilder containerBuilder(&targetContainerOp.getBlock().back());
-        containerBuilder.create<mlir::ModuleOp>(sourceOp->getLoc());
+        if (sourceOp.getInnerModule()) {
+          OpBuilder containerBuilder(&targetContainerOp.getBlock().back());
+          containerBuilder.create<mlir::ModuleOp>(sourceOp->getLoc());
+        }
       }
 
       // Define interfaces for each exported function based on analysis.
@@ -596,10 +620,6 @@
       // Annotate the dispatch site with binding information if required.
       // TODO(benvanik): remove this path; shouldn't be needed in real usage.
       // Because this is a hack we just look for the first target entry point.
-      if (dispatchOp->hasAttr("hal.interface.bindings")) {
-        // Already have bindings defined.
-        return WalkResult::advance();
-      }
       PipelineResourceMap resourceMap;
       auto anyEntryPointAttr = *dispatchOp.getEntryPointRefs().begin();
       auto anyExportOp =
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/Patterns.cpp b/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/Patterns.cpp
index 0ed4955..297eba6 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/Patterns.cpp
+++ b/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/Patterns.cpp
@@ -785,47 +785,50 @@
     }
 
     // Move the original nested module body into the new executable directly.
-    auto moduleOp = rewriter.cloneWithoutRegions(flowOp.getInnerModule());
-    streamOp.getInnerModule().getBodyRegion().takeBody(
-        flowOp.getInnerModule().getBodyRegion());
+    if (auto innerModuleOp = flowOp.getInnerModule()) {
+      auto moduleOp = rewriter.cloneWithoutRegions(innerModuleOp);
+      streamOp.getInnerModule().getBodyRegion().takeBody(
+          flowOp.getInnerModule().getBodyRegion());
 
-    // Update the entry point signatures in the module.
-    // Dispatch tensor arguments become bindings and all others are preserved as
-    // adaptor. Note that we only touch public (exported) functions.
-    for (auto funcOp : moduleOp.getOps<mlir::func::FuncOp>()) {
-      if (!funcOp.isPublic())
-        continue;
+      // Update the entry point signatures in the module.
+      // Dispatch tensor arguments become bindings and all others are preserved
+      // as adaptor. Note that we only touch public (exported) functions.
+      for (auto funcOp : moduleOp.getOps<mlir::func::FuncOp>()) {
+        if (!funcOp.isPublic())
+          continue;
 
-      SmallVector<Type> newTypes;
-      newTypes.reserve(funcOp.getNumArguments());
-      assert(funcOp.getNumResults() == 0 && "flow dispatches have no results");
+        SmallVector<Type> newTypes;
+        newTypes.reserve(funcOp.getNumArguments());
+        assert(funcOp.getNumResults() == 0 &&
+               "flow dispatches have no results");
 
-      rewriter.setInsertionPointToStart(&funcOp.front());
-      auto zero = rewriter.create<arith::ConstantIndexOp>(funcOp.getLoc(), 0);
-      for (auto arg : funcOp.front().getArguments()) {
-        auto oldType = arg.getType();
-        if (auto tensorType =
-                llvm::dyn_cast<IREE::Flow::DispatchTensorType>(oldType)) {
-          // Now a binding - insert the stream.binding.subspan op to slice it.
-          auto newType = rewriter.getType<IREE::Stream::BindingType>();
-          newTypes.push_back(newType);
-          if (!insertBindingOp(arg, tensorType, zero, rewriter)) {
-            return rewriter.notifyMatchFailure(
-                flowOp, "failed to query dynamic dimensions");
+        rewriter.setInsertionPointToStart(&funcOp.front());
+        auto zero = rewriter.create<arith::ConstantIndexOp>(funcOp.getLoc(), 0);
+        for (auto arg : funcOp.front().getArguments()) {
+          auto oldType = arg.getType();
+          if (auto tensorType =
+                  llvm::dyn_cast<IREE::Flow::DispatchTensorType>(oldType)) {
+            // Now a binding - insert the stream.binding.subspan op to slice it.
+            auto newType = rewriter.getType<IREE::Stream::BindingType>();
+            newTypes.push_back(newType);
+            if (!insertBindingOp(arg, tensorType, zero, rewriter)) {
+              return rewriter.notifyMatchFailure(
+                  flowOp, "failed to query dynamic dimensions");
+            }
+            arg.setType(newType);
+          } else {
+            // Preserved - will eventually be a push constants.
+            newTypes.push_back(oldType);
           }
-          arg.setType(newType);
-        } else {
-          // Preserved - will eventually be a push constants.
-          newTypes.push_back(oldType);
         }
+
+        // Strip any shape ties now that we've extracted the information.
+        funcOp.walk([&](IREE::Flow::DispatchTieShapeOp tieOp) {
+          rewriter.replaceOp(tieOp, tieOp.getOperand());
+        });
+
+        funcOp.setType(rewriter.getFunctionType(newTypes, {}));
       }
-
-      // Strip any shape ties now that we've extracted the information.
-      funcOp.walk([&](IREE::Flow::DispatchTieShapeOp tieOp) {
-        rewriter.replaceOp(tieOp, tieOp.getOperand());
-      });
-
-      funcOp.setType(rewriter.getFunctionType(newTypes, {}));
     }
 
     rewriter.eraseOp(flowOp);
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/executable_ops.mlir b/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/executable_ops.mlir
index e80095e..36305db 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/executable_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/executable_ops.mlir
@@ -1,5 +1,14 @@
 // RUN: iree-opt --split-input-file --iree-stream-conversion --canonicalize %s | FileCheck %s
 
+// CHECK-LABEL: @extern_executable
+flow.executable private @extern_executable {
+  // CHECK: stream.executable.export public @dispatch
+  flow.executable.export public @dispatch
+  // CHECK-NOT: builtin.module
+}
+
+// -----
+
 // CHECK-LABEL: @workgroup_count_region
 flow.executable private @workgroup_count_region {
   // CHECK-NEXT: stream.executable.export public @dispatch
diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.cpp b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.cpp
index f7d30af..8d6db37 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.cpp
+++ b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.cpp
@@ -57,14 +57,19 @@
   // the workload here matches what is expected.
   if (!exportOp.getWorkgroupCount().empty()) {
     auto &workgroupCount = exportOp.getWorkgroupCount();
-    if (workgroupCount.getNumArguments() != workload.size()) {
+    auto explicitArgs = llvm::make_filter_range(
+        workgroupCount.getArgumentTypes(), [](Type type) {
+          return !type.hasTrait<
+              mlir::OpTrait::IREE::Util::ImplicitlyCaptured>();
+        });
+    if (llvm::range_size(explicitArgs) != workload.size()) {
       return op->emitOpError()
              << "workload mismatch; entry point expects "
-             << workgroupCount.getNumArguments()
+             << llvm::range_size(explicitArgs)
              << " arguments but dispatch provides " << workload.size();
     }
-    for (auto [idx, expectedType, actualType] : llvm::enumerate(
-             workgroupCount.getArgumentTypes(), workload.getTypes())) {
+    for (auto [idx, expectedType, actualType] :
+         llvm::enumerate(explicitArgs, workload.getTypes())) {
       if (expectedType != actualType) {
         return op->emitOpError()
                << "workload operand " << idx << " type mismatch; expected "
@@ -3104,8 +3109,10 @@
       this->getOperation()->getParentOfType<IREE::Stream::ExecutableOp>();
   if (!executableOp)
     return {};
-  return executableOp.getInnerModule().lookupSymbol<::mlir::func::FuncOp>(
-      getFunctionRef());
+  auto innerModuleOp = executableOp.getInnerModule();
+  if (!innerModuleOp)
+    return {};
+  return innerModuleOp.lookupSymbol<::mlir::func::FuncOp>(getFunctionRef());
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.td b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.td
index d49ad4a..7f3ed5b 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.td
+++ b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.td
@@ -3591,7 +3591,7 @@
     SymbolNameAttr:$sym_name
   );
 
-  let regions = (region SizedRegion<1>:$body);
+  let regions = (region AnyRegion:$body);
 
   let assemblyFormat = [{
     custom<SymbolVisibility>($sym_visibility)
@@ -3608,7 +3608,9 @@
   let extraClassDeclaration = [{
     Block& getBlock() { return getBody().front(); }
     ::mlir::ModuleOp getInnerModule() {
-      return *getBlock().getOps<::mlir::ModuleOp>().begin();
+      auto it = getBlock().getOps<::mlir::ModuleOp>();
+      if (it.empty()) return {};
+      return *it.begin();
     }
   }];
 
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/AnnotateDispatchArguments.cpp b/compiler/src/iree/compiler/Dialect/Stream/Transforms/AnnotateDispatchArguments.cpp
index 3d9a859..2e5d5a9 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/AnnotateDispatchArguments.cpp
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/AnnotateDispatchArguments.cpp
@@ -481,6 +481,8 @@
   // Operands/resources on the func are in an arbitrary order; get maps that
   // lets us go from dispatch site operand/resource to function argument.
   auto funcOp = exportOp.lookupFunctionRef();
+  if (!funcOp)
+    return;
   auto operandToArgMap =
       IREE::Stream::CmdDispatchOp::makeOperandToArgMap(funcOp);
   auto resourceToArgMap =
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/FoldUniformOperands.cpp b/compiler/src/iree/compiler/Dialect/Stream/Transforms/FoldUniformOperands.cpp
index 365f847..b799352 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/FoldUniformOperands.cpp
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/FoldUniformOperands.cpp
@@ -284,6 +284,8 @@
     // Optimize each dispatch op.
     for (auto executableOp :
          getOperation().getBodyRegion().getOps<IREE::Stream::ExecutableOp>()) {
+      if (!executableOp.getInnerModule())
+        continue;
       for (auto exportOp :
            executableOp.getOps<IREE::Stream::ExecutableExportOp>()) {
         auto &dispatchOps = entryDispatchMap[exportOp];
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/FuseDispatchBindings.cpp b/compiler/src/iree/compiler/Dialect/Stream/Transforms/FuseDispatchBindings.cpp
index 667abd7..f00e056 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/FuseDispatchBindings.cpp
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/FuseDispatchBindings.cpp
@@ -450,6 +450,8 @@
     MemoizedCmdZeros memoizedZeros;
     for (auto executableOp :
          getOperation().getBodyRegion().getOps<IREE::Stream::ExecutableOp>()) {
+      if (!executableOp.getInnerModule())
+        continue;
       for (auto exportOp :
            executableOp.getOps<IREE::Stream::ExecutableExportOp>()) {
         fuseDispatchBindings(executableOp, exportOp, entryDispatchMap[exportOp],
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/PackDispatchOperands.cpp b/compiler/src/iree/compiler/Dialect/Stream/Transforms/PackDispatchOperands.cpp
index e83aa08..baa6f65 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/PackDispatchOperands.cpp
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/PackDispatchOperands.cpp
@@ -298,8 +298,10 @@
     // Convert all public function signatures and manipulate the arguments.
     for (auto executableOp :
          getOperation().getOps<IREE::Stream::ExecutableOp>()) {
-      for (auto funcOp :
-           executableOp.getInnerModule().getOps<mlir::func::FuncOp>()) {
+      auto innerModuleOp = executableOp.getInnerModule();
+      if (!innerModuleOp)
+        continue;
+      for (auto funcOp : innerModuleOp.getOps<mlir::func::FuncOp>()) {
         if (funcOp.isPublic()) {
           updateExportFuncOp(funcOp);
         }
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/annotate_dispatch_arguments.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/annotate_dispatch_arguments.mlir
index 5024415..c43d679 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/annotate_dispatch_arguments.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/annotate_dispatch_arguments.mlir
@@ -1,5 +1,27 @@
 // RUN: iree-opt --split-input-file --iree-stream-annotate-dispatch-arguments %s | FileCheck %s
 
+// Tests that external executables don't get annotated
+
+// CHECK-LABEL: @skipExternExecutablesEx
+stream.executable private @skipExternExecutablesEx {
+  // CHECK: stream.executable.export public @dispatch
+  stream.executable.export public @dispatch
+}
+func.func @skipExternExecutables(%arg0: i32) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c0_i32 = arith.constant 0 : i32
+  %alloc = stream.resource.alloc uninitialized : !stream.resource<transient>{%c1}
+  %result_timepoint = stream.cmd.execute with(%alloc as %capture: !stream.resource<transient>{%c1}) {
+    stream.cmd.dispatch @annotatePotentialValuesEx::@dispatch[%c1, %c1, %c1](%c0_i32 : i32) {
+      rw %capture[%c0 for %c1] : !stream.resource<transient>{%c1}
+    }
+  } => !stream.timepoint
+  return
+}
+
+// -----
+
 // Tests that operands are annotated with their potential values.
 // %arg0: cannot be annotated because it comes from outside the program.
 // %arg1: all values known, gets alignment being an index.
diff --git a/compiler/src/iree/compiler/Dialect/Util/IR/UtilTraits.h b/compiler/src/iree/compiler/Dialect/Util/IR/UtilTraits.h
index 4319164..07aa445 100644
--- a/compiler/src/iree/compiler/Dialect/Util/IR/UtilTraits.h
+++ b/compiler/src/iree/compiler/Dialect/Util/IR/UtilTraits.h
@@ -37,6 +37,12 @@
   static LogicalResult verifyTrait(Operation *op) { return success(); }
 };
 
+template <typename ConcreteType>
+struct ImplicitlyCaptured
+    : public OpTrait::TraitBase<ConcreteType, ImplicitlyCaptured> {
+  static LogicalResult verifyTrait(Operation *op) { return success(); }
+};
+
 } // namespace Util
 } // namespace IREE
 } // namespace OpTrait
diff --git a/samples/custom_dispatch/vulkan/shaders/CMakeLists.txt b/samples/custom_dispatch/vulkan/shaders/CMakeLists.txt
index 7cbaae0..ce15498 100644
--- a/samples/custom_dispatch/vulkan/shaders/CMakeLists.txt
+++ b/samples/custom_dispatch/vulkan/shaders/CMakeLists.txt
@@ -13,7 +13,7 @@
 # custom shaders in .spv format and not supporting infrastructure for compiling
 # shaders from various textual input languages (HLSL/etc). Users are expected to
 # bring their own infrastructure if they want to bring their own source code.
-find_program(GLSLC glslc)
+find_program(GLSLC glslc HINTS "D:\\Tools\\VulkanSDK\\1.3.261.1\\Bin\\glslc.exe")
 if(NOT GLSLC)
   message(STATUS "IREE custom_dispatch/vulkan/shaders ignored -- glslc not found")
   return()
@@ -49,6 +49,7 @@
     example
   SRCS
     "example.mlir"
+    "example_inline.mlir"
   DATA
     ${_SPV_TARGET}
   TOOLS
diff --git a/samples/custom_dispatch/vulkan/shaders/example_inline.mlir b/samples/custom_dispatch/vulkan/shaders/example_inline.mlir
new file mode 100644
index 0000000..b213bd3
--- /dev/null
+++ b/samples/custom_dispatch/vulkan/shaders/example_inline.mlir
@@ -0,0 +1,116 @@
+// RUN: iree-compile %s \
+// RUN:     --iree-hal-executable-object-search-path=$IREE_BINARY_DIR | \
+// RUN: iree-run-module \
+// RUN:     --device=vulkan \
+// RUN:     --module=- \
+// RUN:     --function=mixed_invocation \
+// RUN:     --input=8xf32=2 \
+// RUN:     --input=8xf32=4 | \
+// RUN: FileCheck %s
+
+// The configuration used for executable compilation.
+// This lets the compiler and runtime know the format and requirements of the
+// executable binaries produced and multiple variants with differing formats
+// and compilation options (architectures, etc) can be embedded for runtime
+// selection.
+#spirv_target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {
+  spirv.target_env = #spirv.target_env<
+    #spirv.vce<v1.3, [Shader, GroupNonUniform], [SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers]>,
+    #spirv.resource_limits<max_compute_workgroup_size = [128, 128, 64], subgroup_size = 64>
+  >
+}>
+
+// The target devices that the program will run on.
+// These can come from compiler flags and multiple targets can be supported
+// It's possible, for example, to support targeting multiple devices in the same
+// compiled binary.
+#vulkan_target = #hal.device.target<"vulkan", {
+  executable_targets = [#spirv_target],
+  // HACK: Vulkan target currently uses the legacy synchronous execution model.
+  legacy_sync
+}>
+
+module @example attributes {hal.device.targets = [#vulkan_target]} {
+
+  // Function demonstrating a few hand-authored dispatches mixed with codegen.
+  // Invoke with:
+  //  --device=vulkan
+  //  --function=mixed_invocation
+  //  --input=8xf32=2
+  //  --input=8xf32=4
+  // CHECK-LABEL: EXEC @mixed_invocation
+  func.func @mixed_invocation(%arg0: tensor<?xf32>, %arg1: tensor<?xf32>) -> tensor<?xf32> {
+    // HACK: for hand-authored shaders all primitive values passed in need to
+    // be i32 or a bit-castable type. This is because ABI packing of other types
+    // happens inside of the PackDispatchOperandsPass that is currently not
+    // usable with external functions as it changes the ABI. In the future we
+    // can better define the ABI such that it's possible to match the compiler
+    // expectations around padding/alignment. For now users must do the packing
+    // themselves (splitting i64 into i32+i32, etc).
+    %c0 = arith.constant 0 : index
+    %dim = tensor.dim %arg0, %c0 : tensor<?xf32>
+    %dim_i32 = arith.index_cast %dim : index to i32
+
+    // Dispatch a basic `ret = lhs * rhs` shader.
+    %0 = hal.dispatch.extern "main"[%dim](%dim_i32, %arg0, %arg1) : (i32, tensor<?xf32>{%dim}, tensor<?xf32>{%dim}) -> tensor<?xf32>{%dim}
+      // The layout defines the required bindings and push constants and can be
+      // thought of as the function signature.
+      layout(#hal.pipeline.layout<push_constants = 1, sets = [
+        <0, bindings = [
+            <0, storage_buffer, ReadOnly>,
+            <1, storage_buffer, ReadOnly>,
+            <2, storage_buffer>
+        ]>
+      ]>)
+      // Bindings are automatically inferred when possible as part of the ABI
+      // but can be overridden if the user wants to use features such as sparse
+      // bindings or multiple descriptor sets. To do so the
+      // `hal.interface.bindings` attribute can be added to a dispatch op as
+      // follows mapping tensor operands/results to the pipeline layout
+      // sets/bindings:
+      bindings([
+        #hal.interface.binding<0, 0>,
+        #hal.interface.binding<0, 1>,
+        #hal.interface.binding<0, 2>
+      ])
+      // Object files linked into the executable.
+      // Certain backends (today) support either wholesale definition or linking
+      // of partial objects for imports used by generated code. Each compilation
+      // target can have its own unique set of objects to link in and the target
+      // keys can be generic. This allows for an object file to be linked in based
+      // only on the target triple while allowing for more specialized ones
+      // requiring certain CPU features to be only included when building those.
+      objects(#hal.executable.objects<{
+        #spirv_target = [
+          #hal.executable.object<{
+            // Referencing a file path on disk but could also have the data
+            // embedded in order to make the MLIR file hermetic/portable across
+            // compilation pipelines. In the future we'll likely use MLIR's
+            // external resource functionality for this. By allowing for the
+            // objects to be embedded we can support JIT scenarios where some
+            // layer higher or lower may be emitting the objects to link in as
+            // part of the overall compilation.
+            path = "samples/custom_dispatch/vulkan/shaders/simple_mul.spv"
+          }>
+        ]
+      }>)
+      count(%device: !hal.device, %workload: index) -> (index, index, index) {
+        // This host function is used to compute the XYZ workgroup count
+        // dispatched at runtime. It can query the %device for capabilities
+        // and limits (shared memory size, etc). The other arguments are the
+        // values passed in the dispatch operation (usually things like root
+        // output op tensor dimensions and other abstract values).
+        %x = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%workload]
+        %c1 = arith.constant 1 : index
+        hal.return %x, %c1, %c1 : index, index, index
+      }
+
+    // Code gen some other ops - these will interleave with the hand-authored
+    // ones but naturally won't be able to fuse with them.
+    %1 = arith.addf %0, %arg1 : tensor<?xf32>
+
+    // CHECK: 8xf32=12 12 12 12 12 12 12 12
+    return %1 : tensor<?xf32>
+  }
+
+}  // module