Fixing issues found when enabling indirect command buffers. (#18382)

This is in preparation for making
`--iree-hal-indirect-command-buffers=true` the default as part of
#17875.

Most of the required fixes addressed analysis failures: call graph
traversal needed stricter handling of duplicate visits, and more care
was needed around when initializers are combined (this still needs
improvement, but at a general level unrelated to this work). A TODO was
also resolved: `stream.cmd.call` is now supported in reusable command
buffers by passing binding table ordinals along with buffers to the
`stream.cmd.func` ops once they are lowered into the HAL dialect. There
are no users of this functionality today, but it is no longer a special
case.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index f475657..1a5bc24 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -3037,13 +3037,13 @@
     return failure();
   Operation *rootOperation = rootOp.value();
 
-  LLVM_DEBUG(KD_DBGS() << "Root op: " << *rootOperation << "\n");
-
   // Handle the case with no known root operation.
   if (!rootOperation) {
     return lowerUsingDefaultPipeline(entryPointFn);
   }
 
+  LLVM_DEBUG(KD_DBGS() << "Root op: " << *rootOperation << "\n");
+
   auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(entryPointFn);
   auto targetMLTransInfo =
       TargetMLTransformInfo::getTargetMLTransformInfo(targetAttr);
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/BUILD.bazel b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/BUILD.bazel
index 28ef891..5c64fa1 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/BUILD.bazel
+++ b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/BUILD.bazel
@@ -22,6 +22,7 @@
     ],
     deps = [
         ":Utils",
+        "//compiler/src/iree/compiler/Dialect/HAL/Analysis",
         "//compiler/src/iree/compiler/Dialect/HAL/Conversion",
         "//compiler/src/iree/compiler/Dialect/HAL/IR",
         "//compiler/src/iree/compiler/Dialect/HAL/IR:HALDialect",
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/CMakeLists.txt b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/CMakeLists.txt
index 909e713..92dc1b9 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/CMakeLists.txt
@@ -28,6 +28,7 @@
     MLIRSCFDialect
     MLIRTransformUtils
     MLIRTransforms
+    iree::compiler::Dialect::HAL::Analysis
     iree::compiler::Dialect::HAL::Conversion
     iree::compiler::Dialect::HAL::IR
     iree::compiler::Dialect::HAL::IR::HALDialect
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/Patterns.cpp b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/Patterns.cpp
index 8270b21..c391aef 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/Patterns.cpp
+++ b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/Patterns.cpp
@@ -6,6 +6,7 @@
 
 #include "iree/compiler/Dialect/HAL/Conversion/StreamToHAL/Patterns.h"
 
+#include "iree/compiler/Dialect/HAL/Analysis/Captures.h"
 #include "iree/compiler/Dialect/HAL/Conversion/StreamToHAL/Utils.h"
 #include "iree/compiler/Dialect/HAL/IR/HALDialect.h"
 #include "iree/compiler/Dialect/HAL/IR/HALOps.h"
@@ -18,6 +19,7 @@
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/IR/Dominance.h"
 #include "mlir/Transforms/DialectConversion.h"
+#include "mlir/Transforms/RegionUtils.h"
 
 namespace mlir::iree_compiler {
 
@@ -803,25 +805,44 @@
   LogicalResult
   matchAndRewrite(IREE::Stream::CmdFuncOp funcOp, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
+    SmallVector<DictionaryAttr> oldArgAttrs;
+    funcOp.getAllArgAttrs(oldArgAttrs);
     SmallVector<Type> newArgTypes;
-    SmallVector<DictionaryAttr> newArgAttrs;
+    SmallVector<Attribute> newArgAttrs;
     newArgTypes.push_back(rewriter.getType<IREE::HAL::CommandBufferType>());
     newArgAttrs.push_back(rewriter.getDictionaryAttr({})); // command buffer
-    funcOp.getAllArgAttrs(newArgAttrs);
+    for (auto [i, oldType] : llvm::enumerate(funcOp.getArgumentTypes())) {
+      if (isa<IREE::Stream::ResourceType>(oldType)) {
+        // Resource converted into a (binding ordinal, buffer) pair.
+        newArgTypes.push_back(rewriter.getIndexType());
+        newArgAttrs.push_back(rewriter.getDictionaryAttr({}));
+        newArgTypes.push_back(rewriter.getType<IREE::HAL::BufferType>());
+        newArgAttrs.push_back(oldArgAttrs[i]);
+      } else {
+        // Primitive/other pass-through.
+        // Support expansion by preserving the arg attr on the first expanded
+        // type and filling in empty attrs for the remainder.
+        size_t oldCount = newArgTypes.size();
+        if (failed(getTypeConverter()->convertType(oldType, newArgTypes))) {
+          return rewriter.notifyMatchFailure(funcOp,
+                                             "failed to convert arg types");
+        }
+        size_t typeCount = newArgTypes.size() - oldCount;
+        newArgAttrs.push_back(oldArgAttrs[i]);
+        newArgAttrs.append(typeCount - 1, rewriter.getDictionaryAttr({}));
+      }
+    }
     SmallVector<Type> newResultTypes;
-    if (failed(getTypeConverter()->convertTypes(funcOp.getArgumentTypes(),
-                                                newArgTypes)) ||
-        failed(getTypeConverter()->convertTypes(funcOp.getResultTypes(),
+    if (failed(getTypeConverter()->convertTypes(funcOp.getResultTypes(),
                                                 newResultTypes))) {
-      return rewriter.notifyMatchFailure(funcOp, "failed to convert types");
+      return rewriter.notifyMatchFailure(funcOp,
+                                         "failed to convert result types");
     }
     auto newOp = rewriter.replaceOpWithNewOp<IREE::Util::FuncOp>(
         funcOp, funcOp.getNameAttr(),
         rewriter.getFunctionType(newArgTypes, newResultTypes),
         /*tied_operands=*/ArrayAttr{}, funcOp.getSymVisibilityAttr(),
-        rewriter.getArrayAttr(
-            ArrayRef<Attribute>(newArgAttrs.data(), newArgAttrs.size())),
-        funcOp.getAllResultAttrs(),
+        rewriter.getArrayAttr(newArgAttrs), funcOp.getAllResultAttrs(),
         /*inlining_policy=*/IREE::Util::InliningPolicyAttrInterface{});
     newOp->setDialectAttrs(funcOp->getDialectAttrs());
     return success();
@@ -836,6 +857,23 @@
                   ConversionPatternRewriter &rewriter) const override {
     auto commandBufferMapping = mapping->lookupCommandBufferFor(callOp);
 
+    // Memoized dummy values.
+    Value zeroIndex;
+    auto getZeroIndex = [&]() {
+      if (!zeroIndex) {
+        zeroIndex = rewriter.create<arith::ConstantIndexOp>(callOp.getLoc(), 0);
+      }
+      return zeroIndex;
+    };
+    Value nullBuffer;
+    auto getNullBuffer = [&]() {
+      if (!nullBuffer) {
+        nullBuffer = rewriter.create<IREE::Util::NullOp>(
+            callOp.getLoc(), rewriter.getType<IREE::HAL::BufferType>());
+      }
+      return nullBuffer;
+    };
+
     // Always pass the command buffer as the first arg.
     SmallVector<Value> operands;
     operands.push_back(commandBufferMapping.getHandle());
@@ -843,10 +881,20 @@
     for (auto [originalOperand, convertedOperand] : llvm::zip_equal(
              callOp.getResourceOperands(), adaptor.getResourceOperands())) {
       if (llvm::isa<IREE::Stream::ResourceType>(originalOperand.getType())) {
-        // Resource type, add offset/length.
-        operands.push_back(convertedOperand);
-        operands.push_back(adaptor.getResourceOperandOffsets()[resourceIndex]);
-        operands.push_back(adaptor.getResourceOperandLengths()[resourceIndex]);
+        // Resource type, pass binding index or buffer and offset/length.
+        auto binding = commandBufferMapping.resolveBinding(
+            callOp.getLoc(), originalOperand, convertedOperand,
+            adaptor.getResourceOperandOffsets()[resourceIndex],
+            adaptor.getResourceOperandLengths()[resourceIndex], rewriter);
+        if (binding.buffer.getType().isIndex()) {
+          operands.push_back(binding.buffer);
+          operands.push_back(getNullBuffer());
+        } else {
+          operands.push_back(getZeroIndex());
+          operands.push_back(binding.buffer);
+        }
+        operands.push_back(binding.byteOffset);
+        operands.push_back(binding.byteLength);
         ++resourceIndex;
       } else {
         // Primitive/custom type.
@@ -872,6 +920,43 @@
   }
 };
 
+// Returns true if any primitive (int/index/float) value that is an operand of
+// |op| or is used within its regions but defined above them has an origin
+// other than LocalConstant or ImmutableGlobal; returns false otherwise.
+static bool regionCapturesDynamicUniformValues(Operation *op) {
+  auto isDynamicUniform = [](Value value) {
+    if (value.getType().isIntOrIndexOrFloat()) {
+      switch (IREE::HAL::categorizeValue(value)) {
+      default:  // Conservatively treat unrecognized origins as dynamic.
+      case IREE::HAL::ValueOrigin::Unknown:
+      case IREE::HAL::ValueOrigin::MutableGlobal:
+        return true;
+      case IREE::HAL::ValueOrigin::LocalConstant:
+      case IREE::HAL::ValueOrigin::ImmutableGlobal:
+        return false;
+      }
+    }
+    return false;  // Non-primitive types are ignored by this check.
+  };
+  for (auto operand : op->getOperands()) {
+    if (isDynamicUniform(operand)) {
+      // Today this usually indicates a dynamic buffer size. We could perform
+      // some tricks to adjust the size based on usage instead of requiring that
+      // this size match however it's safer to treat dynamically sized buffers
+      // as fully dynamic for now.
+      return true;
+    }
+  }
+  SetVector<Value> capturedValues;
+  mlir::getUsedValuesDefinedAbove(op->getRegions(), capturedValues);
+  for (auto capturedValue : capturedValues) {
+    if (isDynamicUniform(capturedValue)) {
+      return true;
+    }
+  }
+  return false;
+}
+
 static void insertSerializationBarriers(Location loc, Block &block,
                                         Value commandBuffer,
                                         OpBuilder builder) {
@@ -911,17 +996,25 @@
     auto [device, queueAffinity] =
         lookupDeviceAndQueueAffinityFor(executeOp, rewriter);
 
+    // Until uniform buffers are implemented we can't reuse command buffers that
+    // contain non-constant uniform values (i32, index, etc). We'll have a pass
+    // that runs prior to conversion that creates new stream resources and
+    // changes dispatches to use them for any dispatch we can - note that there
+    // may still be some that slip through due to custom executables.
+    const bool capturesDynamicUniformValues =
+        regionCapturesDynamicUniformValues(executeOp);
+
     // Calculate the indirect buffer references used within the command buffer
     // by analyzing captured resources. This analysis will be used by subsequent
     // conversion to decide between embedding the direct buffer references or
     // indirect ones. We only do this if the execution region is reused.
     IndexSet indexSet(loc, rewriter);
     BindingTable bindingTable;
-    if (!executeOp.getOnce() && clIndirectCommandBuffers) {
+    if (!executeOp.getOnce() && !capturesDynamicUniformValues &&
+        clIndirectCommandBuffers) {
       bindingTable = BindingTable(executeOp, adaptor.getResourceOperands(),
                                   adaptor.getResourceOperandSizes(), indexSet);
     }
-    auto bindingTableValues = llvm::to_vector(bindingTable.getValues());
 
     // If the execute op is one-shot or there's no indirect bindings then mark
     // the command buffer one-shot.
@@ -933,8 +1026,12 @@
         modes =
             modes | IREE::HAL::CommandBufferModeBitfield::AllowInlineExecution;
       }
+      bindingTable = {};
     }
 
+    // Cache the binding table values for use with the indirect execute.
+    auto bindingTableValues = llvm::to_vector(bindingTable.getValues());
+
     // Derive the command buffer type based on the kind of operations present.
     // This can help the submission get routed to appropriate hardware queues
     // (like dedicated DMA controllers).
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/Utils.cpp b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/Utils.cpp
index c5df454..952681e 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/Utils.cpp
+++ b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/Utils.cpp
@@ -251,15 +251,6 @@
   auto resourceValues = executeOp.getResourceOperands();
   auto capturedValues = executeOp.getBody().getArguments();
 
-  // TODO(benvanik): support stream.cmd.call with indirect bindings; today the
-  // simple analysis that happens here can't handle generating binding tables
-  // for them as the target would need to know if it's taking a buffer or a
-  // binding table slot. A `variant<!hal.buffer, i32>` may let us do that.
-  executeOp->walk([&](IREE::Stream::CmdCallOp) { hasUnsupportedOps = true; });
-  if (hasUnsupportedOps) {
-    return;
-  }
-
   // Categorize each resource value and add it to the table.
   for (auto [resourceValue, capturedValue, bufferValue, bufferSize] :
        llvm::zip_equal(resourceValues, capturedValues, bufferValues,
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/Utils.h b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/Utils.h
index 50f0b0d..7fc80d8 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/Utils.h
+++ b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/Utils.h
@@ -75,7 +75,7 @@
                ValueRange bufferSizes, IndexSet &indexSet);
 
   // True if binding tables are supported for the consumer.
-  bool isSupported() const { return !hasUnsupportedOps; }
+  bool isSupported() const { return true; }
 
   // True if the binding table is empty.
   bool empty() const { return indirectBuffers.empty(); }
@@ -91,8 +91,6 @@
   std::optional<Value> lookupResourceSlot(Value resourceValue);
 
 private:
-  // True if any ops are nested that may prevent binding table usage.
-  bool hasUnsupportedOps = false;
   // Buffer binding table with <buffer, offset, length>.
   SmallVector<IREE::HAL::BindingValue> indirectBuffers;
   // A mapping of resources to binding table slot ordinals.
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/cmd_ops.mlir b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/cmd_ops.mlir
index ece8202..571f729 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/cmd_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/cmd_ops.mlir
@@ -218,8 +218,8 @@
 util.global private @constant_size : index
 
 // CHECK-LABEL: @cmdDispatch
-//  CHECK-SAME: (%[[ARG_RESOURCE:.+]]: !hal.buffer, %[[ARG_SIZE:.+]]: index)
-util.func public @cmdDispatch(%arg_resource: !stream.resource<external>, %arg_size: index) -> !stream.timepoint {
+//  CHECK-SAME: (%[[ARG_RESOURCE:.+]]: !hal.buffer)
+util.func public @cmdDispatch(%arg_resource: !stream.resource<external>) -> !stream.timepoint {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
@@ -230,6 +230,8 @@
   // CHECK-DAG: %[[CONSTANT_RESOURCE:.+]] = util.global.load immutable @constant_resource
   %constant_resource = util.global.load immutable @constant_resource : !stream.resource<constant>
   %constant_size = util.global.load immutable @constant_size : index
+  // CHECK: %[[ARG_SIZE:.+]] = arith.constant 200
+  %arg_size = arith.constant 200 : index
   // CHECK-DAG: %[[DEVICE:.+]] = util.global.load immutable @device
   // CHECK: %[[MEMOIZED_CMD:.+]] = hal.device.memoize
   // CHECK: %[[CMD:.+]] = hal.command_buffer.create
@@ -288,28 +290,65 @@
 // -----
 
 // Tests conversion of streamable calls and function declarations.
-// Expect a command buffer and a buffer + offset + length for each resource.
+// Expect a command buffer and a (binding table ordinal, buffer) + offset +
+// length for each resource. Here we have one constant global that gets baked
+// into the memoized command buffer and the two arguments are treated as
+// indirect bindings.
 
 util.global private @device : !hal.device
 
-// CHECK: util.func private @cmdFunc(%arg0: !hal.command_buffer, %arg1: !hal.buffer, %arg2: index, %arg3: index, %arg4: i32, %arg5: !hal.buffer, %arg6: index, %arg7: index, %arg8: !custom.type, %arg9: !hal.buffer, %arg10: index, %arg11: index)
-stream.cmd.func private @cmdFunc(%arg0[%arg1 for %arg2]: !stream.resource<*>, %arg3: i32, %arg4[%arg5 for %arg6]: !stream.resource<*>, %arg7: !custom.type, %arg8[%arg9 for %arg10]: !stream.resource<*>)
+util.global private @global : !stream.resource<constant>
+
+// CHECK: util.func private @cmdFunc(
+// CHECK-SAME: %arg0: !hal.command_buffer,
+stream.cmd.func private @cmdFunc(
+    // CHECK-SAME: %arg1: index, %arg2: !hal.buffer, %arg3: index, %arg4: index,
+    %arg0[%arg1 for %arg2]: !stream.resource<*>,
+    // CHECK-SAME: %arg5: i32,
+    %arg3: i32,
+    // CHECK-SAME: %arg6: index, %arg7: !hal.buffer, %arg8: index, %arg9: index,
+    %arg4[%arg5 for %arg6]: !stream.resource<*>,
+    // CHECK-SAME: %arg10: !custom.type,
+    %arg7: !custom.type,
+    // CHECK-SAME: %arg11: index, %arg12: !hal.buffer, %arg13: index, %arg14: index
+    %arg8[%arg9 for %arg10]: !stream.resource<*>)
 
 // CHECK-LABEL: @cmdCall
-util.func public @cmdCall(%arg0: !stream.resource<external>, %arg1: i32, %arg2: !stream.resource<external>, %arg3: !custom.type, %arg4: !stream.resource<external>) -> !stream.timepoint {
+util.func public @cmdCall(
+    // CHECK-SAME: (%[[ARG_I32:.+]]: i32, %[[ARG_CUSTOM:.+]]: !custom.type,
+    %arg_i32: i32, %arg_custom: !custom.type,
+    // CHECK-SAME:  %[[ARG_RESOURCE0:.+]]: !hal.buffer, %[[ARG_RESOURCE1:.+]]: !hal.buffer)
+    %arg_resource0: !stream.resource<transient>, %arg_resource1: !stream.resource<external>) -> !stream.timepoint {
   %c0 = arith.constant 0 : index
-  // CHECK-DAG: %[[SIZE0:.+]] = arith.constant 100
-  %size0 = arith.constant 100 : index
-  // CHECK-DAG: %[[SIZE1:.+]] = arith.constant 101
-  %size1 = arith.constant 101 : index
-  // CHECK-DAG: %[[SIZE2:.+]] = arith.constant 102
-  %size2 = arith.constant 102 : index
+  // CHECK-DAG: %[[GLOBAL_RESOURCE:.+]] = util.global.load immutable @global
+  %global_resource = util.global.load immutable @global : !stream.resource<constant>
+  // CHECK-DAG: %[[GLOBAL_SIZE:.+]] = arith.constant 100
+  %global_size = arith.constant 100 : index
+  // CHECK-DAG: %[[ARG_SIZE0:.+]] = arith.constant 101
+  %arg_size0 = arith.constant 101 : index
+  // CHECK-DAG: %[[ARG_SIZE1:.+]] = arith.constant 102
+  %arg_size1 = arith.constant 102 : index
+  // CHECK: hal.device.memoize
   // CHECK: %[[COMMAND_BUFFER:.+]] = hal.command_buffer.create
-  %timepoint = stream.cmd.execute on(#hal.device.affinity<@device>) with(%arg0 as %stream0: !stream.resource<external>{%size0}, %arg2 as %stream1: !stream.resource<external>{%size1}, %arg4 as %stream2: !stream.resource<external>{%size2}) {
-    // CHECK: util.call @cmdFunc(%[[COMMAND_BUFFER]], %arg0, %c0, %[[SIZE0]], %arg1, %arg2, %c0, %[[SIZE1]], %arg3, %arg4, %c0, %[[SIZE2]]) :
-    // CHECK-SAME: (!hal.command_buffer, !hal.buffer, index, index, i32, !hal.buffer, index, index, !custom.type, !hal.buffer, index, index) -> ()
-    stream.cmd.call @cmdFunc(ro %stream0[%c0 for %size0], %arg1, rw %stream1[%c0 for %size1], %arg3, wo %stream2[%c0 for %size2]) : (!stream.resource<external>{%size0}, i32, !stream.resource<external>{%size1}, !custom.type, !stream.resource<external>{%size2}) -> ()
+  %timepoint = stream.cmd.execute on(#hal.device.affinity<@device>)
+      with(%global_resource as %stream0: !stream.resource<constant>{%global_size}, %arg_resource0 as %stream1: !stream.resource<transient>{%arg_size0}, %arg_resource1 as %stream2: !stream.resource<external>{%arg_size1}) {
+    // CHECK-DAG: %[[NULL_BUFFER:.+]] = util.null : !hal.buffer
+    // CHECK: util.call @cmdFunc(%[[COMMAND_BUFFER]],
+    stream.cmd.call @cmdFunc(
+        // CHECK-SAME: %c0, %[[GLOBAL_RESOURCE]], %c0, %[[GLOBAL_SIZE]], %[[ARG_I32]],
+        ro %stream0[%c0 for %global_size], %arg_i32,
+        // CHECK-SAME: %c0, %[[NULL_BUFFER]], %c0, %[[ARG_SIZE0]], %[[ARG_CUSTOM]],
+        rw %stream1[%c0 for %arg_size0], %arg_custom,
+        // CHECK-SAME: %c1, %[[NULL_BUFFER]], %c0, %[[ARG_SIZE1]]
+        wo %stream2[%c0 for %arg_size1]) :
+        // CHECK-SAME: (!hal.command_buffer, index, !hal.buffer, index, index, i32, index, !hal.buffer, index, index, !custom.type, index, !hal.buffer, index, index) -> ()
+        (!stream.resource<constant>{%global_size}, i32, !stream.resource<transient>{%arg_size0}, !custom.type, !stream.resource<external>{%arg_size1}) -> ()
   } => !stream.timepoint
+  // CHECK: hal.device.queue.execute.indirect
+  // CHECK-SAME: bindings([
+  // CHECK-NEXT:   (%[[ARG_RESOURCE0]] : !hal.buffer)[%c0, %[[ARG_SIZE0]]],
+  // CHECK-NEXT:   (%[[ARG_RESOURCE1]] : !hal.buffer)[%c0, %[[ARG_SIZE1]]]
+  // CHECK-NEXT: ])
   util.return %timepoint : !stream.timepoint
 }
 
diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.cpp b/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.cpp
index 4485011..d06c2dc 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.cpp
+++ b/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.cpp
@@ -48,10 +48,10 @@
 
 //===----------------------------------------------------------------------===//
 // custom<PipelineBindings>($binding_ordinals,
-//                               $binding_buffers,
-//                               type($binding_buffers),
-//                               $binding_offsets,
-//                               $binding_lengths)
+//                          $binding_buffers,
+//                          type($binding_buffers),
+//                          $binding_offsets,
+//                          $binding_lengths)
 //===----------------------------------------------------------------------===//
 
 static ParseResult parsePipelineBindings(
@@ -240,8 +240,9 @@
 static LogicalResult verifyTargetConditionRegion(Operation *op,
                                                  Region &region) {
   // Ignore if empty.
-  if (region.empty())
+  if (region.empty()) {
     return success();
+  }
 
   // Verify region takes a !hal.device.
   if (region.getNumArguments() != 1 ||
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Transforms/Passes.cpp b/compiler/src/iree/compiler/Dialect/HAL/Transforms/Passes.cpp
index 863c939..86e1c81 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Transforms/Passes.cpp
+++ b/compiler/src/iree/compiler/Dialect/HAL/Transforms/Passes.cpp
@@ -507,10 +507,6 @@
   // with them in their original target specification.
   passManager.addPass(IREE::HAL::createInitializeDevicesPass({targetRegistry}));
 
-  // Combine the initializers we emitted during resource cache
-  // materialization.
-  passManager.addPass(IREE::Util::createCombineInitializersPass());
-
   // TODO: Maybe this should be a part of Affine lowering pass.
   // Remove if it is added there.
   // https://github.com/llvm/llvm-project/issues/78458
diff --git a/compiler/src/iree/compiler/Dialect/Util/Analysis/Explorer.cpp b/compiler/src/iree/compiler/Dialect/Util/Analysis/Explorer.cpp
index bb46c83..745f029 100644
--- a/compiler/src/iree/compiler/Dialect/Util/Analysis/Explorer.cpp
+++ b/compiler/src/iree/compiler/Dialect/Util/Analysis/Explorer.cpp
@@ -57,8 +57,16 @@
 
   // Explicit op actions override all behavior.
   auto opIt = opActions.find(name);
-  if (opIt != opActions.end())
+  if (opIt != opActions.end()) {
     return opIt->second;
+  }
+
+  // Contents of object-like ops are ignored by default.
+  if (op->hasTrait<OpTrait::SymbolTable>()) {
+    LLVM_DEBUG(llvm::dbgs() << "  -- skipping contents of object-like op "
+                            << op->getName() << "\n");
+    return TraversalAction::SHALLOW;
+  }
 
   // Dialect actions let us carve out entire dialects and override interfaces
   // that may otherwise pick up ops.
@@ -73,14 +81,16 @@
     return TraversalAction::IGNORE;
   }
   auto dialectIt = dialectActions.find(dialect->getNamespace());
-  if (dialectIt != dialectActions.end())
+  if (dialectIt != dialectActions.end()) {
     return dialectIt->second;
+  }
 
   // Slow path for interfaces as there's no way to enumerate the interfaces an
   // op has registered (AFAICT).
   for (auto [interfaceId, action] : interfaceActions) {
-    if (name.hasInterface(interfaceId))
+    if (name.hasInterface(interfaceId)) {
       return action;
+    }
   }
 
   return defaultAction;
@@ -139,18 +149,24 @@
 // the same calculation over again. Maybe there's a way to use all that
 // GraphTraits goo to do this, but I don't know it.
 void Explorer::initializeInverseCallGraph() {
-  rootOp->walk([&](CallOpInterface callOp) {
-    if (callOp.getCallableForCallee().is<Value>()) {
-      // Indirect calls can't be tracked in the call graph, so ensure we mark
-      // the incomplete flag so that any call graph queries return
-      // TraversalResult::INCOMPLETE.
-      isCallGraphIncomplete = true;
-    } else {
-      auto *node = callGraph.resolveCallable(callOp, symbolTables);
-      if (!node->isExternal()) {
-        callGraphInv[node->getCallableRegion()].push_back(callOp);
+  forEachFunctionLikeOp([&](FunctionOpInterface parentOp) {
+    parentOp->walk([&](CallOpInterface callOp) {
+      if (callOp.getCallableForCallee().is<Value>()) {
+        // Indirect calls can't be tracked in the call graph, so ensure we mark
+        // the incomplete flag so that any call graph queries return
+        // TraversalResult::INCOMPLETE.
+        //
+        // TODO(benvanik): we should be keeping this finer-grained; today any
+        // indirect call invalidates all calls when really it should be for
+        // only those calls that are reachable via the indirect callee tree.
+        isCallGraphIncomplete = true;
+      } else {
+        auto *node = callGraph.resolveCallable(callOp, symbolTables);
+        if (!node->isExternal()) {
+          callGraphInv[node->getCallableRegion()].push_back(callOp);
+        }
       }
-    }
+    });
   });
 }
 
@@ -196,23 +212,36 @@
   for (auto &scc : llvm::make_range(llvm::scc_begin(&callGraph),
                                     llvm::scc_end(&callGraph))) {
     for (auto *node : scc) {
-      if (node->isExternal())
+      if (node->isExternal()) {
         continue;
+      }
       auto parentOp =
           node->getCallableRegion()->getParentOfType<FunctionOpInterface>();
-      if (parentOp && parentOp->getParentOp() == rootOp)
+      if (parentOp && parentOp->getParentOp() == rootOp) {
         fn(parentOp);
+      }
     }
   }
 }
 
 void Explorer::forEachFunctionLikeOp(
     std::function<void(FunctionOpInterface)> fn) {
-  forEachInitializer([=](IREE::Util::InitializerOpInterface op) {
-    if (auto funcOp = dyn_cast<FunctionOpInterface>(op.getOperation()))
+  // The call graph may not include initializers unless they make calls; we do
+  // initializers first and then walk the remainder of the call graph ignoring
+  // any initializers that may be in there.
+  DenseSet<FunctionOpInterface> visitedFuncOps;
+  forEachInitializer([&](IREE::Util::InitializerOpInterface op) {
+    if (auto funcOp = dyn_cast<FunctionOpInterface>(op.getOperation())) {
       fn(funcOp);
+      visitedFuncOps.insert(funcOp);
+    }
   });
-  forEachFunction(fn);
+  forEachFunction([&](FunctionOpInterface funcOp) {
+    if (!visitedFuncOps.contains(funcOp)) {
+      fn(funcOp);
+      visitedFuncOps.insert(funcOp);
+    }
+  });
 }
 
 bool Explorer::mayValuesAlias(Value a, Value b) {
@@ -472,6 +501,19 @@
             << "  !! traversal incomplete due to public function-like op @"
             << symbolOp.getName() << "\n";
       }
+      if (isCallGraphIncomplete) {
+        llvm::dbgs()
+            << "  !! traversal incomplete due to incomplete call graph for op @"
+            << symbolOp.getName() << "\n";
+      }
+    });
+  } else {
+    LLVM_DEBUG({
+      if (isCallGraphIncomplete) {
+        llvm::dbgs()
+            << "  !! traversal incomplete due to incomplete call graph for op"
+            << callableOp->getName() << "\n";
+      }
     });
   }
   return isPublic || isCallGraphIncomplete ? TraversalResult::INCOMPLETE
diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/CombineInitializers.cpp b/compiler/src/iree/compiler/Dialect/Util/Transforms/CombineInitializers.cpp
index 7f455f6..cf2d370 100644
--- a/compiler/src/iree/compiler/Dialect/Util/Transforms/CombineInitializers.cpp
+++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/CombineInitializers.cpp
@@ -48,10 +48,10 @@
       return;
     auto fusedLoc = FusedLoc::get(&getContext(), locs);
 
-    // Make the new initializer op in the same location as the last initializer
+    // Make the new initializer op in the same location as the first initializer
     // we are combining - this ensures that module initialization order is
     // preserved.
-    OpBuilder builder(initializerOps.back());
+    OpBuilder builder(initializerOps.front());
     auto newOp = builder.create<IREE::Util::InitializerOp>(fusedLoc);
     builder.setInsertionPointToStart(newOp.addEntryBlock());
     InlinerInterface inlinerInterface(&getContext());
diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/test/combine_initializers.mlir b/compiler/src/iree/compiler/Dialect/Util/Transforms/test/combine_initializers.mlir
index df2ce82..e250e18 100644
--- a/compiler/src/iree/compiler/Dialect/Util/Transforms/test/combine_initializers.mlir
+++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/test/combine_initializers.mlir
@@ -6,22 +6,6 @@
 
 // CHECK: util.global private mutable @global0 : index
 util.global private mutable @global0 : index
-util.initializer {
-  %value0 = util.call @extern() : () -> index
-  util.global.store %value0, @global0 : index
-  util.return
-}
-// CHECK-NEXT: util.global private @global1 : index
-util.global private @global1 : index
-// CHECK-NEXT: util.global private @global2 : index
-util.global private @global2 : index
-util.initializer {
-  %value1 = util.call @extern() : () -> index
-  util.global.store %value1, @global1 : index
-  %value2 = util.call @extern() : () -> index
-  util.global.store %value2, @global2 : index
-  util.return
-}
 // CHECK-NEXT: util.initializer {
 // CHECK-NEXT: %[[VALUE0:.+]] = util.call @extern()
 // CHECK-NEXT: util.global.store %[[VALUE0]], @global0
@@ -30,6 +14,23 @@
 // CHECK-NEXT: %[[VALUE2:.+]] = util.call @extern()
 // CHECK-NEXT: util.global.store %[[VALUE2]], @global2
 // CHECK-NEXT: util.return
+util.initializer {
+  %value0 = util.call @extern() : () -> index
+  util.global.store %value0, @global0 : index
+  util.return
+}
+// CHECK: util.global private @global1 : index
+util.global private @global1 : index
+// CHECK-NEXT: util.global private @global2 : index
+util.global private @global2 : index
+// CHECK-NOT: util.initializer
+util.initializer {
+  %value1 = util.call @extern() : () -> index
+  util.global.store %value1, @global1 : index
+  %value2 = util.call @extern() : () -> index
+  util.global.store %value2, @global2 : index
+  util.return
+}
 
 // CHECK-LABEL: @orderedCombining
 util.func @orderedCombining(%arg0: index) -> (index, index, index) {
@@ -47,6 +48,16 @@
 
 // CHECK: util.global private mutable @globalA : index
 util.global private mutable @globalA : index
+// CHECK: util.initializer {
+// CHECK: ^bb1:
+// CHECK:   cf.br ^bb3
+// CHECK: ^bb2:
+// CHECK:   cf.br ^bb3
+// CHECK: ^bb3:
+// CHECK:   cf.br ^bb4
+// CHECK: ^bb4:
+// CHECK:   util.return
+// CHECK: }
 util.initializer {
   %cond = arith.constant 1 : i1
   cf.cond_br %cond, ^bb1, ^bb2
@@ -63,18 +74,9 @@
 }
 // CHECK-NEXT: util.global private @globalB : index
 util.global private @globalB : index
+// CHECK-NOT: util.initializer
 util.initializer {
   %c300 = arith.constant 300 : index
   util.global.store %c300, @globalB : index
   util.return
 }
-// CHECK: util.initializer {
-// CHECK: ^bb1:
-// CHECK:   cf.br ^bb3
-// CHECK: ^bb2:
-// CHECK:   cf.br ^bb3
-// CHECK: ^bb3:
-// CHECK:   cf.br ^bb4
-// CHECK: ^bb4:
-// CHECK:   util.return
-// CHECK: }
diff --git a/compiler/src/iree/compiler/Dialect/VM/Transforms/Passes.cpp b/compiler/src/iree/compiler/Dialect/VM/Transforms/Passes.cpp
index 508ead4..05852aa 100644
--- a/compiler/src/iree/compiler/Dialect/VM/Transforms/Passes.cpp
+++ b/compiler/src/iree/compiler/Dialect/VM/Transforms/Passes.cpp
@@ -64,6 +64,10 @@
   passManager.addPass(mlir::createInlinerPass());
   passManager.addPass(mlir::createSymbolDCEPass());
 
+  // Combine the initializers for all globals to allow us to optimize them
+  // together.
+  passManager.addPass(IREE::Util::createCombineInitializersPass());
+
   FunctionLikeNest(passManager)
       .addPass(mlir::createSCFForLoopCanonicalizationPass);
 
diff --git a/runtime/src/iree/hal/utils/deferred_work_queue.c b/runtime/src/iree/hal/utils/deferred_work_queue.c
index e8ee7bf..22c229b 100644
--- a/runtime/src/iree/hal/utils/deferred_work_queue.c
+++ b/runtime/src/iree/hal/utils/deferred_work_queue.c
@@ -981,7 +981,6 @@
       iree_hal_command_buffer_mode_t mode =
           iree_hal_command_buffer_mode(command_buffer) |
           IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT |
-          IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION |
           // NOTE: we need to validate if a binding table is provided as the
           // bindings were not known when it was originally recorded.
           (iree_hal_buffer_binding_table_is_empty(binding_table)